In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Load the raw data. NOTE: despite the `testset` name this currently reads
# 'training.csv' -- swap in the real test file (and rename the variable if
# desired) once it is available.
testset = pd.read_csv(filepath_or_buffer='training.csv')

# Blanket-ignore all warnings; the FutureWarning filter is already covered by
# the first call but is kept explicit.
warnings.filterwarnings(action="ignore")
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Plot defaults: 12x6 figures on seaborn's dark grid.
from matplotlib import rcParams
rcParams["figure.figsize"] = (12, 6)
sns.set_style(style='darkgrid')

In [30]:
# NOTE: only the last expression of a cell is rendered, so this preview is
# evaluated but not displayed.
testset.head()
# Total number of missing entries in the whole frame. A result of 0 would mean
# the data has no nulls; any other value means cleaning/imputation is required.
testset.isnull().sum(axis=0).sum()

264982

In [31]:
# Per-stage columns: dropped because they contain too many NaN values.
testset.drop(
    columns=['average_frac_fluid_per_stage', 'average_proppant_per_stage',
             'average_stage_length', 'number_of_stages'],
    inplace=True,
)

# Identifier / label columns that are not useful for the cleaned dataset.
testset.drop(
    columns=['Unnamed: 0', 'pad_id', 'frac_type', 'batch_frac_classification',
             'standardized_operator_name'],
    inplace=True,
)

In [32]:
from sklearn.linear_model import LinearRegression

# Impute missing `bh_x` from `surface_x` with a one-feature linear regression
# fitted on the rows where both values are present.
subset_train_df = testset[['surface_x', 'bh_x']].dropna()

X = subset_train_df[['surface_x']]
y = subset_train_df['bh_x']

lr_model = LinearRegression()
lr_model.fit(X, y)

missing_bh_x_rows = testset['bh_x'].isna()

# Only rows that have the predictor can be filled: `predict` raises a
# ValueError on NaN inputs, so rows missing `surface_x` as well are left as
# NaN for the later imputation steps.
bh_x_rows_to_fill = missing_bh_x_rows & testset['surface_x'].notna()

# `predict` also rejects a zero-row input, so skip the call when there is
# nothing to fill (e.g. when this cell is re-run after imputation).
if bh_x_rows_to_fill.any():
    testset.loc[bh_x_rows_to_fill, 'bh_x'] = lr_model.predict(
        testset.loc[bh_x_rows_to_fill, ['surface_x']]
    )

In [33]:
# Impute missing `bh_y` from `surface_y` with a one-feature linear regression
# fitted on the rows where both values are present.
subset_train_df = testset[['surface_y', 'bh_y']].dropna()

X = subset_train_df[['surface_y']]
y = subset_train_df['bh_y']

lr_model_y = LinearRegression()
lr_model_y.fit(X, y)

# The mask gets its own bh_y-specific name so it no longer overwrites the
# bh_x mask.
missing_bh_y_rows = testset['bh_y'].isna()

# Only rows that have the predictor can be filled: `predict` raises a
# ValueError on NaN inputs, so rows missing `surface_y` as well are left as
# NaN for the later imputation steps.
bh_y_rows_to_fill = missing_bh_y_rows & testset['surface_y'].notna()

# `predict` also rejects a zero-row input, so skip the call when there is
# nothing to fill (e.g. when this cell is re-run after imputation).
if bh_y_rows_to_fill.any():
    testset.loc[bh_y_rows_to_fill, 'bh_y'] = lr_model_y.predict(
        testset.loc[bh_y_rows_to_fill, ['surface_y']]
    )

In [34]:
# Turn +/-inf into NaN FIRST, then drop rows without a usable target.
# In the opposite order a row whose OilPeakRate is infinite survives the
# dropna, becomes NaN afterwards, and can then have its target filled in by
# any later imputation of numeric columns.
testset.replace([np.inf, -np.inf], np.nan, inplace=True)
testset.dropna(subset=['OilPeakRate'], inplace=True)
def find_mean_std(df, col):
    """Return the ``(mean, std)`` pair of column ``col`` in ``df``.

    Both statistics come from the pandas ``Series.mean`` / ``Series.std``
    methods with their default arguments.
    """
    series = df[col]
    return series.mean(), series.std()

# Replace 3-sigma outliers with the column mean, one numeric column at a time.
# Only genuinely numeric columns are visited: checking for `object` alone lets
# bool / category / string-dtype columns through, where mean/std are either
# meaningless or raise.
# NOTE(review): this also rewrites outliers in the target `OilPeakRate` --
# confirm that is intended.
for col in testset.select_dtypes(include='number').columns:
    mean, std = find_mean_std(testset, col)
    # Bounds are computed once from the pre-replacement statistics. NaN values
    # (and a NaN std) compare False on both sides and are left untouched.
    upper_bound = mean + 3 * std
    lower_bound = mean - 3 * std
    is_outlier = (testset[col] > upper_bound) | (testset[col] < lower_bound)
    # Single vectorised pass instead of two element-wise `apply` calls.
    testset[col] = testset[col].mask(is_outlier, mean)

# Numeric-only view of the frame, selected by explicit int/float dtype names.
# NOTE(review): `newdf` is not used by the cells visible here -- the original
# author was unsure whether this belongs in the notebook.
numerics = [f'{kind}{bits}' for kind in ('int', 'float') for bits in (16, 32, 64)]
newdf = testset.select_dtypes(include=numerics)

In [35]:
def impute_columns(df, threshold=0.5):
    """Return the names of the numeric columns that are sparse enough in NaNs to impute.

    A column is selected when it has a numeric dtype and its fraction of
    missing values is strictly below ``threshold``.

    Non-numeric columns are excluded by an explicit numeric-dtype check rather
    than by comparing against ``'object'``: category, datetime and dedicated
    string dtypes are not ``object`` either, yet cannot be fed to a numeric
    imputer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    threshold : float, default 0.5
        Exclusive upper bound on the per-column fraction of missing values.

    Returns
    -------
    list
        Selected column names, in the order they appear in ``df``.
    """
    return [
        col
        for col in df.columns
        if pd.api.types.is_numeric_dtype(df[col])
        and df[col].isna().mean() < threshold
    ]

from sklearn.impute import KNNImputer

# Columns eligible for imputation: non-object columns with less than 30% of
# their values missing.
cols_to_imputate = impute_columns(testset, threshold=0.3)

# Remaining gaps in those columns are filled by a 10-neighbour KNN imputer.
knn_imputer = KNNImputer(n_neighbors=10)

# Work on a copy so `testset` itself stays un-imputed.
full_impute_df = testset.copy(deep=True)
full_impute_df[cols_to_imputate] = knn_imputer.fit_transform(testset[cols_to_imputate])

# Per-column count of values that are still missing after imputation.
full_impute_df.isna().sum()

surface_x                          0
surface_y                          0
bh_x                               0
bh_y                               0
gross_perforated_length            0
total_proppant                     0
total_fluid                        0
true_vertical_depth                0
ffs_frac_type                   4996
proppant_intensity                 0
frac_fluid_intensity               0
proppant_to_frac_fluid_ratio       0
frac_fluid_to_proppant_ratio       0
bin_lateral_length                 0
relative_well_position             0
well_family_relationship           0
frac_seasoning                     0
horizontal_midpoint_x              0
horizontal_midpoint_y              0
horizontal_toe_x                   0
horizontal_toe_y                   0
OilPeakRate                        0
dtype: int64

In [38]:
# Drop every row that still has a NaN in any column. This step is NOT a no-op:
# the null counts printed after the KNN imputation show `ffs_frac_type` with
# 4996 missing values, and those rows are removed here.
full_impute_df.dropna(axis=0, how='any', inplace=True)
# Per-column null counts; all should now be zero.
full_impute_df.isna().sum()

surface_x                       0
surface_y                       0
bh_x                            0
bh_y                            0
gross_perforated_length         0
total_proppant                  0
total_fluid                     0
true_vertical_depth             0
ffs_frac_type                   0
proppant_intensity              0
frac_fluid_intensity            0
proppant_to_frac_fluid_ratio    0
frac_fluid_to_proppant_ratio    0
bin_lateral_length              0
relative_well_position          0
well_family_relationship        0
frac_seasoning                  0
horizontal_midpoint_x           0
horizontal_midpoint_y           0
horizontal_toe_x                0
horizontal_toe_y                0
OilPeakRate                     0
dtype: int64

In [37]:
# The test dataset is now fully cleaned and can be exported to a CSV file.
# Uncomment the line below when ready.
# full_impute_df.to_csv('cleaned_test_data.csv', index=False)