In [188]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from matplotlib import rcParams

# Suppress all warnings for the session; the second filter targets
# FutureWarning explicitly (already covered by the blanket filter above it).
warnings.filterwarnings(action="ignore")
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Notebook-wide plotting defaults.
rcParams["figure.figsize"] = (12, 6)
sns.set_style("darkgrid")

# Read the training set and remove the 'Unnamed: 0' column in one step.
df = pd.read_csv("training.csv").drop(columns="Unnamed: 0")
df.head()

Unnamed: 0,surface_x,surface_y,bh_x,bh_y,standardized_operator_name,gross_perforated_length,number_of_stages,total_proppant,total_fluid,true_vertical_depth,...,relative_well_position,batch_frac_classification,well_family_relationship,frac_type,frac_seasoning,horizontal_midpoint_x,horizontal_midpoint_y,horizontal_toe_x,horizontal_toe_y,OilPeakRate
0,1383493.751,717329.2368,1382854.564,712767.37,1121,3963.582677,,76000.0,,8712.598425,...,Standalone Well,Unknown,Standalone Well,Primary Frac,,1383030.678,714796.361,1382668.912,712798.4321,46.623023
1,1380344.035,711916.2732,1379738.429,707860.5643,1022,3179.133858,,113000.0,561096.4,8627.952756,...,Outer Well,Unknown,Infill Child Well,Primary Frac,,1379965.126,709541.7145,1379630.025,707907.6708,59.750009
2,1379915.895,717845.0063,1379274.644,713471.9952,1304,3810.03937,12.0,286571.4286,1621424.0,8801.181102,...,Standalone Well,Non-Batch Frac,Standalone Well,Primary Frac,92.0,1379357.25,715381.2327,1378960.372,713530.449,10.785716
3,1383567.443,706640.201,1382891.477,702068.3838,1022,3723.425197,11.0,106878.9286,437713.4,8635.826772,...,Standalone Well,Unknown,Standalone Well,Primary Frac,,1383099.61,704058.8508,1382727.299,702098.4882,123.797638
4,1378962.585,702623.9787,1379643.546,707204.9131,1022,3811.023622,,112789.7143,622980.2,8664.370079,...,Outer Well,Unknown,Infill Child Well,Primary Frac,,1379546.414,705150.7519,1379987.487,707123.7258,102.30954


In [189]:
# (rows, columns) of the frame right after loading.
df.shape

(29692, 30)

In [190]:
# Names of all object-dtype columns (the ones later turned into dummy variables).
object_frame = df.select_dtypes(include="object")
categorical_columns = list(object_frame.columns)

print("Categorical Columns:", categorical_columns)

Categorical Columns: ['ffs_frac_type', 'relative_well_position', 'batch_frac_classification', 'well_family_relationship', 'frac_type']


In [191]:
# Per-column overview: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29692 entries, 0 to 29691
Data columns (total 30 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   surface_x                     29692 non-null  float64
 1   surface_y                     29692 non-null  float64
 2   bh_x                          27605 non-null  float64
 3   bh_y                          27605 non-null  float64
 4   standardized_operator_name    29692 non-null  int64  
 5   gross_perforated_length       22135 non-null  float64
 6   number_of_stages              2752 non-null   float64
 7   total_proppant                19080 non-null  float64
 8   total_fluid                   19076 non-null  float64
 9   true_vertical_depth           28197 non-null  float64
 10  ffs_frac_type                 14585 non-null  object 
 11  proppant_intensity            18971 non-null  float64
 12  frac_fluid_intensity          18946 non-null  float64
 13  a

In [192]:
# Re-display (rows, columns); nothing has modified df since the previous check.
df.shape

(29692, 30)

In [193]:
# Count of missing values in each column.
missing_mask = df.isna()
missing_mask.sum()

surface_x                           0
surface_y                           0
bh_x                             2087
bh_y                             2087
standardized_operator_name          0
gross_perforated_length          7557
number_of_stages                26940
total_proppant                  10612
total_fluid                     10616
true_vertical_depth              1495
ffs_frac_type                   15107
proppant_intensity              10721
frac_fluid_intensity            10746
average_stage_length            26968
average_proppant_per_stage      27003
average_frac_fluid_per_stage    27005
proppant_to_frac_fluid_ratio    11036
frac_fluid_to_proppant_ratio    11036
bin_lateral_length               7557
pad_id                              0
relative_well_position           9225
batch_frac_classification        9225
well_family_relationship         9225
frac_type                           0
frac_seasoning                  14924
horizontal_midpoint_x            1001
horizontal_m

In [194]:
# Fill missing stage counts with the column mean rounded to a whole number.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the df[...] selection: that chained in-place form acts
# on an intermediate Series, is deprecated in pandas 2.x (the FutureWarning is
# hidden by the warning filters set at the top of this notebook), and leaves df
# unchanged once Copy-on-Write is enabled.
stages_fill_value = round(df['number_of_stages'].mean(), 0)
df['number_of_stages'] = df['number_of_stages'].fillna(stages_fill_value)

# Should display 0 now that every gap has been filled.
df['number_of_stages'].isnull().sum()

0

In [195]:
# (rows, columns) after filling number_of_stages.
df.shape

(29692, 30)

In [196]:
# Straight-line distance between the surface_* and bh_* coordinate pairs
# (bh_* presumably being the bottom-hole location -- confirm with the data dictionary).
delta_x = df['bh_x'] - df['surface_x']
delta_y = df['bh_y'] - df['surface_y']
df['drift(distance formula)'] = (delta_x ** 2 + delta_y ** 2) ** 0.5

# Remove the four raw coordinate columns that fed the drift feature, then
# discard every row that has no OilPeakRate value.
df = df.drop(columns=['bh_x', 'bh_y', 'surface_y', 'surface_x'])
df = df.dropna(subset=['OilPeakRate'])
df.head()

Unnamed: 0,standardized_operator_name,gross_perforated_length,number_of_stages,total_proppant,total_fluid,true_vertical_depth,ffs_frac_type,proppant_intensity,frac_fluid_intensity,average_stage_length,...,batch_frac_classification,well_family_relationship,frac_type,frac_seasoning,horizontal_midpoint_x,horizontal_midpoint_y,horizontal_toe_x,horizontal_toe_y,OilPeakRate,drift(distance formula)
0,1121,3963.582677,28.0,76000.0,,8712.598425,,19.174572,,,...,Unknown,Standalone Well,Primary Frac,,1383030.678,714796.361,1382668.912,712798.4321,46.623023,4606.429064
1,1022,3179.133858,28.0,113000.0,561096.4,8627.952756,,35.544272,176.49348,,...,Unknown,Infill Child Well,Primary Frac,,1379965.126,709541.7145,1379630.025,707907.6708,59.750009,4100.674738
2,1304,3810.03937,12.0,286571.4286,1621424.0,8801.181102,,75.214821,425.566078,317.503281,...,Non-Batch Frac,Standalone Well,Primary Frac,92.0,1379357.25,715381.2327,1378960.372,713530.449,10.785716,4419.777022
3,1022,3723.425197,11.0,106878.9286,437713.4,8635.826772,,28.704465,117.556659,338.4932,...,Unknown,Standalone Well,Primary Frac,,1383099.61,704058.8508,1382727.299,702098.4882,123.797638,4621.519506
4,1022,3811.023622,28.0,112789.7143,622980.2,8664.370079,,29.595648,163.467933,,...,Unknown,Infill Child Well,Primary Frac,,1379546.414,705150.7519,1379987.487,707123.7258,102.30954,4631.270653


In [197]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.impute import SimpleImputer

# One-hot encode the categorical columns, dropping the first level of each.
df = pd.get_dummies(df, drop_first=True)

# Separate the target from the predictors.
y = df['OilPeakRate']
X = df.drop(columns='OilPeakRate')

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)
print(X_train.shape, X_test.shape)

(15444, 35) (3862, 35)


In [198]:
# NOTE(review): disabled experiment -- every line below is commented out and never runs.
# It sketches an impute -> scale -> Lasso pipeline tuned by a 5-fold grid search over
# lasso__alpha in np.arange(0.1, 3, 0.1), on a split made with random_state=42.
# features = df.drop('OilPeakRate', axis=1).columns
# pipeline = Pipeline([
#     ('imputer', SimpleImputer(strategy='mean')),
#     ('scaler', StandardScaler()),
#     ('lasso', Lasso())
# ])
# X.replace([np.inf, -np.inf], np.nan, inplace=True)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# search = GridSearchCV(pipeline, {'lasso__alpha': np.arange(0.1, 3, 0.1)}, cv=5, scoring='neg_mean_squared_error', verbose=3)
# search.fit(X_train, y_train)

In [199]:
# Clean the three feature matrices (X, X_train, X_test) and mean-impute them.
#
# Changes relative to the earlier in-place version:
#   * no inplace=True on the frames returned by train_test_split, which
#     triggers SettingWithCopyWarning (hidden by the notebook's warning filters);
#   * the imputed frames keep their original row index, so X_train / X_test stay
#     label-aligned with y_train / y_test for any later pandas operation
#     (concat, residuals, joins back to df).
columns_to_drop = ["pad_id", "standardized_operator_name"]


def _clean_features(frame):
    """Return a new frame with +/-inf turned into NaN and ``columns_to_drop`` removed."""
    return frame.replace([np.inf, -np.inf], np.nan).drop(columns=columns_to_drop)


X = _clean_features(X)
X_train = _clean_features(X_train)
X_test = _clean_features(X_test)

# Learn the per-column means from the training rows only.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)


def _impute_features(frame):
    """Apply the fitted imputer and rebuild a DataFrame with the same columns and index."""
    return pd.DataFrame(
        imputer.transform(frame),
        columns=frame.columns,
        index=frame.index,
    )


X_train = _impute_features(X_train)
X = _impute_features(X)
X_test = _impute_features(X_test)

# Every count should be 0 after imputation.
X.isnull().sum()


gross_perforated_length                            0
number_of_stages                                   0
total_proppant                                     0
total_fluid                                        0
true_vertical_depth                                0
proppant_intensity                                 0
frac_fluid_intensity                               0
average_stage_length                               0
average_proppant_per_stage                         0
average_frac_fluid_per_stage                       0
proppant_to_frac_fluid_ratio                       0
frac_fluid_to_proppant_ratio                       0
bin_lateral_length                                 0
frac_seasoning                                     0
horizontal_midpoint_x                              0
horizontal_midpoint_y                              0
horizontal_toe_x                                   0
horizontal_toe_y                                   0
drift(distance formula)                       

In [207]:
# Display the frac_fluid_to_proppant_ratio column of X.
X['frac_fluid_to_proppant_ratio']

0        5.575835
1        4.965455
2        5.658008
3        4.095414
4        5.523377
           ...   
19301    5.737403
19302    5.575835
19303    7.073290
19304    5.096893
19305    4.779506
Name: frac_fluid_to_proppant_ratio, Length: 19306, dtype: float64

In [208]:
from sklearn.metrics import mean_squared_error, r2_score

# Despite the name, no selection happens here: every column of X is used.
selected_features = np.array(X.columns)

# NOTE(review): the features are fed to Lasso unscaled; the L1 penalty is
# scale-sensitive, so standardising first may be worth trying -- confirm intent.
train_features = X_train[selected_features]
test_features = X_test[selected_features]

# Fit a Lasso regression with a fixed regularisation strength.
lasso_model_selected_features = Lasso(alpha=0.1)
lasso_model_selected_features.fit(train_features, y_train)

# Score the hold-out set.
y_pred = lasso_model_selected_features.predict(test_features)
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

# Report both metrics, one per line.
print(f"Mean Squared Error: {mse}", f"R-squared: {r2}", sep="\n")


Mean Squared Error: 14027.408553178759
R-squared: 0.40564643709003967
