In [1]:
import pandas as pd
import numpy as np
from os.path import join, dirname
from os import getcwd

In [2]:
# The data directory is a sibling of the current working directory:
# <parent of cwd>/data
path = join(dirname(getcwd()), "data")

In [3]:
# Read completion.csv from the data directory, then drop three columns
# before the frame is split into features and targets.
dropped_columns = ['Unnamed: 0', 'pump rate (cubic feet/min)', 'proppant weight (lbs)']
completion_csv = join(path, "completion.csv")
data = pd.read_csv(completion_csv).drop(columns=dropped_columns)

In [4]:
# The two coordinate columns are the model inputs; every other column of
# `data` becomes a separate prediction target.
coordinate_columns = ["easting", "northing"]
X = data[coordinate_columns]
y = data.drop(columns=coordinate_columns)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

In [6]:
# Hold out 30% of the rows for testing. The split is seeded so that a
# Restart & Run All reproduces the same partition; without a seed every run
# shuffles differently and the reported scores cannot be compared.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# One Series per target column (in column order) so that each target can be
# fitted and scored on its own.
train_data = [y_train[column] for column in y_train.columns]
test_data = [y_test[column] for column in y_test.columns]

In [7]:
def create_forest(random_state=None):
    """Return a new, unfitted RandomForestRegressor.

    random_state is forwarded to the estimator. The default (None) leaves
    the estimator unseeded, exactly as when the factory is called with no
    arguments.
    """
    return RandomForestRegressor(random_state=random_state)


def create_bagging(random_state=None):
    """Return a new, unfitted BaggingRegressor.

    random_state is forwarded to the estimator. The default (None) leaves
    the estimator unseeded, exactly as when the factory is called with no
    arguments.
    """
    return BaggingRegressor(random_state=random_state)


def create_boosting(random_state=None):
    """Return a new, unfitted GradientBoostingRegressor.

    random_state is forwarded to the estimator. The default (None) leaves
    the estimator unseeded, exactly as when the factory is called with no
    arguments.
    """
    return GradientBoostingRegressor(random_state=random_state)


def create_perceptron(random_state=None):
    """Return a new, unfitted MLPRegressor.

    random_state is forwarded to the estimator. The default (None) leaves
    the estimator unseeded, exactly as when the factory is called with no
    arguments.
    """
    return MLPRegressor(random_state=random_state)

In [8]:
# Estimator factories (called with no arguments to build a fresh model).
# create_perceptron is defined above but is not part of this list.
models = [
    create_forest,
    create_bagging,
    create_boosting,
]

In [9]:
# NOTE(review): no visible cell reads from or appends to this list - confirm
# whether it is still needed.
data_sets = list()

In [10]:
def print_scores(model, train_x, train_y, test_x, test_y, output=True):
    """Score a fitted model on the training and the testing data.

    Parameters
    ----------
    model : object
        Anything exposing ``score(X, y)``; it is called once per data set.
    train_x, train_y
        Inputs and expected outputs passed to ``model.score`` for the
        training score.
    test_x, test_y
        Inputs and expected outputs passed to ``model.score`` for the
        testing score.
    output : bool, default True
        When true, both scores are also printed to stdout.

    Returns
    -------
    tuple
        ``(train_score, test_score)`` exactly as returned by ``model.score``.
    """
    train_score = model.score(train_x, train_y)
    test_score = model.score(test_x, test_y)
    if output:
        print("Training: {}".format(train_score))
        print("Testing: {}".format(test_score))
    return train_score, test_score

In [11]:
# For every target column: fit each candidate estimator on the training
# split and keep the one with the highest score on the test split.
good_models = []
for target_train, target_test in zip(train_data, test_data):
    # Reset the winner for each target so that a model selected for an
    # earlier target can never be carried over into this one. Starting from
    # -inf means any real score beats the initial value.
    best_model = None
    best_score = float("-inf")
    print("---------")
    print(target_train.name, "\n")

    for make_model in models:
        candidate = make_model().fit(X_train, target_train)
        print(candidate)
        # print_scores returns (train_score, test_score); only the test
        # score drives the selection.
        test_score = print_scores(candidate, X_train, target_train, X_test, target_test)[1]
        if test_score > best_score:
            best_model = candidate
            best_score = test_score
        print("\n")

    if best_model is None:
        # Reached when `models` is empty or no test score compared greater
        # than -inf (e.g. every score was NaN).
        raise ValueError(
            "no model could be selected for target {!r}".format(target_train.name)
        )
    good_models.append(best_model)
    print("---------")

---------
porosity 

RandomForestRegressor()
Training: 0.9546055299382701
Testing: 0.6810667953166563


BaggingRegressor()
Training: 0.9409568151441846
Testing: 0.6611728853991392


GradientBoostingRegressor()
Training: 0.7332538045586356
Testing: 0.7253005308093994


---------
---------
permeability 

RandomForestRegressor()
Training: 0.9484920535617455
Testing: 0.6455653918010315


BaggingRegressor()
Training: 0.9325695836992649
Testing: 0.6222445097530629


GradientBoostingRegressor()
Training: 0.6880616475070405
Testing: 0.6804454768339836


---------
---------
Poisson's ratio 

RandomForestRegressor()
Training: 0.9999644745807965
Testing: 0.9997774022990471


BaggingRegressor()
Training: 0.9999478469809303
Testing: 0.9997407509521802


GradientBoostingRegressor()
Training: 0.9573936559930271
Testing: 0.9550340240396877


---------
---------
Young's Modulus 

RandomForestRegressor()
Training: 0.9999820319353482
Testing: 0.9998862383813243


BaggingRegressor()
Training: 0.9999723961

In [12]:
# Summarise, per target, which model was kept and how it scores on both splits.
for idx, target_train in enumerate(train_data):
    chosen = good_models[idx]
    scores = print_scores(chosen, X_train, target_train, X_test, test_data[idx], output=False)
    print("{}, model: {}".format(target_train.name, chosen))
    print("train score: {}, test score: {}".format(*scores), end="\n\n")

porosity, model: GradientBoostingRegressor()
train score: 0.7332538045586356, test score: 0.7253005308093994

permeability, model: GradientBoostingRegressor()
train score: 0.6880616475070405, test score: 0.6804454768339836

Poisson's ratio, model: RandomForestRegressor()
train score: 0.9999644745807965, test score: 0.9997774022990471

Young's Modulus, model: RandomForestRegressor()
train score: 0.9999820319353482, test score: 0.9998862383813243

water saturation, model: RandomForestRegressor()
train score: 0.9999844072273653, test score: 0.9998936817817972

oil saturation, model: RandomForestRegressor()
train score: 0.9999823600206093, test score: 0.9998965988917111



In [13]:
# using joblib since it handles np arrays better than pickle
from joblib import dump

In [14]:
from os import makedirs

# Each selected model is written to "<cwd>/models/<target name>.model".
model_path = join(getcwd(), 'models')
# Writing into a directory that does not exist fails, so create it first;
# exist_ok makes re-running this cell harmless.
makedirs(model_path, exist_ok=True)

for idx, target_train in enumerate(train_data):
    dump(good_models[idx], join(model_path, target_train.name + '.model'))