In [260]:
# Set up Notebook
% matplotlib inline

# Standard imports
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from matplotlib import cm


# We do this to ignore several specific Pandas warnings
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

In [268]:
df = pd.read_csv('tz_clust.csv')
df = df.dropna(axis=1, how='all')
df = df.dropna(axis=0, how='any')

labels = ['logFCS', 'rcsi', 'hdds']
X=df.drop(labels, axis=1)
y = df[labels]



X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33, random_state=42)

In [269]:
from sklearn.preprocessing import Imputer,StandardScaler

imp = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.fit_transform(X_test)


In [270]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
X_train = poly.fit_transform(X_train)
X_test =  poly.fit_transform(X_test)



In [271]:
from sklearn.linear_model import LinearRegression

# Create and fit our linear regression model to training data
model = LinearRegression(fit_intercept=True)
model.fit(X_train, y_train[labels[1]])
 
# Compute model predictions for test data
pred = model.predict(X_test)

actual = y_test[labels[1]]
r2_linear = stats.pearsonr(actual, pred)[0] ** 2
r2_linear

0.002850238692245421

In [272]:
# fit on logFCS 
from sklearn.ensemble import RandomForestRegressor

# Create Regressor with default properties
rfc = RandomForestRegressor(random_state =0,n_jobs =4,warm_start = True)

parameters = {'max_depth':np.arange( 1,10, 1 ).tolist(), 'min_samples_leaf':np.arange( 1, 4, 1 ).tolist()}
clf = GridSearchCV(rfc, parameters,cv=6, n_jobs= 4, iid = True,  refit= True,pre_dispatch= '2*n_jobs')
clf.fit(X_train, y_train[labels[1]])

# Fit estimator and display score

# Regress on test data
pred = clf.predict(X_test)

actual = y_test[labels[1]]
r2_rfc = stats.pearsonr(actual, pred)[0] ** 2
r2_rfc

0.08747172724410322

In [157]:
# fit on logFCS 
from sklearn.ensemble import RandomForestRegressor

# Create Regressor with default properties
rfc = RandomForestRegressor(random_state =0,n_jobs =4,warm_start = True,max_depth=4, min_samples_leaf=5 )

rfc.fit(X_train, y_train[labels[2]])
# Fit estimator and display score

# Regress on test data
pred = rfc.predict(X_test)

actual = y_test[labels[2]]
r2_rfc = stats.pearsonr(actual, pred)[0] ** 2
r2_rfc

0.7393084231401277

In [102]:
?stats.pearsonr

In [216]:
from sklearn.linear_model import RidgeCV
ridge = RidgeCV(alphas=(0,1), fit_intercept=True, normalize=False, scoring=None, cv=5, gcv_mode='auto', store_cv_values=False) 

# Define different alpha values for different fits
# alpha = [0.0, 1E-6, 1E-4, 1E-2, 1.0]

ridge.fit(X_train, y_train[labels[1]])
pred = ridge.predict(X_test)

actual = y_test[labels[1]]
r2_ridge= stats.pearsonr(actual, pred)[0] ** 2
r2_ridge

0.00011933130163295735

In [259]:
from sklearn.linear_model import BayesianRidge
bridge = BayesianRidge(alpha_1=0.2, alpha_2=10,lambda_1=0.01, compute_score=True)
bridge.fit(X_train, y_train[labels[0]])
pred = bridge.predict(X_test)

actual = y_test[labels[0]]
r2_bridge= stats.pearsonr(actual, pred)[0] ** 2
r2_bridge

0.73639384861396

In [230]:
from sklearn.linear_model import LassoCV

ls = LassoCV(eps=0.1, n_alphas=100, alphas=(0.1, 2), fit_intercept=True,precompute='auto',n_jobs=4, random_state=0, selection='cyclic')

ls = ls.fit(X_train, y_train[labels[0]])
pred = ls.predict(X_test)

actual = y_test[labels[0]]
r2_ls= stats.pearsonr(actual, pred)[0] ** 2
r2_ls

0.7523613480570914

In [235]:
from sklearn.linear_model import ElasticNetCV
en = ElasticNetCV(alphas=(0.01,0.05,0.3), copy_X=True, cv=10, eps=0.004, fit_intercept=True,
       l1_ratio=0.7, max_iter=1000, n_alphas=100, n_jobs=1,
       normalize=False, positive=False, precompute='auto', random_state=0,
       selection='cyclic', tol=0.0001, verbose=0)

en.fit(X_train, y_train[labels[1]])

pred = en.predict(X_test)

actual = y_test[labels[1]]
r2_en= stats.pearsonr(actual, pred)[0] ** 2
r2_en


0.126702402786075

In [252]:
from sklearn.ensemble import GradientBoostingRegressor

# Create Regressor with default properties
gbr = GradientBoostingRegressor(random_state=0,learning_rate=0.05, n_estimators=50,subsample=1, criterion='friedman_mse', min_samples_split=3)

gbr.fit(X_train, y_train[labels[2]])
pred = gbr.predict(X_test)

actual = y_test[labels[2]]
r2_gbr= stats.pearsonr(actual, pred)[0] ** 2
r2_gbr


0.7924555827615288