In [183]:
# Set up Notebook
% matplotlib inline

# Standard imports
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from matplotlib import cm


# We do this to ignore several specific Pandas warnings
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

In [184]:
df = pd.read_csv('tz_clust.csv')
df = df.dropna(axis=1, how='all')
df = df.dropna(axis=0, how='any')

labels = ['logFCS', 'rcsi', 'hdds']
X=df.drop(labels, axis=1)
y = df[labels]



X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33, random_state=42)

In [150]:
X_train.head()

Unnamed: 0,cellphone,number_celphones,asset_index,housing_index,nutri_reten_constrained,nutri_avail_constrained,dist_km,maize_price,maize_mktthin,rice_price,rice_mktthin,tmean,floodmax,raincytot,day1rain,maxdaysnorain
2132,0.285714,0.285714,-0.398681,0.910648,0.0,0.0,120.15645,26583.0,0.0,147500.0,0.0,21.880798,0.0,716.417804,14.0,84.0
2157,0.875,1.5,-0.398681,-0.885824,0.0,0.0,104.9172,41333.0,0.0,148750.0,0.0,24.135103,0.0,831.109474,0.0,124.0
762,0.444444,1.444444,-0.398681,0.162118,1.0,1.0,62.444283,41694.285,0.0,122828.57,0.0,25.390187,0.0,1121.393012,64.0,122.0
1504,1.0,2.0,-0.398678,-0.885821,1.0,1.0,46.217674,52304.168,0.0,97866.664,0.0,24.226832,0.0,724.905183,20.0,114.0
1629,0.0,0.0,-0.398681,2.258011,1.0,1.0,59.450397,39775.0,0.285714,129646.57,0.285714,25.319608,0.0,324.277307,133.0,174.0


In [151]:
y_test.head()

Unnamed: 0,logFCS,rcsi,hdds
1323,3.908837,16.416666,4.083334
916,4.091643,6.666666,5.555555
2311,1.543737,2.8,1.6
722,4.295629,1.555556,6.111111
1226,4.049976,0.888889,6.333334


In [164]:
from sklearn.preprocessing import Imputer,StandardScaler

imp = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.fit_transform(X_test)


In [165]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
X_train = poly.fit_transform(X_train)
X_test =  poly.fit_transform(X_test)



In [166]:
from sklearn.linear_model import LinearRegression

# Create and fit our linear regression model to training data
model = LinearRegression(fit_intercept=True)
model.fit(X_train, y_train[labels[2]])
 
# Compute model predictions for test data
pred = model.predict(X_test)

actual = y_test[labels[2]]
r2_linear = stats.pearsonr(actual, pred)[0] ** 2
r2_linear

0.4717113023757812

In [167]:
# fit on logFCS 
from sklearn.ensemble import RandomForestRegressor

# Create Regressor with default properties
rfc = RandomForestRegressor(random_state =0,n_jobs =4,warm_start = True)

parameters = {'max_depth':np.arange( 1,10, 1 ).tolist(), 'min_samples_leaf':np.arange( 1, 4, 1 ).tolist()}
clf = GridSearchCV(rfc, parameters,cv=6, n_jobs= 4, iid = True,  refit= True,pre_dispatch= '2*n_jobs')
clf.fit(X_train, y_train[labels[2]])

# Fit estimator and display score

# Regress on test data
pred = clf.predict(X_test)

actual = y_test[labels[2]]
r2_rfc = stats.pearsonr(actual, pred)[0] ** 2
r2_rfc

0.759745727934612

In [157]:
# fit on logFCS 
from sklearn.ensemble import RandomForestRegressor

# Create Regressor with default properties
rfc = RandomForestRegressor(random_state =0,n_jobs =4,warm_start = True,max_depth=4, min_samples_leaf=5 )

rfc.fit(X_train, y_train[labels[2]])
# Fit estimator and display score

# Regress on test data
pred = rfc.predict(X_test)

actual = y_test[labels[2]]
r2_rfc = stats.pearsonr(actual, pred)[0] ** 2
r2_rfc

0.7393084231401277

In [102]:
?stats.pearsonr

In [168]:
from sklearn.linear_model import RidgeCV
ridge = RidgeCV(alphas=(0,2), fit_intercept=True, normalize=False, scoring=None, cv=5, gcv_mode='auto', store_cv_values=False) 

# Define different alpha values for different fits
# alpha = [0.0, 1E-6, 1E-4, 1E-2, 1.0]

ridge.fit(X_train, y_train[labels[2]])
pred = ridge.predict(X_test)

actual = y_test[labels[2]]
r2_ridge= stats.pearsonr(actual, pred)[0] ** 2
r2_ridge

0.4868157217016907

In [44]:
from sklearn.linear_model import BayesianRidge
bridge = BayesianRidge(alpha_1=30, alpha_2=70,lambda_1=0.01, compute_score=True)
bridge.fit(X_train, y_train[labels[2]])
pred = bridge.predict(X_test)

actual = y_test[labels[2]]
r2_bridge= stats.pearsonr(actual, pred)[0] ** 2
r2_bridge

0.004029267771234072

In [161]:
from sklearn.linear_model import LassoCV

ls = LassoCV(eps=0.1, n_alphas=100, alphas=(0.6, 1), fit_intercept=True,precompute='auto',n_jobs=4, random_state=0, selection='cyclic')

ls = ls.fit(X_train, y_train[labels[2]])
pred = ls.predict(X_test)

actual = y_test[labels[2]]
r2_ls= stats.pearsonr(actual, pred)[0] ** 2
r2_ls

0.6547893206227314

In [162]:
from sklearn.linear_model import ElasticNetCV
en = ElasticNetCV(alphas=(0.1,0.02,3,2), copy_X=True, cv=10, eps=0.004, fit_intercept=True,
       l1_ratio=0.33, max_iter=1000, n_alphas=100, n_jobs=1,
       normalize=False, positive=False, precompute='auto', random_state=0,
       selection='cyclic', tol=0.0001, verbose=0)

en.fit(X_train, y_train[labels[2]])

pred = en.predict(X_test)

actual = y_test[labels[2]]
r2_en= stats.pearsonr(actual, pred)[0] ** 2
r2_en


0.7664872571648382

In [163]:
from sklearn.ensemble import GradientBoostingRegressor

# Create Regressor with default properties
gbr = GradientBoostingRegressor(random_state=0,learning_rate=0.1, n_estimators=19,subsample=1, criterion='friedman_mse', min_samples_split=3)

gbr.fit(X_train, y_train[labels[2]])
pred = gbr.predict(X_test)

actual = y_test[labels[2]]
r2_gbr= stats.pearsonr(actual, pred)[0] ** 2
r2_gbr


0.749254871599868