In [140]:
# Set up Notebook
% matplotlib inline

# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
import scipy.stats as stats
from matplotlib import cm
from sklearn.model_selection import GridSearchCV



# We do this to ignore several specific Pandas warnings
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

In [141]:
train_data = pd.read_csv('tz_train_clust.csv')
test_data = pd.read_csv('tz_test_clust.csv')
train_data = train_data.dropna(axis=1, how='all')
test_data = test_data.dropna(axis=1, how='all')
train_data = train_data.dropna(axis=0, how='any')
test_data = test_data.dropna(axis=0, how='any')


In [142]:
labels = ['logFCS', 'rCSI', 'HDDS']

In [143]:
x_train=train_data.drop(labels, axis=1)
x_train = x_train.drop(x_train.columns[0], axis=1)
x_test=test_data.drop(labels, axis=1)
x_test = x_test.drop(x_test.columns[0], axis=1)

x_test.head()

Unnamed: 0,cellphone,number_celphones,asset_index,housing_index,nutri_reten_constrained,nutri_avail_constrained,dist_km,maize_price,maize_mktthin,rice_price,rice_mktthin,tmean,lhz_maxdaysnorain,floodmax,raincytot,day1rain,maxdaysnorain
7,1.0,1.25,-0.398681,-0.885833,0.0,0.0,85.343202,57091.0,0.0,153636.0,0.0,27.126058,181.0,0.0,672.757605,171.0,184.0
8,0.875,1.125,-0.398681,1.86503,0.0,0.0,122.521901,34280.0,0.0,130750.0,0.0,27.126058,181.0,0.0,798.059355,58.0,177.0
9,0.75,1.625,-0.398681,0.877928,0.0,0.0,116.855826,49850.0,0.0,141041.7,0.0,27.126058,181.0,0.0,750.990698,57.0,169.0
11,0.75,1.75,-0.398681,-0.88583,0.0,0.0,81.256278,37000.0,0.0,140227.0,0.0,26.275643,185.0,0.0,852.666398,57.0,178.0
13,1.0,1.5,0.327972,-0.492849,0.0,0.0,58.98569,59556.0,0.0,156667.0,0.0,26.275643,185.0,0.0,1045.150783,2.0,174.0


In [144]:
y_train = train_data[labels]
y_test = test_data[labels]
y_test.head()

Unnamed: 0,logFCS,rCSI,HDDS
7,2.320894,2.0,1.625
8,2.677456,5.625,2.5
9,3.260911,2.5,3.0
11,2.830805,6.625,2.375
13,2.991706,4.375,2.375


In [145]:
from sklearn.preprocessing import Imputer,StandardScaler

imp = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)
x_train = imp.fit_transform(x_train)
x_test = imp.fit_transform(x_test)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)


In [130]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
x_train = poly.fit_transform(x_train)
x_test =  poly.fit_transform(x_test)



In [147]:
from sklearn.linear_model import LinearRegression

# Create and fit our linear regression model to training data
model = LinearRegression(fit_intercept=True)
model.fit(x_train, y_train[labels[2]])

# Compute model predictions for test data
pred = model.predict(x_test)

actual = y_test[labels[2]]
r2_linear = stats.pearsonr(actual, pred)[0] ** 2
r2_linear

0.02991389748377062

In [150]:
 y_train[labels[0]]

41      4.347080
43      4.471639
53      4.610158
54      4.182505
59      4.032708
62      3.941582
68      4.001299
69      4.088714
79      4.069027
83          -inf
85      4.170247
89      4.388257
91      3.890500
93      3.895277
96      4.548600
98      4.205883
100     4.089570
104     4.073953
106     3.891820
108     4.001299
109     4.088714
114     4.347080
116     4.471639
120     4.610158
121     4.182505
137     4.338525
140     4.269697
142     4.110513
146     4.263481
151     4.345593
          ...   
4467    4.369448
4468    4.624973
4471        -inf
4474    4.436752
4476    4.025352
4491    4.384392
4494    4.388257
4497    4.218181
4511    4.377014
4526    4.320571
4530        -inf
4538    4.314312
4541    4.151036
4542    4.060443
4544    4.317488
4546        -inf
4548    4.226834
4549    4.360043
4552    3.401197
4558    4.274552
4567    4.465908
4569    4.278727
4582    4.439623
4583        -inf
4586    4.382417
4589        -inf
4592    4.303726
4599    4.2116

In [106]:
# fit on logFCS 
from sklearn.ensemble import RandomForestRegressor

# Create Regressor with default properties
rfc = RandomForestRegressor(random_state =0,n_jobs =4,warm_start = True)

parameters = {'max_depth':np.arange( 1,8, 1 ).tolist(), 'min_samples_leaf':np.arange( 1, 6, 1 ).tolist()}
clf = GridSearchCV(rfc, parameters,cv=6, n_jobs= 4, iid = True,  refit= True,pre_dispatch= '2*n_jobs')
clf.fit(x_train, y_train[labels[1]])

# Fit estimator and display score

# Regress on test data
pred = clf.predict(x_test)
actual = y_test[labels[1]]
r2_rfc = stats.pearsonr(actual, pred)[0] ** 2
r2_rfc

0.00408306689596873

In [67]:
from sklearn.ensemble import ExtraTreesRegressor

# Create Regressor with default properties
extratree = ExtraTreesRegressor(random_state=0)

# Fit estimator and display score
extratree = extratree.fit(x_train, y_train[labels[2]])
# Regress on test data
pred = extratree.predict(x_test)

actual = y_test[labels[2]]
r2_et = stats.pearsonr(actual, pred)[0] ** 2
r2_et




0.0013496236653385944

In [107]:
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(warm_start=False,random_state =0, validation_fraction=0.1,learning_rate='adaptive')
# Fit estimator and display score
mlp = mlp.fit(x_train, y_train[labels[2]])
pred = mlp.predict(x_test)

actual = y_test[labels[2]]
r2_mlp= stats.pearsonr(actual, pred)[0] ** 2
r2_mlp

0.0014163196007047176

In [139]:
from sklearn.svm import SVR

# Create Regressor with default properties
svr = SVR(kernel='linear',degree =4)

# Fit estimator and display score
svr = svr.fit(x_train, y_train[labels[2]])
pred = svr.predict(x_test)

actual = y_test[labels[2]]
r2_svr= stats.pearsonr(actual, pred)[0] ** 2
r2_svr


0.027575132000468145

In [16]:
from sklearn.gaussian_process import  GaussianProcessRegressor

# Import our kernels
from sklearn.gaussian_process.kernels import Matern, WhiteKernel



# Define custom kernel (Matern + noise)
krnl = 2. + Matern(length_scale=1, nu=1.8) + WhiteKernel(noise_level=10)

# Create Regressor with specified properties
gpr = GaussianProcessRegressor(kernel=krnl, random_state=0)

# Fit estimator and display score
gpr= gpr.fit(x_train, y_train[labels[2]])
pred = gpr.predict(x_test)
actual = y_test[labels[2]]
r2_gpr= stats.pearsonr(actual, pred)[0] ** 2
r2_gpr



0.69498912222257903

In [109]:

from sklearn.linear_model import Ridge

# Define different alpha values for different fits
alpha = [0.0, 1E-6, 1E-4, 1E-2, 1.0]

ridge = Ridge(alpha=20,fit_intercept=False)

ridge.fit(x_train, y_train[labels[2]])
pred = ridge.predict(x_test)

actual = y_test[labels[2]]
r2_ridge= stats.pearsonr(actual, pred)[0] ** 2
r2_ridge

0.004891184646259924

In [110]:
from sklearn.linear_model import BayesianRidge
bridge = BayesianRidge(compute_score=True)
bridge.fit(x_train, y_train[labels[2]])
pred = bridge.predict(x_test)

actual = y_test[labels[2]]
r2_bridge= stats.pearsonr(actual, pred)[0] ** 2
r2_bridge

0.016619255797532755

In [117]:
from sklearn.linear_model import Lasso
alpha = [1E-4, 1E-2, 0.1, 1.0]

ls = Lasso(alpha=alpha[0], fit_intercept=True)


ls = ls.fit(x_train, y_train[labels[1]])
pred = ls.predict(x_test)

actual = y_test[labels[1]]
r2_ls= stats.pearsonr(actual, pred)[0] ** 2
r2_ls

0.020322407466060177

In [112]:
from sklearn.linear_model import ElasticNet
alpha = [1E-4, 1E-2, 0.1, 1.0]

en = ElasticNet(alpha=alpha[2], l1_ratio=0.8)


en.fit(x_train, y_train[labels[2]])
pred = en.predict(x_test)

actual = y_test[labels[2]]
r2_en= stats.pearsonr(actual, pred)[0] ** 2
r2_en


0.003970719995282263

In [118]:
from sklearn.ensemble import GradientBoostingRegressor

# Create Regressor with default properties
gbr = GradientBoostingRegressor(random_state=0)

gbr.fit(x_train, y_train[labels[1]])
pred = gbr.predict(x_test)

actual = y_test[labels[1]]
r2_gbr= stats.pearsonr(actual, pred)[0] ** 2
r2_gbr


0.031751391418244776