In [7]:
# Set up Notebook
% matplotlib inline

# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from matplotlib import cm


# We do this to ignore several specific Pandas warnings
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

In [8]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
labels = ['clust_logFCS', 'clust_RCSI', 'clust_HDDS']

In [13]:
x_train=train_data.drop(labels, axis=1)
x_train = x_train.drop(x_train.columns[0], axis=1)
x_test=test_data.drop(labels, axis=1)
x_test = x_test.drop(x_test.columns[0], axis=1)

x_train.head()

Unnamed: 0,clust_L12raincytot,clust_L12day1rain,clust_L12maxdays,clust_floodmax,clust_cells_own,clust_price,clust_thinn,clust_roof,clust_hhsize,clust_hh_age,clust_hh_gender,clust_asset,clust_dist_road,clust_dist_admarc,clust_percent_ag,clust_nutri_reten_constrained,clust_elevation,ipc_lag1,ipc_lag12
0,1090.0613,43,17,0.0,0.125,3.824215,0.359375,0.125,5.4375,41.625,1.25,-0.327686,1.395625,5.176875,0.6,0.0,1311.875,1.0,
1,855.86176,58,40,0.0,0.4375,3.917409,0.25,0.1875,5.6875,36.5625,1.3125,-0.202549,0.756875,5.40875,0.6,0.9375,496.0,1.0,
2,1300.2437,53,29,0.0,1.5,3.683867,0.25,0.5,6.375,40.4375,1.375,0.548275,0.163125,17.038126,0.45,1.0,526.75,1.0,
3,1036.9712,52,29,0.0,0.625,4.061391,0.5,0.25,5.4375,46.875,1.1875,-0.077412,5.80375,7.724375,0.4375,0.0,564.1875,1.0,
4,952.1828,54,25,0.0,0.5625,3.8795,0.25,0.0625,7.0,43.8125,1.0,-0.202549,11.286875,11.103125,0.4875,1.0,1539.9375,1.0,


In [14]:
y_train = train_data[labels]
y_test = test_data[labels]
y_test.head()

Unnamed: 0,clust_logFCS,clust_RCSI,clust_HDDS
0,3.666015,12.3125,4.6875
1,3.718879,5.352941,5.235294
2,3.79983,5.08,5.72
3,3.844891,4.285714,5.095238
4,3.687936,0.8125,5.3125


In [15]:
from sklearn.preprocessing import Imputer,StandardScaler

imp = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)
x_train = imp.fit_transform(x_train)
x_test = imp.fit_transform(x_test)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)


In [16]:
from sklearn.linear_model import LinearRegression

# Create and fit our linear regression model to training data
model = LinearRegression(fit_intercept=True)
model.fit(x_train, y_train[labels[0]])

# Compute model predictions for test data
pred = model.predict(x_test)

actual = y_test[labels[0]]
r2_linear = stats.pearsonr(actual, pred)[0] ** 2
r2_linear

0.53194513056515569

In [27]:
# fit on logFCS 
from sklearn.ensemble import RandomForestRegressor

# Create Regressor with default properties
rfc = RandomForestRegressor(random_state=0,warm_start=True,max_depth=5,min_samples_split= 4)

# Fit estimator and display score
rfc = rfc.fit(x_train, y_train[labels[0]])

# Regress on test data
pred = rfc.predict(x_test)

actual = y_test[labels[0]]
r2_rfc = stats.pearsonr(actual, pred)[0] ** 2
r2_rfc

0.5504910015697726

In [49]:
from sklearn.ensemble import ExtraTreesRegressor

# Create Regressor with default properties
extratree = ExtraTreesRegressor(random_state=0)

# Fit estimator and display score
extratree = extratree.fit(x_train, y_train[labels[0]])
# Regress on test data
pred = extratree.predict(x_test)

actual = y_test[labels[0]]
r2_et = stats.pearsonr(actual, pred)[0] ** 2
r2_et




0.47628240545612804

In [35]:
from skgarden import RandomForestQuantileRegressor

ModuleNotFoundError: No module named 'skgarden'

In [53]:
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(warm_start=False,random_state =0, validation_fraction=0.1,learning_rate='adaptive')
ne
# Fit estimator and display score
mlp = mlp.fit(x_train, y_train[labels[0]])
pred = mlp.predict(x_test)

actual = y_test[labels[0]]
r2_mlp= stats.pearsonr(actual, pred)[0] ** 2
r2_mlp

0.03907877225950885

In [52]:
from sklearn.svm import SVR

# Create Regressor with default properties
svr = SVR(kernel='linear',degree =4)

# Fit estimator and display score
svr = svr.fit(x_train, y_train[labels[0]])
pred = svr.predict(x_test)

actual = y_test[labels[0]]
r2_svr= stats.pearsonr(actual, pred)[0] ** 2
r2_svr


0.51530533746819529

In [39]:
from sklearn.gaussian_process import  GaussianProcessRegressor

# Import our kernels
from sklearn.gaussian_process.kernels import Matern, WhiteKernel



# Define custom kernel (Matern + noise)
krnl = 2. + Matern(length_scale=1, nu=1.8) + WhiteKernel(noise_level=10)

# Create Regressor with specified properties
gpr = GaussianProcessRegressor(kernel=krnl, random_state=23)

# Fit estimator and display score
gpr= gpr.fit(x_train, y_train[labels[0]])
pred = gpr.predict(x_test)
actual = y_test[labels[0]]
r2_gpr= stats.pearsonr(actual, pred)[0] ** 2
r2_gpr



0.53303005952019766

In [67]:

from sklearn.linear_model import Ridge

# Define different alpha values for different fits
alpha = [0.0, 1E-6, 1E-4, 1E-2, 1.0]

ridge = Ridge(alpha=20,fit_intercept=False)

ridge.fit(x_train, y_train[labels[0]])
pred = ridge.predict(x_test)

actual = y_test[labels[0]]
r2_ridge= stats.pearsonr(actual, pred)[0] ** 2
r2_ridge

0.53204110298535601

In [70]:
from sklearn.linear_model import BayesianRidge
bridge = BayesianRidge(compute_score=True)
bridge.fit(x_train, y_train[labels[0]])
pred = bridge.predict(x_test)

actual = y_test[labels[0]]
r2_bridge= stats.pearsonr(actual, pred)[0] ** 2
r2_bridge

0.53210244730676637

In [58]:
from sklearn.linear_model import Lasso
alpha = [1E-4, 1E-2, 0.1, 1.0]

ls = Lasso(alpha=alpha[2], fit_intercept=True)


ls = ls.fit(x_train, y_train[labels[0]])
pred = ls.predict(x_test)

actual = y_test[labels[0]]
r2_ls= stats.pearsonr(actual, pred)[0] ** 2
r2_ls

0.57643988277186664

In [46]:
from sklearn.linear_model import ElasticNet
alpha = [1E-4, 1E-2, 0.1, 1.0]

en = ElasticNet(alpha=alpha[2], l1_ratio=0.5)


en.fit(x_train, y_train[labels[0]])
pred = en.predict(x_test)

actual = y_test[labels[0]]
r2_en= stats.pearsonr(actual, pred)[0] ** 2
r2_en


0.59856162067381946

In [69]:
from sklearn.ensemble import GradientBoostingRegressor

# Create Regressor with default properties
gbr = GradientBoostingRegressor(random_state=0)

gbr.fit(x_train, y_train[labels[0]])
pred = gbr.predict(x_test)

actual = y_test[labels[0]]
r2_gbr= stats.pearsonr(actual, pred)[0] ** 2
r2_gbr


0.47713375729068236