In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, LassoLarsCV, LassoCV, ElasticNetCV, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import mean_squared_error

In [2]:
# Read in training data
df_train = pd.read_csv('sam_data/rdk_feat_eng_whole_df_train_orig_features.csv')

# Drop the 'smiles' column 
df_train = df_train.drop(['smiles'], axis=1)

# Store gap values
Y_train = df_train.gap.values

# Delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)
X_train = df_train.values
print "Train features:", X_train.shape
print "Train gap:", Y_train.shape

Train features: (999997, 282)
Train gap: (999997,)


In [3]:
# Split training data into training and validation sets as well as begin some k-fold CV
cross_X_train, cross_X_valid, cross_Y_train, cross_Y_valid = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

In [4]:
print "Training set size:", cross_X_train.shape
print "Validation set size:", cross_X_valid.shape

Training set size: (669997, 282)
Validation set size: (330000, 282)


## MODEL SELECTION

Here I'm going to work through many of the same models that I had done before (in TWK_hacking.ipynb) but with a lot more care. I'm excited to nail down some great models.

In [18]:
alphas = np.logspace(-4, 1, 30)
pca_components = [10,15,30,45,60,100,150]
num_estimators = [50,100,200,300]

#### LassoLarsCV

This model performs cross validation, it determines the best and most relevant alphas by itself.

In [15]:
lassoLars_clf = LassoLarsCV()
lassoLars_clf.fit(cross_X_train,cross_Y_train)
print "LassoLars best alpha: {0}".format(lassoLars_clf.alpha_)

LassoLars best alpha: 2.91116456737e-07


In [16]:
y_pred = lassoLars_clf.predict(cross_X_valid) 
    
# Calculate RMSE and update minimum RMSE if possible
LassoLars_RMSE = np.sqrt(mean_squared_error(cross_Y_valid, y_pred))
print "LassoLars RMSE: {}".format(LassoLars_RMSE)

LassoLars RMSE: 0.400655750739


#### LassoCV

Regular LassoCV

In [None]:
lasso_clf = LassoCV()
lasso_clf.fit(cross_X_train,cross_Y_train)
print "Lasso best alpha: {}".format(lasso_clf.alpha_)

In [None]:
y_pred = lasso_clf.predict(cross_X_valid)

# Calculate RMSE
Lasso_RMSE = np.sqrt(mean_squared_error(cross_Y_valid, y_pred))
print "Lasso RMSE: {}".format(Lasso_RMSE)

#### Stochastic Gradient Descent

The idea with SGD is that it minimizes empirical loss by following the path that minimizes the gradient by some learning rate (usually $\propto \alpha$). 

In [13]:
SGD_alpha = 100
SGD_RMSE = 100

for alpha in alphas:
    # Fit model and predict on validation
    sgd_clf = SGDRegressor(alpha=alpha)
    sgd_clf.fit(cross_X_train,cross_Y_train)
    y_pred = sgd_clf.predict(cross_X_valid)
    
    # Calculate RMSE and update minimum RMSE if possible
    RMSE = np.sqrt(mean_squared_error(cross_Y_valid, y_pred))
    if RMSE < SGD_RMSE:
        SGD_RMSE = RMSE
        SGD_alpha = alpha
        best_sgd = sgd_clf
print "SGD minimized with alpha: {0}, resulting in RMSE of: {1}".format(SGD_alpha,SGD_RMSE)

SGD minimized with alpha: 100, resulting in RMSE of: 100


Well, that was a disappointment...

#### Gradient Boosting Regression
Similar in nature to how SGD works. It's main identifying feature is that it fits a regression tree at each stage of the gradient minimization.

In [None]:
gradBoost_RMSE = 100

for n_estimators in num_estimators:

    gradBoost_clf = GradientBoostingRegressor(n_estimators=n_estimators)
    gradBoost_clf.fit(cross_X_train,cross_Y_train)
    y_pred = gradBoost_clf.predict(cross_X_valid)
    
    # Calculate RMSE and update minimum RMSE if possible
    RMSE = np.sqrt(mean_squared_error(cross_Y_valid, y_pred))
    if RMSE < gradBoost_RMSE:
        gradBoost_RMSE = RMSE
        gradBoost_estimators = n_estimators
        best_gradBoost = gradBoost_clf
print "Gradient Boosting minimized with {0} estimators, resulting in RMSE of: {1}".format(gradBoost_estimators,gradBoost_RMSE)

#### PCA + Extra Trees Regressor
Just because this is what was giving us the best performance before. I want to verify that this will rock

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Set parameters to test
alphas = np.logspace(-4, -.5, 30)

# Initialize minimums 

minimum_alpha = 100
minimum_RMSE = 100
for alpha in alphas:
    
    # Fit model and predict on validation
    clf = linear_model.Lasso(alpha=alpha)
    clf.fit(cross_X_train,cross_Y_train)
    y_pred = clf.predict(cross_X_valid) 
    
    # Calculate RMSE and update minimum RMSE if possible
    RMSE = np.sqrt(mean_squared_error(cross_Y_valid, y_pred))
    if RMSE < minimum_RMSE:
        minimum_RMSE = RMSE
        minimum_alpha = alpha