In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, ElasticNetCV, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

In [2]:
"""
Read in train and test as Pandas DataFrames
"""
# Parse both CSV files (training set first, then test set) into DataFrames.
df_train, df_test = map(pd.read_csv, ("train.csv", "test.csv"))

In [None]:
# df_train.head()

In [None]:
# df_test.head()

In [3]:
# Target vector: the 'gap' value of every training row.
Y_train = df_train['gap'].values
# Number of training rows; once train and test are stacked this is the
# position of the first test row.
test_idx = len(df_train)
# Leave only feature columns in each frame: the training set loses the
# 'gap' target and the test set loses its 'Id' column.
df_train = df_train.drop('gap', axis=1)
df_test = df_test.drop('Id', axis=1)

In [4]:
# One DataFrame holding every train and test example, so feature engineering
# can be applied to both in a single pass. Rows [0, test_idx) are the
# training rows; the test rows follow.
# ignore_index=True gives the stacked frame a single unique 0..n-1 row index.
# Without it each half keeps its own 0-based labels, so test labels repeat
# training labels and any label-aligned assignment of a freshly built
# Series/DataFrame as a new column could silently hand test rows the values
# computed for training rows.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [None]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
# Kept disabled on purpose: as the note above says, this feature lowers the score.
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


## This is a first attempt at some [feature engineering](http://machinelearningmastery.com/discover-feature-engineering-how-to-engineer-features-and-how-to-get-good-at-it/). The idea is that the number of  [branches](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system#Branching) influences the gap between HOMO and LUMO levels.

## We pull the specific compounds of interest from [this poster.](http://digitalscholarship.unlv.edu/cgi/viewcontent.cgi?article=1036&context=focs_ug_research)

## This feature aggregates the number of active features. The idea is that if more features are active in a compound, it may result in higher orbital overlap and thus a lower HOMO/LUMO gap.

## The last feature determines, from the SMILES encoding, whether or not there is a benzene ring in the compound. Benzene rings are held together with pi bonds, which are more conjugated (which means they are closer together in energy).

## We can do the same for double, triple and quadruple bonds

In [5]:
def _has_fragment(smiles, fragment):
    """Return a 0/1 Series marking which SMILES strings contain `fragment`.

    smiles   -- pandas Series of SMILES strings.
    fragment -- literal text to look for (plain substring test, no pattern
                syntax, so characters such as '[' and '(' are taken as-is).
    """
    return smiles.apply(lambda text: int(fragment in text))


# Every '(' in a SMILES string opens one branch.
num_branches = df_all.smiles.apply(lambda x: x.count('('))
# Row-wise sum of the original feat_NNN columns only. Selecting them by name
# keeps the text 'smiles' column out of the sum explicitly (instead of relying
# on pandas skipping non-numeric columns) and keeps the result unchanged if
# this cell is re-run after the engineered columns below already exist.
num_active_feat = df_all.filter(regex=r'^feat_\d+$').sum(axis=1)
# Ring-system flags.
# NOTE(review): these are literal substring tests on the SMILES text, not
# chemical substructure matches; a ring written with different ring-closure
# digits or atom ordering than the fragment below is not detected.
has_benzene_ring = _has_fragment(df_all.smiles, 'c1ccccc1')
has_thiophene = _has_fragment(df_all.smiles, 'c1ccsc1')
has_benzothiophene = _has_fragment(df_all.smiles, 's2c1ccccc1cc2')
has_dibenzothiophene = _has_fragment(df_all.smiles, 'c1ccc2c(c1)c3ccccc3s2')
has_carbazole = _has_fragment(df_all.smiles, 'c1ccc2c(c1)c3ccccc3[nH]2')
has_fluorene = _has_fragment(df_all.smiles, 'c1ccc-2c(c1)Cc3c2cccc3')
# '=' is the SMILES double-bond symbol.
num_double_bonds = df_all.smiles.apply(lambda x: x.count('='))
# Disabled counters for higher-order bonds. In SMILES '#' is the triple-bond
# symbol and '$' the quadruple-bond symbol.
# num_triple_bonds = df_all.smiles.apply(lambda x: x.count('#'))
# num_quad_bonds = df_all.smiles.apply(lambda x: x.count('$'))

# Append the engineered features as new columns (order matters for the
# column layout of the feature matrix built later).
df_all['num_branches'] = num_branches
df_all['num_active_feat'] = num_active_feat
df_all['has_benzene_ring'] = has_benzene_ring
df_all['has_thiophene'] = has_thiophene
df_all['has_benzothiophene'] = has_benzothiophene
df_all['has_dibenzothiophene'] = has_dibenzothiophene
df_all['has_carbazole'] = has_carbazole
df_all['has_fluorene'] = has_fluorene
df_all['num_double_bonds'] = num_double_bonds

# df_all['num_triple_bonds'] = num_triple_bonds
# df_all['num_quad_bonds'] = num_quad_bonds

In [6]:
# Preview the first rows to check that the engineered feature columns were added.
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_256,num_branches,num_active_feat,has_benzene_ring,has_thiophene,has_benzothiophene,has_dibenzothiophene,has_carbazole,has_fluorene,num_double_bonds
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,...,0,3,10,0,0,0,0,0,0,0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,...,0,1,17,0,0,0,0,0,0,5
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,...,0,2,19,0,0,0,0,0,0,1
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,...,0,1,18,0,0,0,0,0,0,4
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,...,0,1,10,0,0,0,0,0,0,0


In [7]:
# Drop the raw 'smiles' text so only numeric feature columns remain.
df_all = df_all.drop(['smiles'], axis=1)
vals = df_all.values
# Split the stacked matrix back apart by position: the first `test_idx` rows
# are the training examples, everything after them is the test set.
X_train = vals[:test_idx]
X_test = vals[test_idx:]
# Each message is formatted into a single string before printing, so the
# output is the same whether `print` is the Python 2 statement or the
# Python 3 function.
print("Train features: %s" % (X_train.shape,))
print("Train gap: %s" % (Y_train.shape,))
print("Test features: %s" % (X_test.shape,))
# Sparse (CSR) copies of the same matrices, for the cells that fit on
# Xsp_train / Xsp_test.
Xsp_train = sp.sparse.csr_matrix(X_train)
Xsp_test = sp.sparse.csr_matrix(X_test)

Train features: (1000000, 265)
Train gap: (1000000,)
Test features: (824230, 265)


In [8]:
def write_to_file(filename, predictions):
    """Save predictions as a two-column CSV file.

    The file starts with the header line ``Id,Prediction``; every following
    line holds a 1-based row id and ``str()`` of the matching prediction.
    An existing file at `filename` is overwritten.

    filename    -- path of the file to write.
    predictions -- iterable of predicted values, in row order.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            ",".join((str(row_id), str(value))) + "\n"
            for row_id, value in enumerate(predictions, 1)
        )

## I want to do [PCA](http://sebastianraschka.com/Articles/2014_pca_step_by_step.html) to get some of the interdependencies between features figured out and use the reduced data set to improve regression characteristics. 

## Looking into things, it appears that it may be a good idea to do GridSearchCV over a pipelined regression where we compute the principal components and then fit a regression within the cross-validation construct.

In [None]:
# Project the features onto 30 principal components, then fit a
# cross-validated Lasso on the reduced training set.
pca = PCA(n_components=30)
lassoCV = LassoCV()

# The projection is learned from the training rows only and then applied
# unchanged to the test rows.
X_transf = pca.fit_transform(X_train)
X_test_transf = pca.transform(X_test)

# fit() returns the fitted estimator itself, so one call is enough; fitting
# again on the same data would only repeat the whole cross-validation.
estimator = lassoCV.fit(X_transf, Y_train)

pca_lasso_pred = estimator.predict(X_test_transf)

In [None]:
# Write the PCA + LassoCV test-set predictions to a CSV file.
write_to_file("pcaLasso_fullSet_TWK_1Feb.csv", pca_lasso_pred)

## Here I'll try just an Extra Trees Regressor, both with and without PCA

In [None]:
# Two Extra Trees regressors with identical settings: one trained on the
# 30-component PCA projection, one trained on the full feature matrix.
pca = PCA(n_components=30)
# Fixed seeds make both forests reproducible from run to run, in line with
# the other model cells in this notebook, which all seed their estimators.
extraTrees_pca = ExtraTreesRegressor(n_estimators=25, n_jobs=2, random_state=0)
extraTrees = ExtraTreesRegressor(n_estimators=25, n_jobs=2, random_state=0)

# The projection is learned from the training rows only and then applied
# unchanged to the test rows.
X_transf = pca.fit_transform(X_train)
X_test_transf = pca.transform(X_test)

tree_est_wPCA = extraTrees_pca.fit(X_transf, Y_train)
tree_estimator = extraTrees.fit(X_train, Y_train)

pca_exTree_pred = tree_est_wPCA.predict(X_test_transf)
exTree_pred = tree_estimator.predict(X_test)

In [None]:
# Save both Extra Trees prediction sets: the model trained on PCA-reduced
# features and the model trained on the full feature matrix.
write_to_file("pcaExtraTree_fullSet_TWK_2Feb.csv", pca_exTree_pred)
write_to_file("ExtraTree_fullSet_TWK_2Feb.csv", exTree_pred)

## Now I will incorporate AdaBoost with Extra Trees to weight the representations, using a preliminary GridSearchCV representation to do some parameter tuning

In [None]:
# PCA is disabled for this experiment; the model is fit directly on the
# sparse feature matrices.
# pca = PCA(n_components=30)
# X_transf = pca.fit_transform(X_train)
# X_test_transf = pca.transform(X_test)
rng = np.random.RandomState(1)

# Only the AdaBoost loss function is tuned.
param_grid = {'loss': ['linear','square']}

ETR = ExtraTreesRegressor(n_estimators=100, random_state=rng)
# The booster itself must be seeded: AdaBoost uses its own random_state for
# its resampling and for seeding the base estimator at every boosting
# iteration, so seeding only the base estimator does not make the run
# reproducible.
ABR = AdaBoostRegressor(base_estimator=ETR, n_estimators=300, random_state=rng)

# 3-fold cross-validated search over param_grid.
estimator = GridSearchCV(ABR, param_grid = param_grid, cv=3)
estimator.fit(Xsp_train, Y_train)

pca_adaBoost_gs_pred = estimator.predict(Xsp_test)

In [None]:
# Write the AdaBoost grid-search predictions to a CSV file.
# NOTE(review): despite the 'pca' prefix in the file and variable names, the
# cell that builds this prediction has its PCA lines commented out and fits
# on Xsp_train directly.
write_to_file("pca_adaBoost_gsCV_fullSet_TWK_3Feb.csv", pca_adaBoost_gs_pred)

## Stochastic Gradient Descent Regression on CSR matrices...

In [9]:
# Linear model trained by stochastic gradient descent with an elastic-net
# penalty, fit directly on the sparse (CSR) training matrix.
rng = np.random.RandomState(0)
# fit() returns the estimator, so construction and training are chained.
clf = SGDRegressor(penalty='elasticnet', random_state=rng).fit(Xsp_train, Y_train)

sgd_pred = clf.predict(Xsp_test)

In [None]:
# Write the SGD regression predictions to a CSV file.
write_to_file("sgd_regression_fullSet_TWK_3Feb.csv", sgd_pred)

## Gradient Boosting Regression

In [None]:
# Gradient boosting regressor with default hyper-parameters, seeded for
# reproducibility and fit on the sparse (CSR) training matrix.
rng = np.random.RandomState(2)
# fit() returns the estimator, so construction and training are chained.
clf = GradientBoostingRegressor(random_state=rng).fit(Xsp_train, Y_train)

gradBoost_pred = clf.predict(Xsp_test)

In [None]:
# Write the gradient boosting predictions to a CSV file.
write_to_file("gradBoost_regression_fullSet_TWK_3Feb.csv",gradBoost_pred)

## Attempted pipelining a grid search on pca fronting Lasso regression

In [None]:
# Two-step pipeline: PCA feeds its components into a Lasso regression.
pca = PCA()
lasso = Lasso()
pipe = Pipeline(steps=[('pca', pca), ('lasso', lasso)])

# Candidate values for the search, plus the parallelism and fold count.
n_components = [15, 45]
alphas = [0.01, 1]
n_jobs = 2
n_folds = 3

# Grid keys use the '<step name>__<parameter>' form so each value list is
# routed to the matching pipeline step.
estimator = GridSearchCV(
    pipe,
    {'pca__n_components': n_components, 'lasso__alpha': alphas},
    n_jobs=n_jobs,
    cv=n_folds,
)
estimator.fit(X_train, Y_train)

pca_lasso_pred = estimator.predict(X_test)

In [None]:
# Write the PCA -> Lasso pipeline grid-search predictions to a CSV file.
write_to_file("pcaLasso_TWK_1Feb.csv", pca_lasso_pred)

## The hope is to investigate several methods of regression... Like potentially ElasticNet or Lasso regression with Cross-validation... Lots of ideas.

## The idea here is that I'll train a type of SVM used for regression purposes, with a grid search cross-validation on top of it, so as to get the best RMSE using that method.

### I'm using the set-up used in HW 3 from data science.

In [None]:
from sklearn.svm import SVR

In [None]:
# Support-vector regressor plus candidate hyper-parameter values for a grid search.
svr=SVR()
# Cs=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
# Cs=[0.001, 0.01, 0.1, 1.0]
Cs=[1.0]
# In the visible notebook, kernels and eps are only referenced by a
# commented-out parameter grid.
kernels = ['linear', 'rbf']
eps = [0.05, 0.1, 0.15, 0.2]
# NOTE(review): `reducedf_RFfeatimport` is not defined anywhere in the visible
# notebook, so on a fresh kernel this line raises NameError. Presumably it is
# a reduced-feature DataFrame with the same train-then-test row layout as
# df_all -- TODO: add the cell that builds it, or confirm the intended source.
vals = reducedf_RFfeatimport.values
# These assignments replace the X_train / X_test built earlier from df_all.
X_train = vals[:test_idx]
X_test = vals[test_idx:]

In [None]:
# Grid-search the SVR with 5-fold cross-validation on 3 parallel jobs.
# Only C is searched at the moment; the wider grid is kept for reference.
# parameters = {"C": Cs, 'kernel': kernels, 'epsilon': eps}
parameters = {"C": Cs}
n_jobs, n_folds = 3, 5
gs = GridSearchCV(svr, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
gs.fit(X_train, Y_train)
# The message is formatted into a single string before printing, so the
# output is the same whether `print` is the Python 2 statement or the
# Python 3 function.
print("BEST %s %s %s" % (gs.best_params_, gs.best_score_, gs.grid_scores_))
# GridSearchCV is left at its default refit=True, so best_estimator_ has
# already been refit on all of X_train / Y_train; fitting it again would only
# repeat that training run.
best = gs.best_estimator_

In [None]:
# Predict the test set with the SVR selected by the grid search.
SVR_1stpass = best.predict(X_test)

In [None]:
# Write the SVR predictions to a CSV file.
write_to_file("SVR_1Feb16.csv", SVR_1stpass)