In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
try:
    # scikit-learn >= 0.18 provides GridSearchCV in model_selection
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # legacy location, removed in scikit-learn 0.20
    from sklearn.grid_search import GridSearchCV

In [None]:
"""
Read in train and test as Pandas DataFrames
"""
# Load the training and test tables from the current working directory.
df_train, df_test = [pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")]

In [None]:
# df_train.head()

In [None]:
# df_test.head()

In [None]:
# Number of training rows; once train and test are stacked, test rows begin at this offset.
test_idx = len(df_train.index)
# Pull the regression target ('gap') out as a NumPy array.
Y_train = df_train['gap'].values
# Remove the target from the training features and the 'Id' column from the test set.
df_train = df_train.drop('gap', axis=1)
df_test = df_test.drop('Id', axis=1)

In [None]:
# Stack train and test rows into one frame so feature engineering is applied to both at once.
# ignore_index=True gives every row a unique 0..N-1 label; without it the test rows repeat
# the training labels, and index-aligned column assignment on duplicate labels is error-prone.
# Row order is unchanged, so the first `test_idx` rows are still the training examples.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all.head()

In [None]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


## This is a first attempt at some [feature engineering](http://machinelearningmastery.com/discover-feature-engineering-how-to-engineer-features-and-how-to-get-good-at-it/). The idea is that the number of [branches](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system#Branching) influences the gap between HOMO and LUMO levels.

## This feature aggregates the number of active features. The idea is that the more features that are active in a compound, the higher the orbital overlap may be, and thus the lower the HOMO/LUMO gap.

## The last feature determines, from the SMILES encoding, whether or not there is a benzene ring in the compound. Benzene rings are held together by conjugated pi bonds, and conjugation brings the HOMO and LUMO levels closer together in energy.

## We can similarly count the double, triple and quadruple bonds in the SMILES string.

In [None]:
# Columns created by this cell. They are excluded from the row sum below so that
# re-running the cell never folds previously engineered values into num_active_feat.
engineered_cols = ['num_branches', 'num_active_feat', 'has_benzene_ring', 'num_double_bonds']
raw_feature_cols = [col for col in df_all.columns
                    if col != 'smiles' and col not in engineered_cols]

# Each '(' in a SMILES string opens one branch.
num_branches = df_all.smiles.apply(lambda s: s.count('('))
# Row-wise sum over the original numeric feature columns only. The text column is left out
# explicitly because summing rows that mix strings and numbers raises on recent pandas.
num_active_feat = df_all[raw_feature_cols].sum(axis=1, numeric_only=True)
# True when the literal substring 'c1ccccc1' (an unsubstituted aromatic six-ring) occurs.
# NOTE(review): rings written with substituents or other ring-closure digits are not matched.
has_benzene_ring = df_all.smiles.apply(lambda s: 'c1ccccc1' in s)
# '=' denotes a double bond in SMILES.
num_double_bonds = df_all.smiles.apply(lambda s: s.count('='))
# In SMILES '#' is a triple bond and '$' is a quadruple bond.
# num_triple_bonds = df_all.smiles.apply(lambda s: s.count('#'))
# num_quad_bonds = df_all.smiles.apply(lambda s: s.count('$'))

# Each Series was derived from df_all, so it shares df_all's index and can be assigned directly.
df_all['num_branches'] = num_branches
df_all['num_active_feat'] = num_active_feat
df_all['has_benzene_ring'] = has_benzene_ring
df_all['num_double_bonds'] = num_double_bonds
# df_all['num_triple_bonds'] = num_triple_bonds
# df_all['num_quad_bonds'] = num_quad_bonds

In [None]:
# Preview the first rows, now including the engineered feature columns.
df_all.head()

In [None]:
# Drop the 'smiles' text column before converting the frame to an array.
df_all = df_all.drop(['smiles'], axis=1)
vals = df_all.values
# The first test_idx rows are the training examples; the remaining rows are the test examples.
X_train = vals[:test_idx]
X_test = vals[test_idx:]
# print() with one pre-formatted string gives the same output on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("Train features: {}".format(X_train.shape))
print("Train gap: {}".format(Y_train.shape))
print("Test features: {}".format(X_test.shape))

In [None]:
def write_to_file(filename, predictions):
    """Write predictions to ``filename`` as a two-column CSV.

    The file starts with the header ``Id,Prediction``. Each prediction then
    follows on its own line, paired with a 1-based row id in iteration order.
    Any existing file at ``filename`` is overwritten.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        for row_id, value in enumerate(predictions, 1):
            out.write(",".join((str(row_id), str(value))) + "\n")

## I want to use [PCA](http://sebastianraschka.com/Articles/2014_pca_step_by_step.html) to capture some of the interdependencies between features, and then use the reduced data set to improve the regression.

## Looking into things, it appears that it may be a good idea to do GridSearchCV over a pipelined regression where we compute the principal components and then fit a regression within the cross-validation construct.

In [None]:
# Hyper-parameter candidates: number of principal components kept and Lasso penalty strength.
n_components = [15, 30, 45]
alphas = [0.01, 0.1, 1]
# Parallel workers and cross-validation folds used by the grid search.
n_jobs, n_folds = 3, 3

# Two-step pipeline: project onto principal components, then fit a Lasso on the projection.
pca = PCA()
lasso = Lasso()
pipe = Pipeline(steps=[('pca', pca), ('lasso', lasso)])

# '<step>__<param>' keys route each candidate list to the matching pipeline step.
param_grid = {
    'pca__n_components': n_components,
    'lasso__alpha': alphas,
}
estimator = GridSearchCV(pipe, param_grid, n_jobs=n_jobs, cv=n_folds)
estimator.fit(X_train, Y_train)

# predict() delegates to the best pipeline found, refit on all of X_train (GridSearchCV default).
pca_lasso_pred = estimator.predict(X_test)

In [None]:
# Save the PCA + Lasso test predictions as an Id,Prediction CSV.
write_to_file("pcaLasso_TWK_1Feb.csv", pca_lasso_pred)

## The hope is to investigate several methods of regression... Like potentially ElasticNet or Lasso regression with Cross-validation... Lots of ideas.

## The idea here is that I'll train an SVR (a type of SVM used for regression) with a grid-search cross-validation on top of it, so as to get the best RMSE using that method.

### I'm using the set-up used in HW 3 from data science.

In [None]:
from sklearn.svm import SVR
try:
    # scikit-learn >= 0.18 provides GridSearchCV in model_selection
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # legacy location, removed in scikit-learn 0.20
    from sklearn.grid_search import GridSearchCV


In [None]:
svr = SVR()
# Candidate values for the SVR hyper-parameters. Only Cs is searched by the active
# parameter grid; kernels and eps are kept for the wider (commented-out) grid.
# Cs=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
# Cs=[0.001, 0.01, 0.1, 1.0]
Cs = [1.0]
kernels = ['linear', 'rbf']
eps = [0.05, 0.1, 0.15, 0.2]
# Build the matrices from df_all, the only feature frame this notebook creates.
# `reducedf_RFfeatimport` is not defined by any cell, so referencing it raises
# NameError on a fresh kernel.
# TODO: if a random-forest feature-importance reduction is wanted here, add the cell
# that builds the reduced frame and slice that frame instead.
vals = df_all.values
X_train = vals[:test_idx]
X_test = vals[test_idx:]

In [None]:
# Cross-validated grid search over the SVR hyper-parameters.
# parameters = {"C": Cs, 'kernel': kernels, 'epsilon': eps}
parameters = {"C": Cs}
n_jobs, n_folds = 3, 5
gs = GridSearchCV(svr, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
gs.fit(X_train, Y_train)
# Newer scikit-learn exposes per-candidate results as cv_results_; grid_scores_ only
# exists on older releases, so use whichever attribute is available.
cv_summary = gs.cv_results_ if hasattr(gs, "cv_results_") else gs.grid_scores_
# Single pre-formatted string: prints the same on Python 2 and Python 3.
print("BEST {} {} {}".format(gs.best_params_, gs.best_score_, cv_summary))
# With the default refit=True, best_estimator_ has already been refit on all of
# X_train/Y_train, so fitting it again would only repeat that work.
best = gs.best_estimator_

In [None]:
# Predict the test rows with the SVR selected by the grid search.
SVR_1stpass = best.predict(X_test)

In [None]:
# Save the SVR test predictions as an Id,Prediction CSV.
write_to_file("SVR_1Feb16.csv", SVR_1stpass)