# This notebook outlines the final model tuning and set of predictions that ML Marauders have made for CS 181 Practical 1

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import mean_squared_error

## Load and gently process the data (much of the preprocessing was done in FINAL.ipynb)

In [None]:
# Feature-engineered train/test tables used for the final models.
# (An earlier variant read 'sam_data/rdk_feat_eng_whole_df_train_orig_features.csv'
# and 'sam_data/rdk_feat_eng_whole_df_test_orig_features.csv' instead.)
train_csv = 'final_data/FINAL_train.csv'
test_csv = 'final_data/FINAL_test.csv'

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

# Display the first rows of the training table as the cell output
df_train.head()

In [None]:
# Standardize every test-table column that is NOT in the exclusion list below:
# the 256 'feat_XXX' columns, the three 'has_*' flags, and 'smiles' / 'Id'.
binary_cols = ['feat_%03d' % ii for ii in range(1, 257)] + [
    'has_benzothiophene',
    'has_carbazole',
    'has_fluorene',
    'smiles',
    'Id',
]
non_binary_cols = np.array([column for column in df_test.columns if column not in binary_cols])

# Fit the scaler on the training rows only, then apply the same statistics to the test rows
scaler = StandardScaler()
df_train[non_binary_cols] = scaler.fit_transform(df_train[non_binary_cols])
df_test[non_binary_cols] = scaler.transform(df_test[non_binary_cols])

In [None]:
# Remove 'smiles' from the training table and 'Id' from the test table
df_train = df_train.drop(['smiles'], axis=1)
df_test = df_test.drop(['Id'], axis=1)

# Keep the regression target ('gap') aside, then remove it from the feature table
Y_train = df_train['gap'].values
df_train = df_train.drop(['gap'], axis=1)

# Plain arrays for scikit-learn
X_train, X_test = df_train.values, df_test.values

# Report the resulting shapes
print(" ".join(["Train features:", str(X_train.shape), "Train gap:", str(Y_train.shape)]))
print(" ".join(["Test features:", str(X_test.shape)]))

In [None]:
# Hold out 30% of the training rows as a validation set; the fixed seed keeps the split repeatable
(cross_X_train, cross_X_valid,
 cross_Y_train, cross_Y_valid) = train_test_split(
    X_train, Y_train, test_size=0.3, random_state=42)

#### For classification purposes, round target values to nearest .25

In [None]:
# Turn the continuous targets into class labels by snapping them to a 0.25 grid.
# (Coarser grids sketched earlier in this cell: nearest integer, nearest 0.5.)
def _snap_to_quarter(values):
    """Round each value to the nearest multiple of 0.25."""
    return np.round(4*values)/4.0

Y_clf_train = _snap_to_quarter(cross_Y_train)
Y_clf_valid = _snap_to_quarter(cross_Y_valid)
Y_full_clf_train = _snap_to_quarter(Y_train)

In [None]:
# Report the shapes of the training / validation feature matrices
print(" ".join(["'Training' features: ", str(cross_X_train.shape)]))
print(" ".join(["'Validate' features: ", str(cross_X_valid.shape)]))

# GOAL:

This notebook is set up to chain together classification and regression methods. The idea is that, once both models are trained, we first apply a classifier to the data (in a clustering kind of sense) and then use the category or neighborhood that each sample is assigned as an additional feature for the regression. Here the category or label is the gap value rounded to the nearest 0.25. The idea behind this is to hijack the regression into a local region of the expected HOMO-LUMO gap based on the label. The hope is that this will pin the regressor closer to the right value.

It's imperative that we get as accurate a classifier as we can.

Fingers crossed.

### First: Let's build a classifier that will adequately label the samples

We'll start with Logistic Regression and try to fit the best model using a collection of C values

In [None]:
%%time

logReg_training_acc = 0
logReg_test_acc = 0

Cs = [0.001, 0.01, 0.1, 1.0, 10.0]

for c in Cs:
    clf_logReg=LogisticRegression(penalty="l2",C=c, solver='lbfgs')
    clf_logReg.fit(cross_X_train,Y_clf_train)
    training_acc = clf_logReg.score(cross_X_train,Y_clf_train)
    test_acc = clf_logReg.score(cross_X_valid,Y_clf_valid)
    print c, test_acc
    if logReg_test_acc < test_acc:
        logReg_test_acc = test_acc
        logReg_training_acc = training_acc
        best_logReg = clf_logReg

In [None]:
# Re-score the selected classifier on both splits.
# Note: the "Test" figure printed below is computed on the validation split.
logReg_training_acc, logReg_test_acc = (
    best_logReg.score(cross_X_train, Y_clf_train),
    best_logReg.score(cross_X_valid, Y_clf_valid),
)
print("Training Accuracy: %0.3f" % logReg_training_acc)
print("Test Accuracy: %0.3f" % logReg_test_acc)

#### Concatenate predicted labels onto test/validation set

In [None]:
# Append the class label as one extra trailing feature column for the regressors.
#
# Training rows get their true (rounded) label. Validation rows must get the
# label PREDICTED by the classifier, exactly as unseen test rows will: feeding
# the true rounded gap into the validation features leaks the target and makes
# the validation RMSE look better than what the pipeline can achieve.
Y_clf_pred = best_logReg.predict(cross_X_valid)
X_train_clf = np.column_stack((cross_X_train, Y_clf_train))
X_valid_clf = np.column_stack((cross_X_valid, Y_clf_pred))

## Now onto ExtraTrees, fronted with PCA

In [None]:
%%time
pcaExtraTrees_RMSE = 100

pca_components = [30, 35, 40, 45]
num_estimators = [50, 100, 200]

for comps in pca_components:
    pca = PCA(n_components=comps)
    X_train_tr = pca.fit_transform(X_train_clf)
    X_valid_tr = pca.transform(X_valid_clf)
    
    for n_estimators in num_estimators:
        
        extratrees_clf = ExtraTreesRegressor(n_estimators=n_estimators,n_jobs=2)
        extratrees_clf.fit(X_train_tr,cross_Y_train)
        y_pred = extratrees_clf.predict(X_valid_tr)
        
        RMSE = np.sqrt(mean_squared_error(cross_Y_valid,y_pred))
        if RMSE < pcaExtraTrees_RMSE:
            print comps, n_estimators
            print RMSE
            pcaExtraTrees_RMSE = RMSE
            pcaExtraTrees_estimators = n_estimators
            pcaExtraTrees_components = comps
            best_pcaExtraTrees = extratrees_clf
            
print "PCA with {0} components chained ExtraTrees with {1} estimators had RMSE of {2}".format(pcaExtraTrees_components,pcaExtraTrees_estimators,pcaExtraTrees_RMSE)        

## We're also going to tune a Ridge Regression to have double coverage

In [None]:
# Sweep 30 log-spaced alphas between 1e-4 and 1e1 and keep the Ridge model
# with the lowest RMSE on the validation split.
alphas = np.logspace(-4, 1, 30)
ridge_RMSE = 100

for alpha in alphas:
    candidate = Ridge(alpha=alpha)
    candidate.fit(X_train_clf, cross_Y_train)
    valid_pred = candidate.predict(X_valid_clf)
    valid_rmse = np.sqrt(mean_squared_error(cross_Y_valid, valid_pred))

    # Skip anything that is not a strict improvement on the best score so far
    if not (valid_rmse < ridge_RMSE):
        continue
    ridge_RMSE, ridge_alpha, best_ridge = valid_rmse, alpha, candidate

print("Ridge RMSE: {0} with alpha: {1}".format(ridge_RMSE, ridge_alpha))

## Train on full training set, run on full test set

Here we will train the classifier and the regressions

In [None]:
# Refit the selected classifier on every training row
best_logReg.fit(X_train, Y_full_clf_train)

# Append the (rounded) true labels to the full TRAINING set as an extra feature column
X_full_train_clf = np.vstack((X_train.T, Y_full_clf_train)).T

# Refit PCA + ExtraTrees on the full training set.
# Use the component count selected during tuning rather than a hard-coded 45,
# so the final model matches the configuration that actually won on validation.
pca = PCA(n_components=pcaExtraTrees_components)
X_full_train_tr = pca.fit_transform(X_full_train_clf)
best_pcaExtraTrees.fit(X_full_train_tr, Y_train)

# Refit the selected Ridge model on the full (un-projected) augmented training set
best_ridge.fit(X_full_train_clf, Y_train)

Next we will run the classifier on the full test set (to get category assignments) and concatenate the predicted label to the test set as an additional feature.

In [None]:
# Predict a class label for every test row with the fitted classifier
label_pred = best_logReg.predict(X_test)

# Attach those predictions to the test matrix as one additional trailing column
X_test_clf = np.column_stack((X_test, label_pred))

Now, run the two regressions on the augmented test set

In [None]:
# Run ExtraTrees Regressor.
# The regressor was fit on PCA-projected features, so the augmented test matrix
# must go through the same fitted PCA first; predicting on the raw matrix would
# hand the model the wrong number of features.
X_test_tr = pca.transform(X_test_clf)
pcaExtraTrees_pred = best_pcaExtraTrees.predict(X_test_tr)

# Run Ridge Regressor (fit on the un-projected augmented features)
ridge_pred = best_ridge.predict(X_test_clf)

Save the data

In [None]:
def write_to_file(filename, predictions):
    """Write predictions to `filename` as a two-column CSV.

    The file starts with the header line "Id,Prediction". Each following line
    holds a 1-based row id and the string form of the matching prediction.
    An existing file at `filename` is overwritten.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            ",".join((str(row_id), str(value))) + "\n"
            for row_id, value in enumerate(predictions, 1)
        )

In [None]:
# Write one submission file per model: ExtraTrees first, then Ridge
submissions = (
    ("pca_extraTrees_FINAL_TWK_10Feb.csv", pcaExtraTrees_pred),
    ("ridge_FINAL_TWK_10Feb.csv", ridge_pred),
)
for out_name, model_pred in submissions:
    write_to_file(out_name, model_pred)