In [1]:
import pandas as pd

X_train = pd.read_csv('X_train-MajInt.csv').drop(['Unnamed: 0'],axis=1)
Y_train = pd.read_csv('Y_train-MajInt.csv').drop(['Unnamed: 0'],axis=1)
X_validation = pd.read_csv('X_validation-MajInt.csv').drop(['Unnamed: 0'],axis=1)
Y_validation = pd.read_csv('Y_validation-MajInt.csv').drop(['Unnamed: 0'],axis=1)

print X_train.shape, Y_train.shape
print X_train.columns
print Y_train.columns

#Construct DataFrames for a BaseCase Model
dropcols = X_train.columns[range(7,42)]

X_train = X_train.drop(dropcols,axis=1)
X_validation = X_validation.drop(dropcols,axis=1)

print X_train.columns

(130185, 42) (130185, 1)
Index([u'FDA_Alerts', u'UserRating', u'Useful_Reviews', u'NReviews',
       u'Moderate', u'Minor', u'DurationCategory', u'CC', u'CD', u'DT', u'EX',
       u'FW', u'IN', u'JJ', u'JJR', u'JJS', u'LS', u'MD', u'NN', u'NNS',
       u'NNP', u'NNPS', u'PDT', u'POS', u'PRP', u'RB', u'RBR', u'RBS', u'RP',
       u'TO', u'UH', u'VB', u'VBD', u'VBG', u'VBN', u'VBP', u'VBZ', u'WDT',
       u'WP', u'WRB', u'Polarity', u'Subjectivity'],
      dtype='object')
Index([u'Major'], dtype='object')
Index([u'FDA_Alerts', u'UserRating', u'Useful_Reviews', u'NReviews',
       u'Moderate', u'Minor', u'DurationCategory'],
      dtype='object')


In [None]:
print X_train.columns

In [2]:
# Import Statements
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [3]:
#Spotcheck Algorithms
validation_size = 0.2
seed = 11

num_folds = 10
scoring = 'neg_mean_squared_error'

models = []
models.append(('LR1',LinearRegression()))
models.append(('LASSO',Lasso()))
models.append(('EN',ElasticNet()))
models.append(('KNN',KNeighborsRegressor()))
models.append(('CART',DecisionTreeRegressor()))
# models.append(('SVR',SVR()))

results = []
names = []
print 'ok'
for name,model in models:
#     print name, model
    kfold = KFold(n_splits=num_folds,random_state=seed)
    cv_results = cross_val_score(model,X_train,Y_train,cv=kfold,scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(),cv_results.std())
    print msg

print ""
print "ok_end"

ok
LR1: -11788.851902 (141.989698)
LASSO: -11789.206083 (141.869877)
EN: -11795.254989 (140.124445)
KNN: -192.939819 (44.553414)
CART: -4.449333 (4.091841)

ok_end


In [5]:
import matplotlib.pyplot as plt
%matplotlib tk

#Compare Algorithm Responses
fig = plt.figure()
fig.suptitle("Comparison of Algorithms (Base Case)",fontsize=20,fontweight='bold')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
ax.tick_params(axis='both',labelsize=15)
# ax.set_xticklabels(ax.get_xticks(),fontweight='bold')
# ax.set_yticklabels(ax.get_yticks(),fontweight='bold')
plt.xlabel('Algorithm',size=20,fontweight='bold')
plt.ylabel('Neg Mean Squared',size=20,fontweight='bold')
plt.show()

In [6]:
#Standardize the Data
pipelines = []

pipelines.append(('ScaledLR',Pipeline([('Scaler',StandardScaler()),('LR',LinearRegression())])))
pipelines.append(('ScaledLASSO',Pipeline([('Scaler',StandardScaler()),('LASSO',Lasso())])))
pipelines.append(('ScaledEN',Pipeline([('Scaler',StandardScaler()),('EN',ElasticNet())])))
# pipelines.append(('ScaledKNN',Pipeline([('Scaler',StandardScaler()),('KNN',KNeighborsRegressor())])))
pipelines.append(('ScaledCART',Pipeline([('Scaler',StandardScaler()),('CART',DecisionTreeRegressor())])))

results = []
names = []
print 'ok'
for name,model in pipelines:
    kfold = KFold(n_splits=num_folds,random_state=seed)
    cv_results = cross_val_score(model,X_train,Y_train,cv=kfold,scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(),cv_results.std())
    print msg
    
print 'ok_end'

ok
ScaledLR: -11788.851902 (141.989698)
ScaledLASSO: -11794.243203 (140.986427)
ScaledEN: -12199.006648 (129.293949)
ScaledCART: -5.893337 (5.413949)
ok_end


In [7]:
#Compare Algorithm Responses
fig = plt.figure()
fig.suptitle("Comparison of Algorithms (Base Case)",fontsize=20,fontweight='bold')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
ax.tick_params(axis='both',labelsize=15)
# ax.set_xticklabels(ax.get_xticks(),fontweight='bold')
# ax.set_yticklabels(ax.get_yticks(),fontweight='bold')
plt.xlabel('Algorithm',size=20,fontweight='bold')
plt.ylabel('Neg Mean Squared',size=20,fontweight='bold')
plt.show()

In [8]:
#Ensemble Methods
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor

ensembles = []
ensembles.append((' ScaledAB',Pipeline([('Scaler',StandardScaler()),('AB',AdaBoostRegressor())])))
# ensembles.append(('ScaledGBM',Pipeline([('Scaler',StandardScaler()),('GBM',GradientBoostingRegressor())])))
ensembles.append(('ScaledRF',Pipeline([('Scaler',StandardScaler()),('RF',RandomForestRegressor())])))
ensembles.append(('ScaledET',Pipeline([('Scaler',StandardScaler()),('ET',ExtraTreesRegressor())])))

seed = 7

results = []
names = []
for name, model in ensembles:
    kfold = KFold(n_splits=num_folds, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    
print 'ok_end'

  y = column_or_1d(y, warn=True)


 ScaledAB: -9758.741172 (365.813057)


  self._final_estimator.fit(Xt, y, **fit_params)


ScaledRF: -6.019589 (4.451734)
ScaledET: -4.098129 (3.542409)
ok_end


In [9]:
#Compare Algorithm Responses
fig = plt.figure()
fig.suptitle("Comparison of Algorithms (Base Case)",fontsize=20,fontweight='bold')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
ax.tick_params(axis='both',labelsize=15)
# ax.set_xticklabels(ax.get_xticks(),fontweight='bold')
# ax.set_yticklabels(ax.get_yticks(),fontweight='bold')
plt.xlabel('Algorithm',size=20,fontweight='bold')
plt.ylabel('Neg Mean Squared',size=20,fontweight='bold')
plt.show()

In [None]:
#Fine Tune Extra_Trees
import numpy as np

from sklearn.model_selection import GridSearchCV
seed = 11
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = dict(n_estimators=np.array([10,50,100,500]))
model = ExtraTreesRegressor(random_state=seed)
kfold = KFold(n_splits=num_folds,random_state=seed)
grid = GridSearchCV(estimator=model,param_grid=param_grid,scoring=scoring,cv=kfold)
print 'ok'
grid_result = grid.fit(rescaledX,Y_train)

print("Best: %f using %s" % (grid_result.best_score_,grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means,stds,params):
    print("%f (%f) with: %r" % (mean,stdev,param))



In [None]:
param_grid
print("Best: %f using %s" % (grid_result.best_score_,grid_result.best_params_))

In [None]:
#Execute the final model: ExtraTrees(n_estimators:100) on the validation set
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

seed = 13
ET = ExtraTreesRegressor(n_estimators=100,random_state=seed)
ET.fit(X_train,Y_train)
predictions = ET.predict(X_validation)
print(mean_squared_error(Y_validation,predictions))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import explained_variance_score

# print(explained_variance_score(Y_validation,predictions))
# print(confusion_matrix(Y_validation,predictions))
# print(classification_report(Y_validation,predictions))
