In [1]:
#Amy Edwards
#William Chirciu
#Final Project
#CSC 575 - Online 810
#March 17,2019

#Scikit Template based on Casey Bennett's from 2018
#importing necessary packages for model evaluation
import sys
import csv
import math
import numpy as np
from operator import itemgetter
import time

from sklearn.linear_model import LinearRegression, SGDRegressor , Ridge
from sklearn.svm import SVR 
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.externals import joblib
from sklearn.feature_selection import RFE, VarianceThreshold, SelectFromModel
from sklearn.feature_selection import SelectKBest, mutual_info_regression, mutual_info_classif, chi2
from sklearn import metrics
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import KBinsDiscretizer, scale


#Handle annoying warnings
import warnings, sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.ConvergenceWarning)


#####################
#
# Global parameters
#
#####################

target_idx=0           #Index of Target variable
cross_val=1           #Control Switch for CV
norm_target=0           #Normalize target switch
norm_features=0              #Normalize target switch
bin_cnt=2              #If bin target, this sets number of classes
feat_select=0     #Control Switch for Feature Selection
fs_type=3     #Feature Selection type (1=Stepwise Backwards Removal, 2=Wrapper Select, 3=Univariate Selection) 
feat_start=2  #Start column of features
feat_start_pred = 1 # feature start for test set
k_cnt='all'    #Number of 'Top k' best ranked features to select, only applies for fs_types 1 and 3

## get id from test
test_id = 0

#Set global model parameters
rand_st=0        #Set Random State variable for randomizing splits on runs


## Load Data

In [150]:
######################
#
# Load Data
#
#####################

file1= csv.reader(open('forRegression_jac.csv'), delimiter=',', quotechar='"')
file2= csv.reader(open('forRegressionTest_jac.csv'), delimiter=',', quotechar='"')


##########
# train
#########

#Read Header Line
header=next(file1)            

#Read data
data=[]
target=[]
for row in file1:
    #Load Target
    if row[target_idx]=='':                         #If target is blank, skip row                       
        continue
    else:
        target.append(float(row[target_idx]))       #If pre-binned class, change float to int

    #Load row into temp array, cast columns  
    temp=[]
                 
    for j in range(feat_start,len(header)):
        if row[j]=='':
            temp.append(float())
        else:
            temp.append(float(row[j]))

    #Load temp into Data array
    data.append(temp)

#Test Print
print(header)
print(len(target),len(data))
print('\n')

data_np=np.asarray(data)
target_np=np.asarray(target)


###########
# predict
##########
 
#Read Header Line
header2=next(file2)            

#Read data
data2=[]
test_id_list = []
for row in file2:
    #Load row into temp array, cast columns  
    temp2=[]
     
    for i in range(test_id, (test_id + 1)):
        if row[i]=='':
            test_id_list.append(float())
        else:
            test_id_list.append(float(row[i]))
            
    for j in range(feat_start_pred,len(header2)):
        if row[j]=='':
            temp2.append(float())
        else:
            temp2.append(float(row[j]))

    #Load temp into Data array
    data2.append(temp2)

#Test Print
print(header2)
print(len(data2))
print(len(test_id_list))
print('\n')

TestData = np.asarray(data2)


['target', 'id', 'cosine_title', 'cosine_description', 'cosine_attribute', 'query_ct', 'title_ct', 'desc_ct', 'jac_title', 'exact', 'title_query_ratio', 'desc_query_ratio', 'tq_common_ratio', 'dq_common_ratio']
74067 74067


['id', 'cosine_title', 'cosine_description', 'cosine_attribute', 'query_ct', 'title_ct', 'desc_ct', 'jac_title', 'exact', 'title_query_ratio', 'desc_query_ratio', 'tq_common_ratio', 'dq_common_ratio']
112067
112067




## Preprocessing- normalizing and feature selection

In [140]:
############################
#
# Preprocess data
#
###########################


if norm_target==1:
    #Target normalization for continuous values
    target_np=scale(target_np)

if norm_features==1:
    #Feature normalization for continuous values
    data_np=scale(data_np)


#############################
#
# Feature Selection
#
#############################


#Feature Selection
if feat_select==1:
    '''Three steps:
       1) Run Feature Selection
       2) Get lists of selected and non-selected features
       3) Filter columns from original dataset
       '''
    
    print('--FEATURE SELECTION ON--', '\n')
    
    ##1) Run Feature Selection #######
    if fs_type==1:
        #stepwise backward
        rgr = RandomForestRegressor(n_estimators=500, max_depth=None, min_samples_split=3, criterion='mse', random_state=None)
        sel = RFE(rgr, n_features_to_select=k_cnt, step=.1)
        print('Stepwise Recursive Backwards - Random Forest: ')
            
        fit_mod=sel.fit(data_np, target_np)
        print(sel.ranking_)
        sel_idx=fit_mod.get_support()      

    if fs_type==2:
        #Wrapper Select via model
    
        rgr =GradientBoostingRegressor(loss = 'ls', n_estimators = 200, learning_rate = 0.03, max_depth = 10, min_samples_split = 500, max_features = 'auto',random_state = rand_st,subsample = 0.8)
        sel = SelectFromModel(rgr, prefit=False, threshold='mean', max_features=None)
        print ('Wrapper Select - Random Forest: ')
            
        fit_mod=sel.fit(data_np, target_np)    
        sel_idx=fit_mod.get_support()

    if fs_type==3:       
    ######Only work if the Target is continuous###########
        #Univariate Feature Selection - Mutual Info Regression
        sel=SelectKBest(mutual_info_regression, k=k_cnt)
        fit_mod=sel.fit(data_np, target_np)
        print ('Univariate Feature Selection - Mutual Info: ')
        sel_idx=fit_mod.get_support()

        #Print ranked variables out sorted
        temp=[]
        scores=fit_mod.scores_
        for i in range(feat_start, len(header)):            
            temp.append([header[i], float(scores[i-feat_start])])

        print('Ranked Features')
        temp_sort=sorted(temp, key=itemgetter(1), reverse=True)
        for i in range(len(temp_sort)):
            print(i, temp_sort[i][0], ':', temp_sort[i][1])
        print('\n')

    ##2) Get lists of selected and non-selected features (names and indexes) #######
    temp=[]
    temp_idx=[]
    temp_del=[]
    for i in range(len(data_np[0])):
        if sel_idx[i]==1:                                                           #Selected Features get added to temp header
            temp.append(header[i+feat_start])
            temp_idx.append(i)
        else:                                                                       #Indexes of non-selected features get added to delete array
            temp_del.append(i)
    print('Selected', temp)
    print('Features (total/selected):', len(data_np[0]), len(temp))
    print('\n')
            
                
    ##3) Filter selected columns from original dataset #########
    header = header[0:feat_start]
    for field in temp:
        header.append(field)
    data_np = np.delete(data_np, temp_del, axis=1)                                 #Deletes non-selected features by index)


--LOW VARIANCE FILTER ON-- 

Selected ['query_ct', 'title_ct', 'desc_ct']
Features (total, selected): 8 3




## example of output from Univariate Feature Selection

Univariate Feature Selection - Mutual Info: 
Ranked Features
0 cosine_title : 0.043382674792738385
1 cosine_description : 0.03819413882648881
2 query_ct : 0.010914201602489904
3 cosine_attribute : 0.0008147022271858262
4 title_ct : 7.134835456046318e-05
5 desc_ct : 0.0

## Train/Test model split

In [151]:
#############################################################################
#
# Train SciKit Models
#
##########################################

#Test/Train split - if needed
data_train, data_test, target_train, target_test = train_test_split(data_np, target_np, test_size=0.35)

## Model training without CV

This next set of marked down code shows how we tested our model without cross validation. These were the models we used

####Regressors####
if cross_val==0:
    #SciKit Decision Tree Regressor
    rgr = DecisionTreeRegressor(criterion='friedman_mse', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None, random_state=rand_st)
    rgr.fit(data_train, target_train)

    scores_RMSE = math.sqrt(metrics.mean_squared_error(target_test,rgr.predict(data_test)))
    print('Decision Tree RMSE:', scores_RMSE)
    scores_Expl_Var = metrics.explained_variance_score(target_test,rgr.predict(data_test))
    print('Decision Tree Expl Var:', scores_Expl_Var)
    
    #SciKit Bagging Regressor
    start_ts=time.time()
    dgr = DecisionTreeRegressor(criterion='mse', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None, random_state=rand_st) 
    bag = BaggingRegressor(dgr, max_samples = 0.6, random_state = rand_st)
    bag.fit(data_train, target_train)

    scores_RMSE = math.sqrt(metrics.mean_squared_error(target_test,bag.predict(data_test)))
    print('Decision Bag Tree RMSE:', scores_RMSE)
    scores_Expl_Var = metrics.explained_variance_score(target_test,bag.predict(data_test))
    print('Decision Bag Tree Expl Var:', scores_Expl_Var)
    
    #SciKit Decision Tree Regressor
    fgr = RandomForestRegressor(criterion='friedman_mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None, random_state=rand_st)
    fgr.fit(data_train, target_train)

    scores_RMSE = math.sqrt(metrics.mean_squared_error(target_test,rgr.predict(data_test)))
    print('RF RMSE:', scores_RMSE)
    scores_Expl_Var = metrics.explained_variance_score(target_test,rgr.predict(data_test))
    print('RF Expl Var:', scores_Expl_Var)

Here we see the output from the models above. The RMSE is rather high for all of them, so we did not keep them when picking a best model

Out[]:
Decision Tree RMSE: 0.690063227758232
Decision Tree Expl Var: -0.6678101231732545

Decision Bag Tree RMSE: 0.5353676542726318
Decision Bag Tree Expl Var: -0.0036655880468252633

RF RMSE: 0.690063227758232
RF Expl Var: -0.6678101231732545

ExtraTrees() is another method of feature importance that we used.

model = ExtraTreeRegressor()
model.fit(data_train, target_train)
print(model.feature_importances_)

This is the output from ExtraTrees. You can see how many of the features were effecting the model similarily.

['cosine_title', 'cosine_description', 'cosine_attribute', 'query_ct', 'title_ct', 'desc_ct']
[0.24507603           0.24542375           0.03545412       0.04614564 0.16035267 0.26754779]

## Model Training with CV

These are the other models we tried with cross validation. We did Random Forest, Decision Tree, Gradient Boosting, Ada Boosting Linear Regression, Decision Tree with Bagging and Neural Networks.

####Cross-Val Regressors####
   
if cross_val==1:
    #Setup Crossval regression scorers
    scorers = {'Neg_MSE': 'neg_mean_squared_error', 'expl_var': 'explained_variance'} 
    
    #SciKit Random Forest Regressor - Cross Val
    start_ts=time.time()
    model_Forest = RandomForestRegressor(n_estimators = 100, max_features = 0.33, max_depth = None, random_state = rand_st, min_samples_split = 3) 
    scores = cross_validate(model_Forest, data_np, y = target_np, scoring = scorers, cv = 5)
    scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']])                                       #Turns negative MSE scores into RMSE
    scores_Expl_Var = scores['test_expl_var']
    print("Random Forest RMSE:: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()), (scores_RMSE.std() * 2)))
    print("Random Forest Expl Var: %0.2f (+/- %0.2f)" % ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
    print("CV Runtime:", time.time()-start_ts)
    
    #SciKit Decision Tree Regressor - Cross Val
    start_ts=time.time()
    model_Decision = DecisionTreeRegressor(criterion='mse', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None, random_state=rand_st)
    scores = cross_validate(model_Decision, data_np, target_np, scoring=scorers, cv=5)
    scores_RMSE = np.asarray([math.sqrt(-x) for x in scores ['test_Neg_MSE']]) #Turns negative MSE scores into RMSE
    scores_Expl_Var = scores['test_expl_var']
    print("Decision Tree RMSE:: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()), (scores_RMSE.std() * 2)))
    print("Decision Tree Expl Var: %0.2f (+/- %0.2f)" % ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
    print("CV Runtime:", time.time()-start_ts)
    
    #SciKit Bagging Regressor
    start_ts=time.time()
    dgr = DecisionTreeRegressor(criterion='mse', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None, random_state=rand_st) 
    bag = BaggingRegressor(dgr, max_samples = 0.6, random_state = rand_st)
    scores = cross_validate(bag, data_np, target_np, scoring=scorers, cv=5)

    scores_RMSE = np.asarray([math.sqrt(-x) for x in scores ['test_Neg_MSE']]) #Turns negative MSE scores into RMSE
    scores_Expl_Var = scores['test_expl_var']
    print("Decision Bag Tree RMSE:: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()), (scores_RMSE.std() * 2)))
    print("Decision Bag Tree Expl Var: %0.2f (+/- %0.2f)" % ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
    print("CV Runtime:", time.time()-start_ts)
    
    #SciKit Ada Boosting - Cross Val
    start_ts=time.time()
    model_Ada=AdaBoostRegressor(loss = 'linear', n_estimators = 100, learning_rate = 0.5, base_estimator = None, random_state = rand_st)
    scores = cross_validate(model_Ada, data_np, y = target_np, scoring = scorers, cv = 5)
    scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']])   #Turns negative MSE scores into RMSE                                    
    scores_Expl_Var = scores['test_expl_var']
    print("Ada Boosting RMSE: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()), (scores_RMSE.std()* 2)))
    print("Ada Boosting Expl Var: %0.2f (+/- %0.2f)" % ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
    print("CV Runtime:", time.time()-start_ts)


#SciKit Decision Tree Regressor - Cross Val
if cross_val==1:
    scorers = {'Neg_MSE': 'neg_mean_squared_error', 'expl_var': 'explained_variance'} 
    
    start_ts=time.time()
    model_Linear = LinearRegression()
    scores = cross_validate(model_Linear, data_np, target_np, scoring=scorers, cv=5)
    scores_RMSE = np.asarray([math.sqrt(-x) for x in scores ['test_Neg_MSE']]) #Turns negative MSE scores into RMSE
    scores_Expl_Var = scores['test_expl_var']
    print("Linear RMSE:: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()), (scores_RMSE.std() * 2)))
    print("Linear Expl Var: %0.2f (+/- %0.2f)" % ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
    print("CV Runtime:", time.time()-start_ts)

Here are the outputs for the various models for this instance of running. 

Linear RMSE:: 0.51 (+/- 0.02)
Linear Expl Var: 0.10 (+/- 0.04)
CV Runtime: 0.05126190185546875

Out[]:
Random Forest RMSE:: 0.52 (+/- 0.02)
Random Forest Expl Var: 0.07 (+/- 0.05)

Decision Tree RMSE:: 0.70 (+/- 0.02)
Decision Tree Expl Var: -0.76 (+/- 0.11)
CV Runtime: 1.98085618019104

Gradient Boosting RMSE: 0.50 (+/- 0.02)
Gradient Boosting Expl Var: 0.11 (+/- 0.05)
CV Runtime: 13.127336740493774

Ada Boosting RMSE: 0.51 (+/- 0.01)
Ada Boosting Expl Var: 0.10 (+/- 0.03)
CV Runtime: 8.59136414527893

    #SciKit Neural Network - Cross Val
    start_ts=time.time()
    model_NN=MLPRegressor(activation='logistic', solver = 'lbfgs', max_iter= 1000, hidden_layer_sizes = (10,), alpha = 0.0001, random_state = rand_st)
    scores = cross_validate(model_NN, data_np, y = target_np, scoring = scorers, cv = 5)

    scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']])
                 #Turns negative MSE scores into RMSE                                             
    scores_Expl_Var = scores['test_expl_var']
    print("Neural Net RMSE: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()), (scores_RMSE.std()* 2)))
    print("Neural Net Expl Var: %0.2f (+/- %0.2f)" % ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
    print("CV Runtime:", time.time()-start_ts)

Out[]:
Neural Net RMSE: 0.51 (+/- 0.02)
Neural Net Expl Var: 0.10 (+/- 0.05)
CV Runtime: 21.232121229171753

#SciKit SVM - Cross Val
    start_ts=time.time()
    model_SVR =SVR(kernel = "linear", C = 1.0, gamma = 0.1)
    scores=cross_validate(model_SVR, data_np, y = target_np, scoring = scorers, cv = 5)                                                                                                 
    scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']])                                       #Turns negative MSE scores into RMSE
    scores_Expl_Var = scores['test_expl_var']
    print(scores_RMSE)
    print("SVM RMSE: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()), (scores_RMSE.std() * 2)))
    print("SVM Expl Var: %0.2f (+/- %0.2f)" % ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
    print("CV Runtime:", time.time()-start_ts)

Out[]: (past data, took too long on current data)
[0.51416566 0.50801402 0.50588603 0.51958252 0.55598427]
SVM RMSE: 0.52 (+/- 0.04)
SVM Expl Var: 0.06 (+/- 0.04)
CV Runtime: 631.560005903244   

# Final Model

In [178]:
if cross_val==1:
    #Setup Crossval regression scorers
    scorers = {'Neg_MSE': 'neg_mean_squared_error', 'expl_var': 'explained_variance'} 
    #SciKit Gradient Boosting - Cross Val
    start_ts=time.time()
    #model= GradientBoostingRegressor(loss = 'ls', n_estimators = 200, learning_rate = 0.03, max_depth = 10, min_samples_split = 500, max_features = 'auto',random_state = rand_st,subsample = 0.8)
    #model = Ridge(alpha=1, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto', random_state=rand_st)
    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,random_state=rand_st, loss='ls')
    scores = cross_validate(model, data_np, y = target_np, scoring = scorers, cv = 5)
    scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']])                                       #Turns negative MSE scores into RMSE
    scores_Expl_Var = scores['test_expl_var']
    print("Gradient Boosting RMSE: %0.4f (+/- %0.4f)" % ((scores_RMSE.mean()), (scores_RMSE.std() * 2)))
    print("Gradient Boosting Expl Var: %0.4f (+/- %0.4f)" % ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
    print("CV Runtime:", time.time()-start_ts)

Gradient Boosting RMSE: 0.4821 (+/- 0.0197)
Gradient Boosting Expl Var: 0.1806 (+/- 0.0522)
CV Runtime: 448.75373458862305


Now we have to use the model to make predictions on the given test data and save it to a csv file for ranking in Kaggle

In [179]:
# fit final GB model
model.fit(data_train, target_train)
# make a prediction
y_new = model.predict(TestData)

In [180]:
# show the inputs and predicted outputs
with open('test_predict.csv', 'w') as f:
    f.write("id,relevance\n")
    for i in range(len(TestData)):
        #print("X=%s, Predicted=%s" % (int(test_id_list[i]), y_new[i]))
        f.write("%d, %.3f\n" % (int(test_id_list[i]), y_new[i]))
        
f.close()