In [15]:
import csv
import time
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
import xgboost as xgb
from xgboost import XGBClassifier

def normalize(x):
    '''This function nomalizes each columns of the input 2d array.'''
    x_mean = np.mean(x, axis=0)
    x_std = np.std(x, axis=0)
    x_std[x_std == 0] = 1
    x1 = (x - x_mean) / x_std
    return x1

def addFeature (X_train, X_test1, X_test2, add_feature):
    addFeatureLen = len(add_feature)
    for i1 in range (addFeatureLen):
        for i2 in range (i1,addFeatureLen, 1):
            newFeature1 = np.array(X_train[:,i1]*X_train[:,i2]).reshape(-1,1)
            #print (np.shape(newFeature1))
            X_train = np.hstack((X_train, newFeature1))
            newFeature2 = np.array(X_test1[:,i1]*X_test1[:,i2]).reshape(-1,1)
            X_test1 = np.hstack((X_test1, newFeature2))
            newFeature3 = np.array(X_test2[:,i1]*X_test2[:,i2]).reshape(-1,1)
            X_test2 = np.hstack((X_test2, newFeature3))
    return (X_train, X_test1, X_test2)

def selectFeature(X_train, y_train, x_test1, x_test2,alpha1):
    '''This function select the features of normalized data (i.e., np.std(X[:,j]) = 1 or 0).
    If qmin < (np.amax(X[:,j]) - np.amin(X[:,j]) < qmax, then j will be selected.'''
    reg = linear_model.Lasso(alpha = alpha1)
    reg.fit(X_train, y_train)
    keeplist = []
    feature_num = len(reg.coef_)
    for i in range(feature_num):
        if abs(reg.coef_[i])> 1e-4:
            keeplist.append(True)
        else:
            keeplist.append(False)
    #keeplist[0] = True
    keeplist = np.array(keeplist)
    #print (reg.coef_[[0, 6, 7, 39, 41]])
    #for index, value in enumerate(keeplist):
    #    if value:
    #        print (index)
    #print (reg.coef_[52])
    #dist = np.amax(X_train, axis=0) - np.amin(X_train, axis=0)
    #cols = np.all([dist > qmin, dist < qmax], axis=0)
    x_train_new = X_train[:, keeplist]
    x_test1_new = X_test1[:, keeplist]
    x_test2_new = X_test2[:, keeplist]
    return (x_train_new, x_test1_new, x_test2_new)


start = time.time()

# load the the data from the files
with open('train_2008.csv', 'r') as file1: 
    lines1 = csv.reader(file1, delimiter=',', quotechar='|') 
    next(lines1, None)
    data1 = np.array([line for line in lines1], dtype=float)

with open('test_2008.csv', 'r') as file2:
	lines2 = csv.reader(file2, delimiter=',', quotechar='"')
	next(lines2, None)
	data2 = np.array([line for line in lines2], dtype=float)

with open('test_2012.csv', 'r') as file3:
	lines3 = csv.reader(file3, delimiter=',', quotechar='"')
	next(lines3, None)
	data3 = np.array([line for line in lines3], dtype=float)

# convert the data to float numpy array 
N_train = len(data1)
add_feature = [6,11,39,41,45,48,52,57,64,75,200,332,335,337,348,363,371,374,388,406,417,424,426];
alpha1 = 0.005
y_train = 2 * (data1[:, -1] - 1.5)  # maps 1 to -1, 2 to 1
X_train = normalize(data1[:, :-1])
X_train[:, 0] = 1
X_test1 = normalize(data2)
X_test1[:, 0] = 1
X_test2 = normalize(data3)
X_test2[:, 0] = 1
#qmin, qmax = 1, 100
X_train, X_test1, X_test2 = addFeature(X_train, X_test1, X_test2, add_feature)
X_train, X_test1, X_test2 = selectFeature(X_train, y_train, X_test1, X_test2, alpha1) 
d = len(X_train[0])


# train the model and calculate the scores by cross-validation
N = 550
clf1 = AdaBoostClassifier(n_estimators=N)
clf2 = GradientBoostingClassifier(n_estimators=N)
clf3 = RandomForestClassifier(n_estimators=N)
clf4 = BaggingClassifier(n_estimators=N)
clf5 = xgb.XGBClassifier(max_depth=4, silent = 1, objective = 'binary:logistic')
eclf = VotingClassifier(estimators=[ ('gb', clf2),('xgb', clf5)], voting='hard')
#eclf = VotingClassifier(estimators=[('ab', clf1), ('gb', clf2), ('rf', clf3), ('bg',clf4)], voting='hard')
clf_lst = [clf2, clf5, eclf]
name_lst = [ 'GradientBoost', 'XGB', 'Ensemble']
for clf, name in zip(clf_lst, name_lst):
    scores = cross_val_score(clf, X_train, y_train, cv=2, scoring='accuracy')
    print("Accuracy: %0.5f (+/- %0.25f) [%s]" % (scores.mean(), scores.std(), name))

# write the prediction data into the submission file
eclf.fit(X_train, y_train)
y_test1 = eclf.predict(X_test1)
print([sum(y_test1==-1), sum(y_test1==1)])
with open('submission2008.csv', 'w', newline='') as file: 
	filewriter = csv.writer(file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
	filewriter.writerow(['id', 'PES1'])
	for i, yi in enumerate(y_test1):
		filewriter.writerow([str(i), str(int(yi/2 + 1.5))])
y_test2 = eclf.predict(X_test2)
print([sum(y_test2==-1), sum(y_test2==1)])
with open('submission2012.csv', 'w', newline='') as file: 
	filewriter = csv.writer(file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
	filewriter.writerow(['id', 'PES1'])
	for i, yi in enumerate(y_test2):
		filewriter.writerow([str(i), str(int(yi/2 + 1.5))])


# print running time
stop = time.time()
print('The running time is ', stop - start)






Accuracy: 0.78318 (+/- 0.0004141707754557466536482) [GradientBoost]
Accuracy: 0.78173 (+/- 0.0004208989450147915256650) [XGB]
Accuracy: 0.78264 (+/- 0.0001822048386114216711462) [Ensemble]
[14098, 1902]
[72648, 10172]
The running time is  924.8110620975494


soft:
Accuracy: 0.77894 (+/- 0.0013178445819652662152066) [Adaboost]
Accuracy: 0.78116 (+/- 0.0004982269819299722790618) [GradientBoost]
Accuracy: 0.77653 (+/- 0.0003281962658751025330162) [RandomForest]
Accuracy: 0.78235 (+/- 0.0005446000823553509562203) [Ensemble]

if add GaussianNB    
Accuracy: 0.77894 (+/- 0.0013178445819652662152066) [Adaboost]
Accuracy: 0.78122 (+/- 0.0004672983497709393141406) [GradientBoost]
Accuracy: 0.77735 (+/- 0.0000274846778660076118683) [RandomForest]
Accuracy: 0.73221 (+/- 0.0117019842098985971112768) [Ensemble]

if add logit
Accuracy: 0.77894 (+/- 0.0013178445819652662152066) [Adaboost]
Accuracy: 0.78123 (+/- 0.0004827619484596423760081) [GradientBoost]
Accuracy: 0.77582 (+/- 0.0005686953276320960704027) [RandomForest]
Accuracy: 0.77375 (+/- 0.0007612267682565732052069) [LogitEnsemble]

hard but with 150 nodes
Accuracy: 0.77894 (+/- 0.0013178445819652662152066) [Adaboost]
Accuracy: 0.78112 (+/- 0.0004672997845526216664780) [GradientBoost]
Accuracy: 0.77613 (+/- 0.0000738573000307884974802) [RandomForest]
Accuracy: 0.78236 (+/- 0.0004672806541302088056966) [Ensemble]
[13890, 2110]
[71836, 10984]
The running time is  343.1007869243622

with 200 nodes with feature selection
Accuracy: 0.78023 (+/- 0.0012714332206951173276366) [Adaboost]
Accuracy: 0.78179 (+/- 0.0000430171460753525636278) [GradientBoost]
Accuracy: 0.77792 (+/- 0.0002353917172360775467155) [RandomForest]
Accuracy: 0.78259 (+/- 0.0005136685806329532866243) [Ensemble]

with 200 nodes, without feature selection
Accuracy: 0.77871 (+/- 0.0004295655046238078256238) [Adaboost]
Accuracy: 0.78184 (+/- 0.0000961566769045463232146) [GradientBoost]
Accuracy: 0.77386 (+/- 0.0001511413369744696311159) [RandomForest]
Accuracy: 0.78173 (+/- 0.0000806945129975256136845) [Ensemble]

with 200 nodes, without feature selection; add bagging as the fourth
Accuracy: 0.77871 (+/- 0.0004295655046238078256238) [Adaboost]
Accuracy: 0.78179 (+/- 0.0001734765833901752429824) [GradientBoost]
Accuracy: 0.77398 (+/- 0.0004364825871040478588725) [RandomForest]
Accuracy: 0.77676 (+/- 0.0011168484076881646238633) [Bagging]
Accuracy: 0.78131 (+/- 0.0005291522662650982589128) [Ensemble]

with 200 nodes, with feature selection; add bagging as the fourth
Accuracy: 0.78023 (+/- 0.0012714332206951173276366) [Adaboost]
Accuracy: 0.78173 (+/- 0.0000739438651921608958162) [GradientBoost]
Accuracy: 0.77676 (+/- 0.0000034520847225216755305) [RandomForest]
Accuracy: 0.77676 (+/- 0.0012096314346020098362544) [Bagging]
Accuracy: 0.78222 (+/- 0.0009466617786906827980431) [Ensemble]

Accuracy: 0.78023 (+/- 0.0012714332206951173276366) [Adaboost]
Accuracy: 0.78174 (+/- 0.0000275525908655094298183) [GradientBoost]
Accuracy: 0.77704 (+/- 0.0002439736247259460810710) [RandomForest]
Accuracy: 0.77611 (+/- 0.0002199558576598814596537) [Bagging]
Accuracy: 0.78204 (+/- 0.0009466646482541030138691) [Ensemble]
[14149, 1851]
[73040, 9780]
The running time is  1419.6259248256683

Accuracy: 0.78177 (+/- 0.0000275530691261072213649) [GradientBoost]
Accuracy: 0.78140 (+/- 0.0005291508314834714177266) [XGB]
Accuracy: 0.78289 (+/- 0.0003744894967868672708278) [Ensemble]
[13871, 2129]
[71712, 11108]
The running time is  436.3834500312805

Accuracy: 0.78112 (+/- 0.0004363721089146732623476) [GradientBoost]
Accuracy: 0.78140 (+/- 0.0005291508314834714177266) [XGB]
Accuracy: 0.78148 (+/- 0.0005446134736510344076521) [Ensemble]
[14027, 1973]
[72592, 10228]
The running time is  291.1992840766907

