In [10]:
import csv
import time

import numpy as np
from sklearn import linear_model
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
def normalize(x):
    '''Standardise every column of a 2-D array.

    Each column has its mean subtracted and is then divided by its standard
    deviation. Columns whose standard deviation is exactly 0 are divided by 1
    instead, so constant columns come out as all zeros rather than NaN.

    The input array is not modified; a new array is returned.
    '''
    center = np.mean(x, axis=0)
    scale = np.std(x, axis=0)
    # Constant columns would cause a division by zero; use a unit scale there.
    constant_cols = (scale == 0)
    scale[constant_cols] = 1
    return (x - center) / scale

def selectFeature(X_train, y_train, x_test1, x_test2, alpha=0.005, coef_tol=1e-4):
    '''Select feature columns using the coefficients of a Lasso fit.

    A ``linear_model.Lasso`` regression is fitted on ``X_train`` / ``y_train``.
    Column ``j`` is kept when ``abs(coef_[j]) > coef_tol``. Column 0 is always
    kept, whatever its coefficient is. The resulting column mask is applied to
    the training matrix and to both test matrices passed as arguments.

    Parameters:
        X_train:  2-D array of training features (rows are samples).
        y_train:  1-D array of training targets.
        x_test1:  2-D array with the same columns as ``X_train``.
        x_test2:  2-D array with the same columns as ``X_train``.
        alpha:    regularisation strength handed to ``Lasso`` (default 0.005).
        coef_tol: absolute coefficient size a column must exceed to be kept
                  (default 1e-4).

    Returns:
        Tuple ``(x_train_new, x_test1_new, x_test2_new)`` holding only the
        selected columns of the three inputs.

    Side effects:
        Prints the fitted coefficients, the total number of columns and the
        shape of the reduced training matrix.
    '''
    reg = linear_model.Lasso(alpha=alpha)
    reg.fit(X_train, y_train)

    # Boolean mask over columns: True where the Lasso coefficient is non-negligible.
    keeplist = np.abs(reg.coef_) > coef_tol
    keeplist[0] = True  # column 0 is kept unconditionally
    print (reg.coef_)

    # Index the matrices that were passed in, not same-named module globals.
    x_train_new = X_train[:, keeplist]
    x_test1_new = x_test1[:, keeplist]
    x_test2_new = x_test2[:, keeplist]
    print (len(keeplist))
    print (np.shape(x_train_new))
    return (x_train_new, x_test1_new, x_test2_new)


start = time.time()


def load_csv_matrix(path, quotechar='"'):
    '''Read a comma-separated file into a float numpy array.

    The first row is skipped (it is consumed before the remaining rows are
    converted), and every remaining field is converted to float.
    '''
    with open(path, 'r') as handle:
        rows = csv.reader(handle, delimiter=',', quotechar=quotechar)
        next(rows, None)  # discard the first row
        return np.array(list(rows), dtype=float)


# load the data from the files
# NOTE(review): the training file is read with quotechar '|' while both test
# files use '"'; this difference is kept exactly as it was - confirm it is intended.
data1 = load_csv_matrix('train_2008.csv', quotechar='|')
data2 = load_csv_matrix('test_2008.csv')
data3 = load_csv_matrix('test_2012.csv')

# Split the training matrix into targets (last column) and features (all other
# columns), then standardise the three feature matrices.
# NOTE(review): normalize() is called separately on each matrix, so each data
# set is scaled with its own column statistics - confirm this is intended.
N_train = len(data1)
y_train = 2 * (data1[:, -1] - 1.5)  # maps 1 to -1, 2 to 1
X_train = normalize(data1[:, :-1])
X_test1 = normalize(data2)
X_test2 = normalize(data3)

# Overwrite the first column of every matrix with the constant 1.
for matrix in (X_train, X_test1, X_test2):
    matrix[:, 0] = 1

# Keep only the columns chosen by the Lasso-based selection.
X_train, X_test1, X_test2 = selectFeature(X_train, y_train, X_test1, X_test2)
d = X_train.shape[1]
# NOTE(review): the alpha printed here is a literal; it is not read from the
# value used inside selectFeature, so the two must be kept in sync by hand.
print('lasso alpha', 0.005)

print ('Selected feature number: ', d)

# train the model and calculate the scores by cross-validation
# Two-fold hold-out: each classifier is fitted on one half of the training rows
# and scored on the other half.
N = 200  # passed to the classifiers as n_estimators
half = N_train // 2
X_head, y_head = X_train[:half], y_train[:half]
X_tail, y_tail = X_train[half:], y_train[half:]

clf1 = GradientBoostingClassifier(n_estimators=N)
clf1.fit(X_head, y_head)
score1 = clf1.score(X_tail, y_tail)

clf2 = GradientBoostingClassifier(n_estimators=N)
clf2.fit(X_tail, y_tail)
score2 = clf2.score(X_head, y_head)
print("The cross-validation scores are : ", score1, score2)


# write the prediction data into the submission file
def write_submission(path, labels):
    '''Write predictions to ``path`` as CSV rows of (id, PES1).

    The id is the position of the prediction in ``labels``. Each label is
    written as ``int(label / 2 + 1.5)``, which turns -1 into 1 and 1 into 2.
    '''
    with open(path, 'w', newline='') as out:
        writer = csv.writer(out, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['id', 'PES1'])
        writer.writerows(
            [str(row_id), str(int(label / 2 + 1.5))]
            for row_id, label in enumerate(labels)
        )


# NOTE(review): both submissions are predicted with clf2, which was fitted on
# the second half of the training rows only - confirm that a model fitted on
# all training rows is not wanted here.
y_test1 = clf2.predict(X_test1)
print([sum(y_test1==-1), sum(y_test1==1)])  # count of predictions per class
write_submission('submission2008.csv', y_test1)

y_test2 = clf2.predict(X_test2)
print([sum(y_test2==-1), sum(y_test2==1)])  # count of predictions per class
write_submission('submission2012.csv', y_test2)


# print running time
stop = time.time()
elapsed = stop - start
print('The running time is ', elapsed)





[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   4.50242224e-04   7.31411173e-04   6.35813420e-02   3.92576386e-02
   1.38947879e-02   0.00000000e+00  -2.33095887e-03  -2.33321745e-02
   0.00000000e+00  -0.00000000e+00   0.00000000e+00  -0.00000000e+00
   0.00000000e+00   8.88772227e-04   0.00000000e+00   0.00000000e+00
  -0.00000000e+00   1.00290266e-02  -4.67369678e-03   0.00000000e+00
   5.17590642e-03  -4.74492486e-03  -5.48342959e-03   0.00000000e+00
   0.00000000e+00  -0.00000000e+00   1.09430968e-02   1.41400323e-02
  -0.00000000e+00  -8.74104156e-04   0.00000000e+00   1.07976923e-03
  -3.67225562e-03  -1.77757438e-02  -1.12347572e-02   1.46669436e-02
   7.44388672e-03  -1.39278357e-01   1.97503122e-02   0.00000000e+00
  -6.61037234e-03  -2.96473296e-02   0.00000000e+00   0.00000000e+00
  -1.68624633e-01   9.56493687e-03  -7.33857547e-03  -2.83940775e-03
   2.85346228e-02   0.00000000e+00   0.00000000e+00   4.92783540e-03
  -1.87975777e-02   2.69791968e-03

In [None]:
lasso alpha 0.002
Selected feature number:  153
The cross-validation scores are :  0.779489082699 0.783255497479
The running time is  79.7872269153595

lasso alpha 0.0025
Selected feature number:  137
The cross-validation scores are :  0.778437557988 0.783781276096
The running time is  76.78622102737427

lasso alpha 0.005
Selected feature number:  98
The cross-validation scores are :  0.779458155502 0.784492623635
The running time is  61.63374900817871

lasso alpha 0.0075
Selected feature number:  69
The cross-validation scores are :  0.780138553844 0.784152413942
The running time is  52.70413112640381

lasso alpha 0.01
Selected feature number:  56
The cross-validation scores are :  0.778994247541 0.784863761482
The running time is  48.207842111587524