In [11]:
import csv
import time
import numpy as np 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

def normalize(x):
    """Standardize each column of the 2-D array `x`.

    Every column is shifted by its mean and divided by its standard
    deviation. A column whose standard deviation is 0 is divided by 1
    instead, so it comes out as all zeros. The input is not modified;
    a new array is returned.
    """
    scale = np.std(x, axis=0)
    constant_cols = (scale == 0)
    scale[constant_cols] = 1  # avoid 0/0 for constant columns
    centered = x - np.mean(x, axis=0)
    return centered / scale

def selectFeature(X_train, X_test1, X_test2, qmin, qmax):
    '''Keep the columns whose value range in X_train lies strictly between qmin and qmax.

    The range of column j is np.amax(X_train[:, j]) - np.amin(X_train[:, j]);
    column j is kept when qmin < range < qmax. The thresholds are meant for
    column-normalized data (np.std(X[:, j]) equal to 1 or 0).
    The column mask is computed from X_train only and then applied unchanged
    to X_test1 and X_test2, so all three outputs keep the same columns.
    Returns the tuple (x_train, x_test1, x_test2).'''
    spread = np.ptp(X_train, axis=0)
    keep = (spread > qmin) & (spread < qmax)
    return tuple(mat[:, keep] for mat in (X_train, X_test1, X_test2))

start = time.time()


def load_csv(path, quotechar='"'):
    """Read a comma-separated file with one header row into a float numpy array.

    The first row is skipped as the header; every remaining field is
    converted to float, so a non-numeric field raises ValueError.
    """
    # The csv module asks for newline='' when it is handed a file object.
    with open(path, 'r', newline='') as handle:
        reader = csv.reader(handle, delimiter=',', quotechar=quotechar)
        next(reader, None)  # skip the header row
        return np.array(list(reader), dtype=float)


# load the data from the files
# (all three files are read with the same quote character; the training file
# previously used '|' while both test files used '"')
data1 = load_csv('train_2008.csv')
data2 = load_csv('test_2008.csv')
data3 = load_csv('test_2012.csv')

# split the training data into labels and features, then standardize
N_train = len(data1)
y_train = 2 * (data1[:, -1] - 1.5)  # maps 1 to -1, 2 to 1
features_train = data1[:, :-1]      # every column except the last (the label)

# Fit the scaling on the training features only and reuse it for both test
# sets. Standardizing each test set with its own mean/std would put the test
# features on a different scale from the one the model was trained on.
train_mean = np.mean(features_train, axis=0)
train_std = np.std(features_train, axis=0)
train_std[train_std == 0] = 1  # constant columns: avoid dividing by zero

X_train = (features_train - train_mean) / train_std
X_test1 = (data2 - train_mean) / train_std
X_test2 = (data3 - train_mean) / train_std

# Overwrite the first column with a constant 1 in every set.
# Its range is then 0, so selectFeature below drops it whenever qmin >= 0.
X_train[:, 0] = 1
X_test1[:, 0] = 1
X_test2[:, 0] = 1

# keep only the columns whose range in the training data is in (qmin, qmax)
qmin, qmax = 2, 30
X_train, X_test1, X_test2 = selectFeature(X_train, X_test1, X_test2, qmin, qmax)
d = X_train.shape[1]  # number of selected features

# train the model and calculate the scores by 2-fold cross-validation
# (the rows are split in file order at the midpoint; nothing is shuffled)
N = 200     # number of trees in each forest
SEED = 0    # fixed seed so the scores are reproducible from run to run
half = N_train // 2

# fold 1: fit on the first half, score on the second half
clf1 = RandomForestClassifier(n_estimators=N, random_state=SEED)
#clf1 = ExtraTreesClassifier(n_estimators=N, random_state=SEED)
clf1.fit(X_train[:half], y_train[:half])
score1 = clf1.score(X_train[half:], y_train[half:])

# fold 2: fit on the second half, score on the first half
clf2 = RandomForestClassifier(n_estimators=N, random_state=SEED)
#clf2 = ExtraTreesClassifier(n_estimators=N, random_state=SEED)
clf2.fit(X_train[half:], y_train[half:])
score2 = clf2.score(X_train[:half], y_train[:half])

print("The cross-validation scores are : ", score1, score2)


# Write the prediction data into the submission files.
# This step used to be disabled by wrapping it in a string literal, and it
# referenced `clf` and `ran_features`, neither of which is defined in this
# cell. It is now real code behind a flag: leave WRITE_SUBMISSIONS False to
# keep the previous behaviour (no model is fitted, no file is written).
WRITE_SUBMISSIONS = False


def write_submission(path, labels):
    """Write predicted labels to `path` as a CSV file with header 'id,PES1'.

    `labels` holds predictions in the -1 / +1 encoding used for y_train;
    they are mapped back to the original classes (-1 -> 1, +1 -> 2).
    The id column is the 0-based position of each prediction.
    """
    with open(path, 'w', newline='') as file:
        filewriter = csv.writer(file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(['id', 'PES1'])
        for i, yi in enumerate(labels):
            filewriter.writerow([str(i), str(int(yi/2 + 1.5))])


if WRITE_SUBMISSIONS:
    # The two cross-validation models each saw only half of the data, so
    # the submission model is fitted on the full training set.
    clf = RandomForestClassifier(n_estimators=N)
    clf.fit(X_train, y_train)
    write_submission('submission2008.csv', clf.predict(X_test1))
    write_submission('submission2012.csv', clf.predict(X_test2))

# report the wall-clock time elapsed since `start` was recorded
stop = time.time()
elapsed = stop - start
print('The running time is ', elapsed)





The cross-validation scores are :  0.772530463289 0.777935855009
The running time is  130.26528215408325


In [None]:
150 trees (N = 150, i.e. n_estimators)
The cross-validation scores are :  0.770767613039 0.777441004546
The running time is  80.84886121749878

200 trees
The cross-validation scores are :  0.771138739407 0.779049268549
The running time is  97.27322006225586

250 trees
The cross-validation scores are :  0.772777880868 0.779234837473
The running time is  121.08990907669067

300 trees
The cross-validation scores are :  0.771850064947 0.778709058856
The running time is  135.82561111450195

200 trees, using univariate feature selection to train the random forest