## Intro

In [3]:
import numpy as np # for multi-dimensional array operations
import pandas as pd # for reading data from .csv files
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.decomposition import PCA # for principle component analysis (dimensionality reduction)
from sklearn.model_selection import train_test_split # for splitting the dataset into training and testing sets
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV # for getting the best hyper parameters
from sklearn.preprocessing import MinMaxScaler # for scaling of data before PCAfrom sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score #f1 score

In [5]:
train_set = pd.read_csv('../../Training and Testing sets/train_tfidf_features.csv') # import the training set
test_set = pd.read_csv('../../Training and Testing sets/test_tfidf_features.csv') # import the testing set

print(train_set.head(10))
print(test_set.head(10))


   id  label    0    1    2    3    4    5    6    7  ...  4990  4991  4992  \
0   1      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
1   2      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2   3      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3   4      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4   5      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
5   6      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
6   7      0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
7   8      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
8   9      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
9  10      1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

   4993  4994  4995  4996  4997  4998  4999  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.0   0.0   0.0 

In [6]:
train_set_label = train_set.loc[:, ["label"]]
features_names = [str(i) for i in range(0, 5000)]
train_set_features = train_set.loc[:, features_names] # train_set_features will not contain the label and id columns
test_set_features = test_set.loc[:, features_names]

print(train_set_features.shape)
print(test_set_features.shape)

(17184, 5000)
(4296, 5000)


Combine train and test

In [7]:
frames = [train_set_features,test_set_features]
to_reduce = pd.concat(frames)
print(to_reduce.shape)

(21480, 5000)


In [8]:
# scale the dataset before PCA
scaler = MinMaxScaler()
to_reduce = scaler.fit_transform(to_reduce)
print("After Scaling")
print(to_reduce.shape)

After Scaling
(21480, 5000)


## PCA (80%)
### Used to train: X_train, Y_train 
### Used to test: X_test, Y_test (if model works well)
Note: Validation test set separation already done in training

Perform PCA on train set features and separate into x_train and y_train data

Select the number of components such that the amount of variance that needs to be explained is greater than 80% percentage.

In [9]:
# perform PCA
pca = PCA(n_components = 0.80)
train_set_reduced = pca.fit_transform(train_set_features)
train_set_reduced = pd.DataFrame(data = train_set_reduced)
train_set_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1773,1774,1775,1776,1777,1778,1779,1780,1781,1782
0,-0.083198,-0.016048,-0.010595,-0.001965,-0.013779,-0.010989,-0.009680,-0.007520,-0.022370,-0.023418,...,-0.011568,-0.013349,-0.006594,0.004195,-0.000538,0.005127,0.005406,0.005453,0.011507,0.000360
1,-0.068421,-0.043649,-0.018443,-0.008228,-0.000051,-0.043177,0.127322,0.010281,0.014349,-0.005608,...,-0.035127,0.027850,-0.011023,-0.009811,0.004957,0.002454,-0.002219,-0.006450,-0.015337,-0.000006
2,-0.080171,-0.044642,-0.015342,-0.008697,-0.010481,-0.057562,0.075615,0.115411,0.111499,0.070551,...,0.003059,-0.007846,-0.008814,-0.001514,-0.008945,0.010647,0.002845,0.001764,0.006909,-0.001785
3,0.028600,-0.040406,0.002784,0.007087,0.000462,-0.031219,-0.137443,0.114581,-0.013427,0.077351,...,0.023494,-0.010312,0.016959,0.009277,0.015207,-0.007301,0.003193,-0.008891,-0.010354,0.007742
4,0.255054,-0.113418,-0.019237,-0.021468,-0.040033,-0.007000,-0.035960,0.008656,-0.026048,-0.003353,...,0.014642,0.002320,0.006176,-0.010438,-0.009482,-0.013039,-0.000114,0.000515,0.003214,0.007794
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17179,0.075930,0.078879,0.167305,-0.099091,0.304845,0.037762,-0.022524,0.018488,-0.015024,-0.006992,...,0.001864,0.009611,0.006904,-0.000056,-0.010809,-0.000104,-0.001102,0.004139,-0.002404,0.002048
17180,-0.072104,-0.019731,-0.014655,-0.005451,-0.005682,-0.012314,0.005986,-0.009017,-0.003207,-0.014674,...,-0.014893,0.021566,0.008008,-0.014403,-0.018688,-0.002211,-0.005081,-0.008327,0.003585,-0.011744
17181,0.002079,-0.041786,-0.016886,-0.008919,-0.018063,-0.020858,0.006685,-0.006916,-0.017607,0.010785,...,-0.003015,0.024486,-0.007753,0.002657,-0.003067,0.017837,-0.003254,0.009121,0.006637,-0.010113
17182,0.091355,-0.055903,-0.008473,-0.017559,-0.002191,-0.013549,-0.012348,-0.003722,-0.017070,-0.010729,...,0.000124,0.005183,0.001534,-0.001772,0.007042,-0.004696,-0.000614,0.003481,-0.000966,-0.004786


In [10]:
X = train_set_reduced
Y = train_set_label

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.15, random_state=41)
print(X_train.shape)
print(X_test.shape)

(14606, 1783)
(2578, 1783)


Perform PCA on test set features and separate into x_train and y_train data

In [11]:
features_names = [str(i) for i in range(0, 5000)]
test_set_features = test_set.loc[:, features_names] # test_set_features will not contain the label and id columns

# scale the dataset before PCA
test_set_rescaled = scaler.transform(test_set_features)

# perform PCA
submit_set_reduced = pca.transform(test_set_features) # use the pca from the train_set?
submit_set_features = pd.DataFrame(data = submit_set_reduced)
submit_set_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1773,1774,1775,1776,1777,1778,1779,1780,1781,1782
0,-0.082801,-0.009343,-0.013543,-0.013317,-0.007984,-0.002459,-0.026278,0.004096,-0.014072,-0.020606,...,0.024957,0.024105,-0.017568,0.003822,0.008023,-0.010257,0.025428,-0.004821,-0.002038,0.016787
1,-0.075595,0.005040,-0.017771,-0.014935,-0.000467,-0.001601,-0.055948,-0.017461,-0.031185,-0.017278,...,0.008950,0.015088,-0.007169,-0.009299,0.001821,-0.003901,-0.015390,0.009871,0.000731,-0.001370
2,0.163876,0.059162,-0.057706,-0.040884,-0.063164,-0.003223,0.020473,-0.005967,-0.005539,-0.008680,...,0.007533,-0.003971,0.002672,0.009313,0.000310,-0.002152,0.002346,0.000066,-0.000932,-0.001051
3,-0.006835,0.189357,-0.044389,-0.024452,-0.043835,-0.013077,0.001344,0.000674,-0.005021,-0.028871,...,-0.001175,-0.000238,0.002451,-0.004008,0.004178,0.000616,0.006505,0.003388,-0.001446,0.002821
4,0.120704,-0.089485,0.015896,0.030057,-0.018015,-0.057111,-0.072387,0.053490,-0.153091,0.026020,...,-0.000823,0.002520,0.004705,-0.005434,0.000582,-0.004489,-0.003229,-0.002975,-0.000870,-0.009341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4291,-0.076808,-0.024913,-0.014752,-0.008106,-0.009817,-0.013137,0.026288,-0.012828,-0.004845,-0.010727,...,0.000180,-0.000223,0.006229,-0.003326,0.013008,-0.002241,-0.004073,-0.001340,-0.003943,0.006119
4292,0.069331,-0.055078,-0.008249,-0.022183,-0.000882,0.024624,-0.085988,-0.152147,0.159634,0.018128,...,0.005807,-0.012032,-0.004000,-0.001530,-0.006042,0.002616,-0.023036,-0.015403,0.005047,0.019688
4293,0.042192,-0.056123,-0.009015,-0.022608,-0.000348,0.011589,-0.046442,-0.129357,0.126640,0.010080,...,0.011093,0.019703,0.021460,0.012654,0.000310,-0.017409,-0.008309,-0.002952,-0.025245,-0.008260
4294,0.220417,-0.101562,-0.013420,-0.014971,-0.033857,-0.007979,-0.018445,-0.004240,-0.019587,-0.000660,...,-0.002257,0.002456,-0.001471,-0.000596,-0.003709,0.001838,-0.010312,0.000559,0.002554,-0.000411


# Training: 
80% variance

{'n_estimators': [10, 100], 'max_features': ['log2', 'sqrt'], 'max_depth': [2, 4], 'bootstrap': [True, False]}

3 kfold

Results on X_test:0.0036029544226265538


In [12]:
# Parameters
# Number of tree in random forest 
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num =2)]
# Number of features to consider at every split
max_features = ['log2','sqrt']
#Maximum number of levels in tree
max_depth = [2,4]
# Method of selecting samples for training each tree
bootstrap = [True,False]

# Creating a param grid
random_grid = {'n_estimators':n_estimators,
            'max_features':max_features,
            'max_depth': max_depth,
            'bootstrap': bootstrap }

print(random_grid)

{'n_estimators': [10, 100], 'max_features': ['log2', 'sqrt'], 'max_depth': [2, 4], 'bootstrap': [True, False]}


In [13]:
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 0) # for 3-fold cross validation

In [14]:
clf = RandomForestClassifier()
grid = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, scoring = 'f1', refit = 'accuracy', n_jobs = -1 , cv = kfold, verbose = 2)
grid.fit(X_train, np.ravel(Y_train))
print(grid.best_params_) # gets the best hyper-parameters for random forest 

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END bootstrap=True, max_depth=4, max_features=log2, n_estimators=10; total time=   3.3s
[CV] END bootstrap=True, max_depth=4, max_features=log2, n_estimators=10; total time=   3.4s
[CV] END bootstrap=True, max_depth=4, max_features=log2, n_estimators=10; total time=   3.4s
[CV] END bootstrap=False, max_depth=2, max_features=log2, n_estimators=10; total time=   3.6s
[CV] END bootstrap=False, max_depth=2, max_features=log2, n_estimators=10; total time=   3.6s
[CV] END bootstrap=False, max_depth=2, max_features=log2, n_estimators=10; total time=   3.7s
[CV] END bootstrap=False, max_depth=4, max_features=log2, n_estimators=10; total time=   1.6s
[CV] END bootstrap=False, max_depth=4, max_features=log2, n_estimators=10; total time=   1.6s
[CV] END bootstrap=False, max_depth=4, max_features=log2, n_estimators=10; total time=   1.6s
[CV] END bootstrap=True, max_depth=4, max_features=sqrt, n_estimators=10; total time=   5.4s
[CV

In [15]:
grid_prediction = grid.predict(X_train)
score = f1_score(Y_train, grid_prediction)
print(score)

0.041615235408217245


# Training

80% variance

{'n_estimators': [10, 673, 1336, 2000], 'max_features': ['auto', 'log2', 'sqrt'], 'max_depth': [10, 73, 136, 200], 'bootstrap': [True, False]}

4 kfolds

Results on X_test:0.9960238568588469

Best: {'n_estimators': 10, 'max_features': 'sqrt', 'max_depth': 200, 'bootstrap': False}



In [16]:
# Parameters
# Number of tree in random forest 
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 2000, num =4)]
# Number of features to consider at every split
max_features = ['auto','log2','sqrt']
#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(start = 10, stop = 200, num = 4)]
# Method of selecting samples for training each tree
bootstrap = [True,False]

# Creating a param grid
random_grid = {'n_estimators':n_estimators,
            'max_features':max_features,
            'max_depth': max_depth,
            'bootstrap': bootstrap }

print(random_grid)

{'n_estimators': [10, 673, 1336, 2000], 'max_features': ['auto', 'log2', 'sqrt'], 'max_depth': [10, 73, 136, 200], 'bootstrap': [True, False]}


In [17]:
kfold = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 0) # for 4-fold cross validation

In [18]:
clf = RandomForestClassifier()
grid = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, scoring = 'f1', refit = 'accuracy', n_jobs = -1 , cv = kfold, verbose = 2)
grid.fit(X_train, np.ravel(Y_train))
print(grid.best_params_) # gets the best hyper-parameters for random forest 

Fitting 4 folds for each of 10 candidates, totalling 40 fits


  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=10, max_features=log2, n_estimators=2000; total time= 3.7min
[CV] END bootstrap=True, max_depth=10, max_features=log2, n_estimators=2000; total time= 3.7min
[CV] END bootstrap=True, max_depth=10, max_features=log2, n_estimators=2000; total time= 3.7min
[CV] END bootstrap=True, max_depth=10, max_features=log2, n_estimators=2000; total time= 3.7min


KeyboardInterrupt: 

In [None]:
grid_prediction = grid.predict(X_train)
score = f1_score(Y_train, grid_prediction)
print(score)

0.9960238568588469


## PCA (95%)
### Used to train: X_train, Y_train 
### Used to test: X_test, Y_test (if model works well)
Note: Validation test set separation already done in training

Perform PCA on train set features and separate into x_train and y_train data

Select the number of components such that the amount of variance that needs to be explained is greater than 80% percentage.

In [None]:
# perform PCA
pca = PCA(n_components = 0.95)
traintest_reduced = pca.fit_transform(to_reduce)
traintest_reduced = pd.DataFrame(data = traintest_reduced)
traintest_reduced.shape

(21480, 3833)

In [None]:
X = traintest_reduced.iloc[0:17184,:]
Y = train_set_label
submit_set_reduced = traintest_reduced.iloc[17184:,:]
print(X.shape)
print(Y.shape)
print(submit_set_reduced.shape)

(17184, 3833)
(17184, 1)
(4296, 3833)


In [None]:
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.15, random_state=41)
# print(X_train.shape)
# print(X_test.shape)

## Training Ada Boost

95% variance

{'n_estimators': [10, 580, 1150, 1720, 2290, 2860, 3430, 4000], 'max_features': ['auto', 'sqrt'], 'max_depth': [100, 142, 185, 228, 271, 314, 357, 400], 'bootstrap': [True, False]}


4 kfolds

Results on X_test:

Best:


In [43]:
# Parameters
# Number of tree in random forest 
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 4000, num =8)]
algorithm = ['SAMME.R','SAMME']

# Creating a param grid
random_grid = {'n_estimators':n_estimators,
            'algorithm': algorithm,}

print(random_grid)

{'n_estimators': [10, 580, 1150, 1720, 2290, 2860, 3430, 4000], 'algorithm': ['SAMME.R', 'SAMME']}


In [44]:
kfold = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 0) # for 4-fold cross validation

In [1]:
clf = AdaBoostClassifier()
grid = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, scoring = 'f1', refit = 'accuracy', n_jobs = -1 , cv = kfold, verbose = 2)
grid.fit(X, np.ravel(Y))
print(grid.best_params_) # gets the best hyper-parameters for random forest 

NameError: name 'AdaBoostClassifier' is not defined

In [None]:
# grid_prediction = grid.predict(X_train)
# score = f1_score(Y_train, grid_prediction)
# print(score)

## Training

95% variance

{'n_estimators': [10, 580, 1150, 1720, 2290, 2860, 3430, 4000], 'max_features': ['auto', 'sqrt'], 'max_depth': [100, 142, 185, 228, 271, 314, 357, 400], 'bootstrap': [True, False]}


4 kfolds

Results on X_test:

Best:


In [15]:
# Parameters
# Number of tree in random forest 
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 4000, num =4)]
# Number of features to consider at every split
max_features = ['auto','sqrt']
#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(start = 100, stop = 400, num = 4)]
# Method of selecting samples for training each tree
bootstrap = [True,False]

# Creating a param grid
random_grid = {'n_estimators':n_estimators,
            'max_features':max_features,
            'max_depth': max_depth,
            'bootstrap': bootstrap }

print(random_grid)

{'n_estimators': [10, 1340, 2670, 4000], 'max_features': ['auto', 'sqrt'], 'max_depth': [100, 200, 300, 400], 'bootstrap': [True, False]}


In [16]:
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 0) # for 4-fold cross validation

In [17]:
clf = RandomForestClassifier()
grid = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, scoring = 'f1', refit = 'accuracy', n_jobs = -1 , cv = kfold, verbose = 2)
grid.fit(X, np.ravel(Y))
print(grid.best_params_) # gets the best hyper-parameters for random forest 

Fitting 3 folds for each of 10 candidates, totalling 30 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=False, max_depth=200, max_features=sqrt, n_estimators=10; total time=  33.8s
[CV] END bootstrap=False, max_depth=200, max_features=sqrt, n_estimators=10; total time=  33.9s
[CV] END bootstrap=False, max_depth=200, max_features=sqrt, n_estimators=10; total time=  34.5s


  warn(
  warn(
  warn(


[CV] END bootstrap=False, max_depth=400, max_features=auto, n_estimators=1340; total time=51.4min


  warn(


[CV] END bootstrap=False, max_depth=400, max_features=auto, n_estimators=1340; total time=51.5min


  warn(


[CV] END bootstrap=False, max_depth=400, max_features=auto, n_estimators=1340; total time=51.8min


  warn(


[CV] END bootstrap=True, max_depth=200, max_features=sqrt, n_estimators=2670; total time=63.8min
[CV] END bootstrap=True, max_depth=100, max_features=auto, n_estimators=2670; total time=63.8min
[CV] END bootstrap=True, max_depth=100, max_features=auto, n_estimators=2670; total time=64.0min
[CV] END bootstrap=True, max_depth=200, max_features=sqrt, n_estimators=2670; total time=64.1min
[CV] END bootstrap=True, max_depth=100, max_features=auto, n_estimators=2670; total time=64.1min
[CV] END bootstrap=True, max_depth=200, max_features=sqrt, n_estimators=2670; total time=64.1min
[CV] END bootstrap=True, max_depth=200, max_features=auto, n_estimators=4000; total time=93.6min
[CV] END bootstrap=True, max_depth=200, max_features=auto, n_estimators=4000; total time=93.6min


  warn(
  warn(


[CV] END bootstrap=True, max_depth=200, max_features=auto, n_estimators=4000; total time=93.9min


  warn(


[CV] END bootstrap=False, max_depth=100, max_features=auto, n_estimators=2670; total time=99.3min
[CV] END bootstrap=False, max_depth=100, max_features=auto, n_estimators=2670; total time=99.1min
[CV] END bootstrap=False, max_depth=100, max_features=auto, n_estimators=2670; total time=99.9min
[CV] END bootstrap=True, max_depth=200, max_features=sqrt, n_estimators=4000; total time=95.3min
[CV] END bootstrap=True, max_depth=200, max_features=sqrt, n_estimators=4000; total time=95.6min
[CV] END bootstrap=True, max_depth=200, max_features=sqrt, n_estimators=4000; total time=95.8min
[CV] END bootstrap=False, max_depth=400, max_features=auto, n_estimators=2670; total time=237.7min
[CV] END bootstrap=False, max_depth=400, max_features=auto, n_estimators=2670; total time=238.0min
[CV] END bootstrap=False, max_depth=400, max_features=auto, n_estimators=2670; total time=238.2min
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, n_estimators=4000; total time=277.6min
[CV] END bootstrap=

In [18]:
# grid_prediction = grid.predict(X_train)
# score = f1_score(Y_train, grid_prediction)
# print(score)

# TO SUBMIT

In [25]:
y_predicted = grid.predict(submit_set_reduced)

In [28]:
# y_predicted = svc_model.predict(test_set_features)
y_predicted = pd.DataFrame(y_predicted, columns = ['label']) # convert y_predicted from nparray to pandas dataframe
y_predicted.insert(loc = 0, column = 'id', value = [i for i in range(17185, 17185 + 4296)]) # insert a column of the ids, starting from 17185
y_predicted.to_csv('skynet_submission2.csv', index = False) # output the predicted labels to ./skynet_submission.csv