In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
# read the training set
train = pd.read_csv('training.data', header = None)

# rename the columns of training set
train.columns = ['Class', 'Sequence']

train

Unnamed: 0,Class,Sequence
0,1,CTCATTGAAACAGCTATATTTCTTTTTCAGATTAGTGATGATGAAC...
1,1,AGTACTTCCTCTCTCCGTCTCTTTATATAGAGACCTAAACCAGATG...
2,1,AAAAGCATTCTAAGGCTGTTTCTCCACCAGGTTTCCGCCCCACCAC...
3,1,GCAGCCCAGACCTACCTCTTGCTTTTGCAGCAATATAAATGTCACC...
4,1,GGAATCTTCACTCTGAAATTTCCCTTGCAGGTGACCAGTTGTCTCT...
...,...,...
2782,2,GATTCTCCTGTGCTAGATGTGCAAATGCAAGCTAGTGGCTTCAAAA...
2783,2,CCCCAGCTTACCGCCCAGTACGCAGACTCTGAAGCTTATTGAGACT...
2784,2,GAGCTGCAGCAGTCGCGCATCCGCATCGACAGCCTCTCTGCCCAGC...
2785,2,GTATCCCCACTGCCTTGTATGTGGTGAAACCAAAGGGTTACTTTTA...


In [3]:
classes_train = train.loc[:, 'Class']
sequences_train = list(train.loc[:, 'Sequence'])
dataset_train = {}

# loop through sequences and split into individual nucleotides
for i, sequence in enumerate(sequences_train):
    
    # split into nucleotides, remove tab characters
    genes = list(sequence)
    genes = [x for x in genes if x != '\t']
    
    # append class assignment
    genes.append(classes_train[i])
    
    # add to dataset
    dataset_train[i] = genes
    
print(dataset_train[0])

['C', 'T', 'C', 'A', 'T', 'T', 'G', 'A', 'A', 'A', 'C', 'A', 'G', 'C', 'T', 'A', 'T', 'A', 'T', 'T', 'T', 'C', 'T', 'T', 'T', 'T', 'T', 'C', 'A', 'G', 'A', 'T', 'T', 'A', 'G', 'T', 'G', 'A', 'T', 'G', 'A', 'T', 'G', 'A', 'A', 'C', 'C', 'A', 'G', 'G', 'T', 'T', 'A', 'T', 'G', 'A', 'C', 'C', 'T', 'T', 1]


In [4]:
# convert the set into dataframe
df_train = pd.DataFrame(dataset_train)
print(df_train)

   0    1    2    3    4    5    6    7    8    9     ... 2777 2778 2779 2780  \
0     C    A    A    G    G    G    A    T    T    T  ...    C    C    T    T   
1     T    G    A    C    G    G    A    C    C    T  ...    A    A    C    C   
2     C    T    A    A    A    C    A    A    T    T  ...    G    C    G    G   
3     A    A    A    G    A    C    A    G    G    T  ...    A    C    A    G   
4     T    C    G    C    T    T    C    C    C    T  ...    G    A    T    A   
..  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
56    C    T    T    A    T    G    T    A    A    T  ...    G    C    C    G   
57    C    T    C    C    C    C    C    A    C    C  ...    C    T    G    C   
58    T    C    T    A    C    G    A    G    C    A  ...    A    C    C    C   
59    T    A    T    A    C    T    A    C    A    A  ...    G    C    C    C   
60    1    1    1    1    1    1    1    1    1    1  ...    2    2    2    2   

   2781 2782 2783 2784 2785

In [5]:
# Tranpose the matrix so each row represents a DNA sequence with its class 

df_train = df_train.transpose()
print(df_train.iloc[:5])

  0  1  2  3  4  5  6  7  8  9   ... 51 52 53 54 55 56 57 58 59 60
0  C  T  C  A  T  T  G  A  A  A  ...  T  A  T  G  A  C  C  T  T  1
1  A  G  T  A  C  T  T  C  C  T  ...  T  A  T  T  A  T  T  C  A  1
2  A  A  A  A  G  C  A  T  T  C  ...  G  G  T  G  A  T  C  T  T  1
3  G  C  A  G  C  C  C  A  G  A  ...  C  G  C  C  C  A  C  A  A  1
4  G  G  A  A  T  C  T  T  C  A  ...  G  G  C  A  T  T  C  C  C  1

[5 rows x 61 columns]


In [6]:
# Rename the 60th column to 'class'

df_train.rename(columns = {60: 'Class'}, inplace = True) 
print(df_train.iloc[:5])

   0  1  2  3  4  5  6  7  8  9  ... 51 52 53 54 55 56 57 58 59 Class
0  C  T  C  A  T  T  G  A  A  A  ...  T  A  T  G  A  C  C  T  T     1
1  A  G  T  A  C  T  T  C  C  T  ...  T  A  T  T  A  T  T  C  A     1
2  A  A  A  A  G  C  A  T  T  C  ...  G  G  T  G  A  T  C  T  T     1
3  G  C  A  G  C  C  C  A  G  A  ...  C  G  C  C  C  A  C  A  A     1
4  G  G  A  A  T  C  T  T  C  A  ...  G  G  C  A  T  T  C  C  C     1

[5 rows x 61 columns]


In [7]:
# Check all the unique values in training set
series_train = []
for name in df_train.columns:
    series_train.append(df_train[name].value_counts())
    
info_train = pd.DataFrame(series_train)
details_train = info_train.transpose()
print(details_train)

       0      1      2      3      4      5      6      7      8      9  ...  \
0    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
1    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
2    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
A  644.0  674.0  613.0  658.0  692.0  608.0  683.0  678.0  657.0  675.0  ...   
C  730.0  756.0  785.0  772.0  753.0  782.0  741.0  774.0  787.0  693.0  ...   
D    1.0    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
G  756.0  689.0  727.0  688.0  640.0  734.0  677.0  648.0  660.0  664.0  ...   
N    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
R    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
S    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
T  656.0  668.0  662.0  669.0  702.0  663.0  686.0  687.0  683.0  755.0  ...   

      51     52     53     54     55   

In [8]:
# Drop the class column in training set
df_no_class_train = df_train.drop(columns = ['Class'])
print(df_no_class_train.iloc[:5])

  0  1  2  3  4  5  6  7  8  9   ... 50 51 52 53 54 55 56 57 58 59
0  C  T  C  A  T  T  G  A  A  A  ...  T  T  A  T  G  A  C  C  T  T
1  A  G  T  A  C  T  T  C  C  T  ...  G  T  A  T  T  A  T  T  C  A
2  A  A  A  A  G  C  A  T  T  C  ...  C  G  G  T  G  A  T  C  T  T
3  G  C  A  G  C  C  C  A  G  A  ...  G  C  G  C  C  C  A  C  A  A
4  G  G  A  A  T  C  T  T  C  A  ...  G  G  G  C  A  T  T  C  C  C

[5 rows x 60 columns]


In [9]:
# Read the test set
test = pd.read_csv('test.data', header = None)
test

Unnamed: 0,0
0,CTGGTGTGAATGGCATTCTCTTTTTTGCAGACAGAGGAGCTGAACC...
1,CAAACGCCAAACCTGCTATCTCCTTTGCAGGGCGATTCGGGAGGCC...
2,TTTTCCCATCATCCTGTACTTCTTTTCTAGATGTCAGCCAGGAAGA...
3,AGGGCCCCTTACGTTCCCCTCTTTTCCCAGAGCCGGCTTCCCAGCC...
4,CCACTCTATTTGCATTTTGTTGCATTTCAGAGGAACATCAAGAAAT...
...,...
398,TTGAAGTTGCCTAGACCAGAGGACATAAGTATCATGTCTCCTTTAA...
399,TAACTTTCAGGCTGTACAAAATGGCAGTTGGATTTATGCTTGCTCA...
400,GAGAAAAGCCCGTCTGTTTGCAGCCCTCTGAACATGACATCTTCGG...
401,AGGTGTGCATCACCTTTGACCAGGCTGACCTGACCATCAAGCTGCC...


In [10]:
sequences_test = list(test.iloc[:, 0])
dataset_test = {}

# loop through sequences and split into individual nucleotides
for i, sequence in enumerate(sequences_test):
    
    # split into nucleotides, remove tab characters
    genes = list(sequence)
    genes = [x for x in genes if x != '\t']
    
    # add to dataset
    dataset_test[i] = genes
    
print(dataset_test[0])

['C', 'T', 'G', 'G', 'T', 'G', 'T', 'G', 'A', 'A', 'T', 'G', 'G', 'C', 'A', 'T', 'T', 'C', 'T', 'C', 'T', 'T', 'T', 'T', 'T', 'T', 'G', 'C', 'A', 'G', 'A', 'C', 'A', 'G', 'A', 'G', 'G', 'A', 'G', 'C', 'T', 'G', 'A', 'A', 'C', 'C', 'G', 'C', 'G', 'A', 'G', 'G', 'T', 'G', 'G', 'C', 'C', 'A', 'C', 'C']


In [11]:
# Convert the set of test data into datafram
df_test = pd.DataFrame(dataset_test)
print(df_test)

   0   1   2   3   4   5   6   7   8   9    ... 393 394 395 396 397 398 399  \
0    C   C   T   A   C   A   A   C   G   A  ...   T   C   C   G   A   T   T   
1    T   A   T   G   C   C   A   C   G   A  ...   C   C   T   C   A   T   A   
2    G   A   T   G   A   C   C   T   G   A  ...   T   C   G   T   A   G   A   
3    G   A   T   G   C   T   A   C   A   T  ...   T   C   T   G   G   A   C   
4    T   C   C   C   T   T   A   C   A   A  ...   T   C   G   A   C   A   T   
5    G   G   C   C   C   T   T   C   C   T  ...   A   C   C   C   C   G   T   
6    T   C   C   C   T   T   C   G   C   T  ...   C   A   A   T   A   T   T   
7    G   C   A   C   A   T   C   C   T   T  ...   T   T   A   G   C   T   C   
8    A   A   T   T   T   T   T   T   G   T  ...   A   C   T   C   A   G   A   
9    A   A   C   T   T   A   T   T   A   C  ...   A   C   G   C   A   C   G   
10   T   A   A   A   T   T   T   T   C   A  ...   A   C   C   A   G   C   G   
11   G   C   T   C   G   T   T   G   G   T  ...   A 

In [12]:
# Tranpose the test matrix so each row represents a DNA sequence

df_test = df_test.transpose()
print(df_test.iloc[:5])

  0  1  2  3  4  5  6  7  8  9   ... 50 51 52 53 54 55 56 57 58 59
0  C  T  G  G  T  G  T  G  A  A  ...  G  G  T  G  G  C  C  A  C  C
1  C  A  A  A  C  G  C  C  A  A  ...  G  G  T  G  T  G  T  C  T  G
2  T  T  T  T  C  C  C  A  T  C  ...  C  C  C  C  T  C  G  T  A  A
3  A  G  G  G  C  C  C  C  T  T  ...  A  T  C  C  C  C  A  T  C  G
4  C  C  A  C  T  C  T  A  T  T  ...  A  A  C  A  A  C  T  T  T  G

[5 rows x 60 columns]


In [13]:
# Check all the unique values in test set
series_test = []
for name in df_test.columns:
    series_test.append(df_test[name].value_counts())
    
info_test = pd.DataFrame(series_test)
details_test = info_test.transpose()
print(details_test)

      0      1      2      3      4      5      6      7      8      9   ...  \
G  120.0  105.0  114.0  114.0   79.0   94.0  103.0   83.0   89.0  102.0  ...   
C  105.0  102.0   91.0  112.0  112.0  116.0  117.0  104.0  122.0  115.0  ...   
A   99.0  105.0   99.0   95.0  109.0   96.0  100.0  112.0   92.0   92.0  ...   
T   79.0   90.0   99.0   82.0  103.0   97.0   83.0  104.0  100.0   94.0  ...   
D    NaN    1.0    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
N    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   

      50     51     52     53     54     55     56     57     58     59  
G  124.0  106.0  103.0  119.0  105.0  113.0   94.0  106.0   95.0  131.0  
C  110.0  110.0  104.0  113.0  109.0  132.0  111.0  101.0  119.0   86.0  
A   79.0  106.0   91.0   85.0  104.0   84.0  109.0   97.0  106.0   85.0  
T   90.0   81.0  105.0   86.0   85.0   74.0   89.0   99.0   83.0  101.0  
D    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    N

# Mismatch of Training and Test Set

Training set has nucleotide R and S that sequences in test set do not have


In [14]:
# Add one more columns to the test and training dataframe 
df_test['train'] = 0
df_no_class_train['train'] = 1

# Combine the test and training dataframes together as one dataframe
combined = pd.concat([df_no_class_train, df_test])
combined

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,train
0,C,T,C,A,T,T,G,A,A,A,...,T,A,T,G,A,C,C,T,T,1
1,A,G,T,A,C,T,T,C,C,T,...,T,A,T,T,A,T,T,C,A,1
2,A,A,A,A,G,C,A,T,T,C,...,G,G,T,G,A,T,C,T,T,1
3,G,C,A,G,C,C,C,A,G,A,...,C,G,C,C,C,A,C,A,A,1
4,G,G,A,A,T,C,T,T,C,A,...,G,G,C,A,T,T,C,C,C,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,T,T,G,A,A,G,T,T,G,C,...,A,T,A,C,C,C,C,G,A,0
399,T,A,A,C,T,T,T,C,A,G,...,A,T,G,G,A,T,T,T,A,0
400,G,A,G,A,A,A,A,G,C,C,...,A,G,C,C,C,T,G,C,T,0
401,A,G,G,T,G,T,G,C,A,T,...,G,A,C,A,T,G,A,A,T,0


In [15]:
# Treating each nucleotide as a feature, we convert its alphabetical value into numerical one by One-hot Encoding with get_dummies function in pandas
combined_dummies = pd.get_dummies(combined)

In [16]:
combined_dummies

Unnamed: 0,train,0_A,0_C,0_D,0_G,0_T,1_A,1_C,1_D,1_G,...,58_A,58_C,58_G,58_N,58_T,59_A,59_C,59_G,59_N,59_T
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,1,1,0,0,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
2,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,1,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
4,1,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
399,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,1,1,0,0,0,0
400,0,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
401,0,1,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1


In [17]:
# Create a new dataframe for training set after  encoding
train_after_combined = combined_dummies[combined_dummies['train'] == 1]
train_after_combined = train_after_combined.iloc[:, 1:]
train_after_combined

Unnamed: 0,0_A,0_C,0_D,0_G,0_T,1_A,1_C,1_D,1_G,1_T,...,58_A,58_C,58_G,58_N,58_T,59_A,59_C,59_G,59_N,59_T
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
1,1,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
2,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0,0,0,1,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,0,0,0,1,0,0,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2782,0,0,0,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2783,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2784,0,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2785,0,0,0,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0


In [18]:
# Create a new dataframe for test set after  encoding
test_after_combined = combined_dummies[combined_dummies['train'] == 0]
test_after_combined = test_after_combined.iloc[:, 1:]
test_after_combined

Unnamed: 0,0_A,0_C,0_D,0_G,0_T,1_A,1_C,1_D,1_G,1_T,...,58_A,58_C,58_G,58_N,58_T,59_A,59_C,59_G,59_N,59_T
0,0,1,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,0
1,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,0,0,0,0,1,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
3,1,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0
399,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
400,0,0,0,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
401,1,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1


In [19]:
# Separate the training set into training and test set to train a model
from sklearn import model_selection

# Create X and Y datasets for training
X = np.array(train_after_combined)
y = np.array(df_train['Class'])

# Convert numerical value type from int to float
X = X.astype(np.float)
y = y.astype(np.float)

# split data into training and testing datasets, with test size = 25% of training set
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=1)

In [20]:
X

array([[0., 1., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [21]:
# Write a function to print out the hyperparameters after tuning for each classifier 
def report_best_scores(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [22]:
# Tuning the hyperparameters for XGBoost classifier by Randomized Search Cross Validation

from xgboost import XGBClassifier
import xgboost as xgb
from random import randint, uniform
import scipy.stats as st

xgb_model = xgb.XGBClassifier()

params_xgb = {
    "eta": st.uniform(0, 1),
    "max_depth": randint(0, 6), # default 3
    "n_estimators": st.randint(100, 150),
    "max_depth": st.randint(1, 6),
    "learning_rate": st.uniform(0.03, 0.4),
    "colsample_bytree": st.uniform(0, 1),
    "subsample": st.uniform(0.01, 1),
    "gamma": st.uniform(0, 10),
    'reg_alpha': st.uniform(0, 1),
    'reg_lambda': st.uniform(0, 1),
}

search_xgb = model_selection.RandomizedSearchCV(xgb_model, param_distributions = params_xgb, random_state=42, n_iter=200, cv=3, verbose=1, n_jobs=1, return_train_score=True)

search_xgb.fit(X_train, y_train)

report_best_scores(search_xgb.cv_results_, 1)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:  4.4min finished


Model with rank: 1
Mean validation score: 0.963 (std: 0.003)
Parameters: {'colsample_bytree': 0.1800104374727235, 'eta': 0.08855558422324139, 'gamma': 1.0330895712401567, 'learning_rate': 0.2638682801770355, 'max_depth': 4, 'n_estimators': 141, 'reg_alpha': 0.397378988464401, 'reg_lambda': 0.4798970815920732, 'subsample': 0.6959677359829841}



In [23]:
# Build a XGBoost classifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

xgb_classifier = XGBClassifier(
    colsample_bytree = 0.1800104374727235, 
    eta = 0.08855558422324139,
    gamma = 1.0330895712401567, 
    learning_rate = 0.2638682801770355, 
    max_depth = 4,
    n_estimators = 141,
    reg_alpha = 0.397378988464401,
    reg_lambda = 0.4798970815920732, 
    subsample = 0.6959677359829841,
)

xgb_classifier.fit(X_train, y_train)
xgb_predictions = xgb_classifier.predict(X_test)
print(accuracy_score(y_test, xgb_predictions))

0.9626972740315638


In [24]:
# Check the accuracy for classifier with 10-cross validation

scoring = 'accuracy'
kfold = model_selection.KFold(n_splits=10, random_state = 1, shuffle = True)
print('accuracy score: ', model_selection.cross_val_score(xgb_classifier, X_train, y_train, cv= 10, scoring=scoring).mean())

accuracy score:  0.9607655502392346


In [25]:
# Tuning the hyperparameters for Random Forest classifier by Randomized Search Cross Validation
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier()

param_forest = [{'n_estimators': [10, 20, 50, 70, 100], 
                 'max_features': [1, 2, 4, 6, 8, 10, 13, 17, 20, None], 
                 'max_depth': [1, 2, 4, 6, 8, 10, 13, 17, 20, None], 
                 'bootstrap': [True, False], 
                 'criterion': ["gini", "entropy"], 
                 'min_samples_leaf': [1, 2, 4, 6, 8, 10, 13, 17, 20]}
]


search_forest = model_selection.RandomizedSearchCV(forest, param_distributions = param_forest, random_state=1, n_iter=200, cv=3, verbose=1, n_jobs=1, return_train_score=True)
search_forest.fit(X_train, y_train)

report_best_scores(search_forest.cv_results_, 1)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:   56.3s finished


Model with rank: 1
Mean validation score: 0.960 (std: 0.002)
Parameters: {'n_estimators': 100, 'min_samples_leaf': 8, 'max_features': 13, 'max_depth': 10, 'criterion': 'entropy', 'bootstrap': False}



In [26]:
# Build a Random Forest classifier

forest_classifier = RandomForestClassifier(n_estimators = 100, 
                                           criterion = 'entropy', 
                                           max_depth = 10, 
                                           max_features = 13, 
                                           min_samples_leaf = 8, 
                                           bootstrap = False)

forest_classifier.fit(X_train, y_train)
forest_predictions = forest_classifier.predict(X_test)
print(accuracy_score(y_test, forest_predictions))

0.9641319942611191


In [27]:
# Check the accuracy for classifier with 10-cross validation

scoring = 'accuracy'
kfold = model_selection.KFold(n_splits=10, random_state = 1, shuffle = True)
print('accuracy score: ', model_selection.cross_val_score(forest_classifier, X_train, y_train, cv= 10, scoring=scoring).mean())

accuracy score:  0.9574162679425837


In [28]:
# Tuning the hyperparameters for Adaboost classifier by Randomized Search Cross Validation
from sklearn.ensemble import AdaBoostClassifier

adaboost = AdaBoostClassifier()

param_adaboost = {'n_estimators' : [10, 20, 50, 70, 100],
                   'learning_rate' : st.uniform(0.1, 1), 
                   'algorithm' : ['SAMME.R', 'SAMME'],
                   'random_state' : [0, 1, 42, 10, 37, None]}

search_adaboost = model_selection.RandomizedSearchCV(adaboost, param_distributions = param_adaboost, random_state=1, n_iter=200, cv=3, verbose=1, n_jobs=1, return_train_score=True)
search_adaboost.fit(X_train, y_train)

report_best_scores(search_adaboost.cv_results_, 1)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:  2.9min finished


Model with rank: 1
Mean validation score: 0.946 (std: 0.009)
Parameters: {'algorithm': 'SAMME', 'learning_rate': 1.0500433102183813, 'n_estimators': 100, 'random_state': 42}



In [30]:
# Build a Adaboost classifier

adaboost_classifier = AdaBoostClassifier(n_estimators = 100, 
                                           learning_rate = 1.0500433102183813,
                                           algorithm = 'SAMME', 
                                           random_state = 42)

adaboost_classifier.fit(X_train, y_train)
adaboost_predictions = adaboost_classifier.predict(X_test)
print(accuracy_score(y_test, adaboost_predictions))

0.9469153515064562


In [31]:
# Check the accuracy for classifier with 10-cross validation

scoring = 'accuracy'
kfold = model_selection.KFold(n_splits=10, random_state = 1, shuffle = True)
print('accuracy score: ', model_selection.cross_val_score(adaboost_classifier, X_train, y_train, cv= 10, scoring=scoring).mean())

accuracy score:  0.9444976076555024


In [32]:
test_after_combined_array = np.array(test_after_combined)

In [33]:
# Predict the actual test set by XGBoost

test_predictions_xgb = xgb_classifier.predict(test_after_combined_array)
test_predictions_xgb = test_predictions_xgb.astype(int)
test_predictions_xgb

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,

In [34]:
# Predict the actual test set by Random Forest

test_predictions_forest = forest_classifier.predict(test_after_combined_array)
test_predictions_forest = test_predictions_forest.astype(int)
test_predictions_forest

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [35]:
# Find the difference between predictions of XGBoost and RandomForest classifier
difference_forest_xgb = test_predictions_forest - test_predictions_xgb

In [36]:
# The index where difference occurs 
difference_forest_xgb.nonzero()

(array([  5,   8,  20,  36,  45,  54,  91,  98, 164, 293, 321, 334, 353,
        363, 385]),)

In [37]:
# Predict the actual test set by AdaBoost

test_predictions_adaboost = adaboost_classifier.predict(test_after_combined_array)
test_predictions_adaboost = test_predictions_adaboost.astype(int)
test_predictions_adaboost

array([1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [39]:
# Find the difference between predictions of XGBoost and AdaBoost classifier
difference_adaboost_xgb = test_predictions_adaboost - test_predictions_xgb

In [40]:
# The index where difference occurs 
difference_adaboost_xgb.nonzero()

(array([  5,  32,  45,  92, 122, 144, 159, 161, 180, 211, 293, 349, 351,
        357, 365, 381, 385]),)