# Importing all the necessary libraries

In [21]:
import time

start = time.time()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from collections import Counter
import imblearn
from imblearn.over_sampling import RandomOverSampler

In [22]:
# Setting custom random seed value to get deterministic desired data and reproducibility 
np.random.seed(1)

#Data Parsing  

In [23]:
# Reading the files and saving them into seperate variables
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_feamat = pd.read_csv('feamat.csv')

# Dropping all the columns after 150
feamat_df = df_feamat.drop(df_feamat.iloc[:,150:].columns, axis=1)

# Replacing any infinite values with null
feamat_df = feamat_df.replace([np.inf, -np.inf], np.nan)

# Replacing all the null values with mean
feamat_df.fillna(feamat_df.mean(), inplace=True)

In [24]:
feamat_df.head(5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,...,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150
0,60-35-5,178,59.037114,-0.808,43.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,8.755965,0,0,9,2,1,5.134449,3.498274,4.051736,0.0,0.0,9,2.584963,1,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1
1,103-90-2,1983,151.063329,0.87,49.33,0.0,0.0,0.0,0.083333,0.142259,0.0,0.0,0.0,0.027778,9.090909,22.785137,6,6,20,2,2,12.908918,11.996548,16.048283,12.32864,9.162458,115,4.459432,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,1
2,968-81-0,1989,324.114378,2.96,100.72,0.0,0.0,0.0,0.185395,0.161948,0.0,0.0,0.0,0.12984,18.340265,48.04386,6,6,42,6,2,31.94559,33.058769,42.184565,42.829163,37.981351,452,5.523562,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,1,1,1,0,0,0,0,1,1,0,0,0,1
3,520-45-6,122903,168.042259,-0.551,60.44,0.0,0.0,0.0,0.055556,0.198742,0.0,0.0,0.0,0.01701,10.083333,22.622344,0,0,20,4,0,15.097876,13.66046,20.431205,20.32092,21.196989,102,4.584963,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,50594-66-6,44073,360.996485,4.557,89.67,0.0,0.0,0.0,0.136083,0.276855,0.0,0.0,0.0,0.048113,20.3136,38.268551,12,12,31,2,1,40.451408,31.298599,51.693067,52.809931,49.936323,465,5.643856,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1


In [25]:
# Calculating variance of all the columns and displaying it in descending order
feamat_df.var().sort_values(ascending=False)

V27    3.025037e+17
V2     1.235307e+14
V3     3.245040e+04
V5     4.760357e+03
V26    2.462033e+03
           ...     
V34    7.921236e-04
V33    6.790400e-04
V30    0.000000e+00
V32    0.000000e+00
V29    0.000000e+00
Length: 149, dtype: float64

In [26]:
feamat_df.describe()

Unnamed: 0,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,...,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150
count,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,...,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0,8831.0
mean,3532785.0,285.458335,2.569736,66.570037,0.006788,0.009845,0.040165,0.183573,0.319938,0.005254,0.007653,0.02405,0.088833,16.638949,43.796484,5.911448,6.007134,36.969199,3.507191,1.301891,34.827669,24.626109,34.220222,35.60523,33.962291,-12050260.0,4.99236,1.0,0.0,0.006341,0.0,0.000679,0.000793,0.001699,0.013022,0.003171,0.004643,0.015853,0.005435,0.004416,...,0.211075,0.147095,0.252293,0.223078,0.157627,0.184011,0.219907,0.226701,0.229646,0.181746,0.235194,0.151059,0.220926,0.259201,0.219341,0.243347,0.209263,0.279923,0.025252,0.192843,0.2207,0.252067,0.262145,0.253086,0.23576,0.296116,0.279357,0.227947,0.300985,0.889707,0.226701,0.244593,0.317065,0.325331,0.250481,0.437436,0.113917,0.265542,0.321594,0.540029
std,11114440.0,180.139956,2.802949,68.995342,0.053572,0.061903,0.089054,0.220668,0.446881,0.040384,0.052248,0.062317,0.149227,18.955131,28.194922,5.912583,6.064681,24.43482,4.02083,2.135531,46.573271,17.917387,26.86047,34.179641,49.618879,550003300.0,0.938373,0.0,0.0,0.079384,0.0,0.026058,0.028145,0.041181,0.113376,0.056222,0.067983,0.124915,0.073529,0.066312,...,0.408094,0.354221,0.434353,0.416334,0.364411,0.387515,0.414207,0.418722,0.420629,0.385657,0.424144,0.358126,0.414894,0.438221,0.413824,0.429127,0.406805,0.448987,0.156898,0.394553,0.414742,0.434224,0.439826,0.434804,0.424497,0.456569,0.448709,0.419532,0.458712,0.313273,0.418722,0.42987,0.465359,0.468525,0.433315,0.496098,0.317728,0.441646,0.467115,0.498423
min,4.0,30.010565,-31.171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.846793,0.0,0.0,2.0,0.0,0.0,2.774469,0.0,0.0,0.0,0.0,-2129542000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8404.0,167.131014,1.012,26.3,0.0,0.0,0.0,0.045361,0.058926,0.0,0.0,0.0,0.015625,10.0,25.340344,0.0,0.0,22.0,1.0,0.0,15.548938,12.996548,16.664184,14.332228,10.158734,76.0,4.459432,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37768.0,246.0652,2.449,51.75,0.0,0.0,0.0,0.102062,0.184195,0.0,0.0,0.0,0.044194,14.0,37.07793,6.0,6.0,31.0,3.0,1.0,24.56736,20.664758,28.158734,26.994183,22.650834,197.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,443793.0,356.212576,3.962,85.81,0.0,0.0,0.068041,0.257389,0.418641,0.0,0.0,0.025516,0.103891,20.28,55.024032,11.0,11.0,46.0,4.0,2.0,37.806696,31.325052,44.653284,46.757487,43.313472,478.5,5.643856,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
max,126843400.0,3566.878444,27.344,1473.2,1.443376,1.094819,1.149522,3.572863,11.3125,1.118034,1.5,0.941947,4.061309,1452.0,501.435941,62.0,66.0,471.0,95.0,51.0,1341.909199,352.936696,657.764008,536.23725,1131.462114,2100654000.0,9.044394,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
#  Displaying first 5 records of the test file 
df_test.head(5)

Unnamed: 0,x
0,88-60-8;1682
1,122931-48-0;1656
2,NOCAS_47311;36
3,55589-62-3;1850
4,79902-63-9;30


In [28]:
# Storing the class labels of training records in a seperate variable y
y=df_train['Expected']

# Displaying number of records each class has and this shows that the dataset is highly unbalanced
print(Counter(y))

Counter({2: 66495, 1: 10918})


In [29]:
# Splitting the first column of train.csv and test.csv to 'chem Id' and 'Assay Id'
df_train[['ChemId', 'Assay Id']] = df_train.Id.str.split(";", expand=True)
df_test[['ChemId', 'Assay Id']] = df_test.x.str.split(";", expand=True)

# Renaming column v1 to ChemId
feamat_df.rename(columns = {'V1':'ChemId'}, inplace=True)

# Merging columns after data parsing
df_train = df_train.merge(feamat_df, on="ChemId", how="left")
df_test = df_test.merge(feamat_df, on="ChemId", how="left")

# Dropping unnecessary features after data parsing
df_train.drop(['Id', 'ChemId', 'V2', 'Expected'], axis=1, inplace=True) 
test = df_test.drop(['x', 'ChemId', 'V2'], axis=1)


In [30]:
df_train.head(5)

Unnamed: 0,Assay Id,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,...,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150
0,1644,315.982463,4.592,40.46,0.0,0.0,0.0,0.166667,0.262892,0.0,0.0,0.0,0.055556,15.39,40.118723,12,12,30,0,2,43.687788,26.519496,59.32253,41.038992,52.085432,290,5.321928,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
1,2451,156.151415,3.852,17.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,31.73786,0,0,31,1,0,11.774469,10.332092,10.664184,8.332092,7.332092,139,4.321928,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,1,0,1,0,0,0,0
2,1384,361.347528,9.912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.231405,74.006064,0,0,72,0,0,32.07293,22.664728,24.332364,22.332364,21.332364,1050326996,5.459432,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,1,1,1,0,1,0,0,0,1
3,16,255.052302,2.294,83.66,0.0,0.0,0.117851,0.25,0.29741,0.0,0.0,0.05,0.081872,13.432099,31.79193,6,6,27,4,1,28.061789,22.74813,32.352497,26.617141,23.522689,273,5.169925,1,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,0,0,1,1,0,0,1,0,1,1,1,0,1,0,1,1,0,0,1,1,0,0,0,1,1,0,0,0,0,1,1,1,1,1,1
4,1856,149.894242,1.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.638949,28.95,0,0,2,0,0,115.302448,0.0,0.0,0.0,0.0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [31]:
# Display top 5 columns 
test.head(5)

Unnamed: 0,Assay Id,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,...,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150
0,1682,164.120115,3.659,20.23,0.0,0.0,0.0,0.068041,0.170103,0.0,0.0,0.0,0.024056,10.083333,30.830688,6,6,28,0,1,12.774469,12.332092,18.664184,16.996276,14.66046,106,4.584963,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
1,1656,431.05694,1.258,183.3,0.0,0.0,0.0,0.151375,0.227671,0.0,0.0,0.0,0.03595,24.271106,52.889481,12,12,45,10,2,47.476691,49.504506,62.259656,71.196703,77.917913,590,5.857981,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1,1,1,1,0,1,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,1,1,0,0,1,0,0,1,1,1
2,36,695.250845,6.365,95.92,0.0,0.0,0.174792,0.78911,1.360933,0.0,0.0,0.099536,0.401739,36.36214,104.13172,21,22,88,7,1,68.9681,68.337655,99.85624,110.392516,108.648356,1715,6.754888,1,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,1,0,0,0,1,1,1,0,1,0,1,0,1,0,1,1,0,1,1,1,1,0,1,1,1,0,0,0,0,1,1,0,1,1,1
3,1850,200.94981,-1.34,68.82,0.0,0.0,0.0,0.058926,0.235702,0.0,0.0,0.0,0.0,11.0,60.315172,0,0,15,5,0,30.182453,20.61302,24.039282,19.054704,13.150049,-1474836500,4.321928,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1,0,1,0,0,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,1,0
4,30,418.271924,4.775,72.83,0.0,0.0,0.0,0.179152,0.516591,0.0,0.0,0.0,0.105379,24.638672,73.348134,0,0,68,5,1,33.872345,34.324644,51.869858,55.313472,51.866134,632,6.0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,1,1,0,1,0,0,0,0


# Train Test Split


In [32]:
# Setting custom random seed value to get deterministic desired data and reproducibilty
np.random.seed(3)

In [33]:
# splitting training set 
x_train, x_test, y_train, y_test = train_test_split(df_train, y, test_size=.25, random_state=1)

In [34]:
# Printing the shape of the train test split
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(58059, 149) (19354, 149) (58059,) (19354,)


# Feature Selection using Variance Threshold

In [35]:
# Setting custom random seed value to get deterministic desired data and reproducibility
np.random.seed(7)

In [36]:
# Creating Variance Threshold object
selector = VarianceThreshold(threshold=0.5)

# Applying the threshold to x_train
selector.fit_transform(x_train)

# Fetching and Displaying all the remaining columns
all_columns = x_train.columns

# Output shows only one column has been removed
print(all_columns)

Index(['Assay Id', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       ...
       'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149',
       'V150'],
      dtype='object', length=149)


# Feature Scaling

In [37]:
# Creating the Scaling object
sc = StandardScaler()

# Fitting the train data to the feature scaling 
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Fitting the final test data
test = sc.transform(test)

#Oversampling


In [38]:
# Creating a object of RandomOverSampler
oversample = RandomOverSampler(sampling_strategy=0.5)

# Displaying count of records each class has before oversampling
print(Counter(y_train))

# fit and apply the transform
x_train, y_train = oversample.fit_resample(x_train, y_train)

# Displaying how many records each class has after oversampling
print(Counter(y_train))

Counter({2: 49847, 1: 8212})
Counter({2: 49847, 1: 24923})




# XGBoost (Bootstrap Aggregation)

In [39]:
# Creating XGBoost classifier 
xgb = XGBClassifier(max_depth=8, n_estimators=700,n_jobs=-1)

# Fitting the train test split
xgb.fit(x_train, y_train)

# Predicting the test split
xgb_pred = xgb.predict(x_test)

# F1-Macro Score Accuracy 
print('XGB', f1_score(y_test, xgb_pred, average='macro'))

trainend = time.time()

# Displaying total execution time in Seconds
print(f"Total Execution Time : {trainend - start}")

XGB 0.8089926483360065
Total Execution Time : 344.82408809661865


In [40]:
# # Downloading the submission file for kaggle

# from google.colab import files

# xgb_pred = xgb.predict(test)
# output = pd.DataFrame({'Id': df_test.x, 'Predicted':xgb_pred })
# output.to_csv('Bit_and_Byte_submission.csv', index=False)
# print("Your submission was successfully saved!")

# files.download('Bit_and_Byte_submission.csv')