In [1]:
#Import necessary libraries
import pandas as pd
import numpy as np

In [2]:
#Loading and reading required files
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
feature_data = pd.read_csv("feamat.csv")

In [3]:
#Renaming the x column as Id in test dataset
test_data = test_data.rename(columns = {"x": "Id"})
test_data.head()

Unnamed: 0,Id
0,88-60-8;1682
1,122931-48-0;1656
2,NOCAS_47311;36
3,55589-62-3;1850
4,79902-63-9;30


In [4]:
#Splitting Id into 2 separate columns
train_data[["Chemical_Id", "Assay_Id"]] = train_data.Id.str.split(";", expand = True)
test_data[["Chemical_Id", "Assay_Id"]] = test_data.Id.str.split(";", expand = True)
train_data.head()

Unnamed: 0,Id,Expected,Chemical_Id,Assay_Id
0,2971-36-0;1644,2,2971-36-0,1644
1,693-54-9;2451,2,693-54-9,2451
2,7173-51-5;1384,2,7173-51-5,1384
3,138261-41-3;16,2,138261-41-3,16
4,7681-82-5;1856,2,7681-82-5,1856


In [5]:
#Merging the train and test csv with feamat csv 
train_mapping = pd.merge(train_data, feature_data, left_on = "Chemical_Id", right_on = "V1", how = "left")
test_mapping = pd.merge(test_data, feature_data, left_on = "Chemical_Id", right_on = "V1", how = "left")

In [6]:
#Dropping V1 & Id
train_mapping.drop(["V1", "Id"], axis = 1, inplace = True)
test_mapping.drop(["V1", "Id"], axis = 1, inplace = True)

In [7]:
train_mapping.head()

Unnamed: 0,Expected,Chemical_Id,Assay_Id,V2,V3,V4,V5,V6,V7,V8,...,V1066,V1067,V1068,V1069,V1070,V1071,V1072,V1073,V1074,V1075
0,2,2971-36-0,1644,76302,315.982463,4.592,40.46,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2,693-54-9,2451,12741,156.151415,3.852,17.07,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2,7173-51-5,1384,23558,361.347528,9.912,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2,138261-41-3,16,86418,255.052302,2.294,83.66,0.0,0.0,0.117851,...,0,0,0,0,0,0,0,0,0,0
4,2,7681-82-5,1856,5238,149.894242,1.05,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
test_mapping.head()

Unnamed: 0,Chemical_Id,Assay_Id,V2,V3,V4,V5,V6,V7,V8,V9,...,V1066,V1067,V1068,V1069,V1070,V1071,V1072,V1073,V1074,V1075
0,88-60-8,1682,6937,164.120115,3.659,20.23,0.0,0.0,0.0,0.068041,...,0,0,0,0,0,0,0,0,0,0
1,122931-48-0,1656,91779,431.05694,1.258,183.3,0.0,0.0,0.0,0.151375,...,0,0,0,0,0,0,0,0,0,0
2,NOCAS_47311,36,53257735,695.250845,6.365,95.92,0.0,0.0,0.174792,0.78911,...,0,0,0,0,0,0,0,0,0,0
3,55589-62-3,1850,11074431,200.94981,-1.34,68.82,0.0,0.0,0.0,0.058926,...,0,0,0,0,0,0,0,0,0,0
4,79902-63-9,30,54454,418.271924,4.775,72.83,0.0,0.0,0.0,0.179152,...,0,0,0,0,0,0,0,0,0,0


In [9]:
#Converting the Assay Id datatype into float
train_mapping["Assay_Id"] = train_mapping["Assay_Id"].astype(float)
test_mapping["Assay_Id"] = test_mapping["Assay_Id"].astype(float)

In [10]:
#Replacing infinity values in both the datasets with NaN
train_mapping = train_mapping.replace([np.inf, -np.inf], np.nan)
test_mapping = test_mapping.replace([np.inf, -np.inf], np.nan)

In [11]:
#Replacing NaN's with zeroes
train_mapping = train_mapping.replace(np.nan, 0)
test_mapping = test_mapping.replace(np.nan, 0)

In [12]:
#Replacing the Null values with median
train_mapping.fillna(train_mapping.median(), inplace = True)
test_mapping.fillna(test_mapping.median(), inplace = True)

  train_mapping.fillna(train_mapping.median(), inplace = True)
  test_mapping.fillna(test_mapping.median(), inplace = True)


In [13]:
#Import model libraries
from xgboost import XGBClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV, cross_val_score

In [14]:
train_mapping = train_mapping.drop("Chemical_Id", axis = 1)

In [15]:
X_train = train_mapping.drop("Expected", axis = 1)
Y_train = train_mapping["Expected"]
X_test  = test_mapping.drop("Chemical_Id", axis=1).copy()

X_train.shape, Y_train.shape, X_test.shape

((77413, 1075), (77413,), (11139, 1075))

In [16]:
#Setting the threshold value
var_thres=VarianceThreshold(threshold=0.21)
var_thres.fit(X_train)

VarianceThreshold(threshold=0.21)

In [17]:
sum(var_thres.get_support())

149

In [18]:
len(X_train.columns[(var_thres.get_support())])

149

In [19]:
constant_columns= [column for column in X_train.columns
                  if column not in X_train.columns[var_thres.get_support()]]
print(len(constant_columns))

926


In [20]:
X_train=X_train.drop(constant_columns,axis=1)
X_test=X_test.drop(constant_columns,axis=1)

In [21]:
from imblearn.over_sampling import SMOTE

smote=SMOTE(random_state=10)
X_train_smote,Y_train_smote=smote.fit_resample(X_train,Y_train)

In [22]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()  
scaler.fit(X_train_smote)  
X_train_smote = scaler.transform(X_train_smote)  
# apply same transformation to test data
X_test = scaler.transform(X_test) 

In [23]:
#XGBoost classifier
clf=XGBClassifier(max_depth = 8,n_estimators = 400, random_state = 11)

In [24]:
#Fitting the model with the classifier
clf.fit(X_train_smote,Y_train_smote)
pred = clf.predict(X_test)

acc_score = clf.score(X_train_smote,Y_train_smote)
print(acc_score)



0.9908790134596587


In [25]:
#Performing K fold Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

kfold = KFold(n_splits = 10, random_state = 11, shuffle = True)
scores = cross_val_score(clf, X_train_smote, Y_train_smote, cv = 10, scoring = "accuracy")

print("Scores:", (scores * 100))
print("Mean:", (scores.mean() * 100))
print("Standard Deviation", (scores.std() * 100))







































Scores: [74.13339349 84.48755546 97.72915257 97.60132341 97.39078126 97.74419129
 97.49605233 97.51109106 97.75923002 97.60884277]
Mean: 93.94616136551622
Standard Deviation 7.67618292840829


In [26]:
#Evaluate Confusion Matrix for model with highest accuracy
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

predictions = cross_val_predict(clf, X_train_smote, Y_train_smote, cv = 10)
confusion_matrix(Y_train_smote, predictions)









































array([[61162,  5333],
       [ 2718, 63777]], dtype=int64)

In [27]:
#F1 Score
from sklearn.metrics import classification_report
print(classification_report(Y_train_smote, predictions))


              precision    recall  f1-score   support

           1       0.96      0.92      0.94     66495
           2       0.92      0.96      0.94     66495

    accuracy                           0.94    132990
   macro avg       0.94      0.94      0.94    132990
weighted avg       0.94      0.94      0.94    132990

