In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


# LOADING DATASET

In [58]:
test = pd.read_csv("data_set_ALL_AML_independent.csv")
train = pd.read_csv("data_set_ALL_AML_train.csv")


In [59]:
print(test.shape)
test.head()

(7129, 70)


Unnamed: 0,Gene Description,Gene Accession Number,39,call,40,call.1,42,call.2,47,call.3,...,65,call.29,66,call.30,63,call.31,64,call.32,62,call.33
0,AFFX-BioB-5_at (endogenous control),AFFX-BioB-5_at,-342,A,-87,A,22,A,-243,A,...,-62,A,-58,A,-161,A,-48,A,-176,A
1,AFFX-BioB-M_at (endogenous control),AFFX-BioB-M_at,-200,A,-248,A,-153,A,-218,A,...,-198,A,-217,A,-215,A,-531,A,-284,A
2,AFFX-BioB-3_at (endogenous control),AFFX-BioB-3_at,41,A,262,A,17,A,-163,A,...,-5,A,63,A,-46,A,-124,A,-81,A
3,AFFX-BioC-5_at (endogenous control),AFFX-BioC-5_at,328,A,295,A,276,A,182,A,...,141,A,95,A,146,A,431,A,9,A
4,AFFX-BioC-3_at (endogenous control),AFFX-BioC-3_at,-224,A,-226,A,-211,A,-289,A,...,-256,A,-191,A,-172,A,-496,A,-294,A


In [60]:
print(train.shape)
train.head()

(7129, 78)


Unnamed: 0,Gene Description,Gene Accession Number,1,call,2,call.1,3,call.2,4,call.3,...,29,call.33,30,call.34,31,call.35,32,call.36,33,call.37
0,AFFX-BioB-5_at (endogenous control),AFFX-BioB-5_at,-214,A,-139,A,-76,A,-135,A,...,15,A,-318,A,-32,A,-124,A,-135,A
1,AFFX-BioB-M_at (endogenous control),AFFX-BioB-M_at,-153,A,-73,A,-49,A,-114,A,...,-114,A,-192,A,-49,A,-79,A,-186,A
2,AFFX-BioB-3_at (endogenous control),AFFX-BioB-3_at,-58,A,-1,A,-307,A,265,A,...,2,A,-95,A,49,A,-37,A,-70,A
3,AFFX-BioC-5_at (endogenous control),AFFX-BioC-5_at,88,A,283,A,309,A,12,A,...,193,A,312,A,230,P,330,A,337,A
4,AFFX-BioC-3_at (endogenous control),AFFX-BioC-3_at,-295,A,-264,A,-376,A,-419,A,...,-51,A,-139,A,-367,A,-188,A,-407,A


In [61]:
y = pd.read_csv("actual.csv")
y.head()

Unnamed: 0,patient,cancer
0,1,ALL
1,2,ALL
2,3,ALL
3,4,ALL
4,5,ALL


In [62]:
y.shape

(72, 2)

In [63]:
y = y.replace({'ALL':0,'AML':1})
labels = ['ALL', 'AML']

# PROCESSING TRAIN AND TEST DATASET

REMOVING UNWANTED COLUMNS AND ARRANGING COLUMNS IN ORDER

In [64]:
tcol = [col for col in train.columns if "call" not in col]
train = train[tcol]
train_col = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', 
       '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38']

train = train.reindex(columns=train_col)
train.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,29,30,31,32,33,34,35,36,37,38
0,-214,-139,-76,-135,-106,-138,-72,-413,5,-88,...,15,-318,-32,-124,-135,-20,7,-213,-25,-72
1,-153,-73,-49,-114,-125,-85,-144,-260,-127,-105,...,-114,-192,-49,-79,-186,-207,-100,-252,-20,-139
2,-58,-1,-307,265,-76,215,238,7,106,42,...,2,-95,49,-37,-70,-50,-57,136,124,-1
3,88,283,309,12,168,71,55,-2,268,219,...,193,312,230,330,337,101,132,318,325,392
4,-295,-264,-376,-419,-230,-272,-399,-541,-210,-178,...,-51,-139,-367,-188,-407,-369,-377,-209,-396,-324


In [65]:
tcol = [col for col in test.columns if "call" not in col]
test = test[tcol]
test_col = ['39', '40', '41', '42', '43', '44', '45', '46',
       '47', '48', '49', '50', '51', '52', '53',  '54', '55', '56', '57', '58', '59',
       '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72']

test = test.reindex(columns=test_col)
test.head()

Unnamed: 0,39,40,41,42,43,44,45,46,47,48,...,63,64,65,66,67,68,69,70,71,72
0,-342,-87,-62,22,86,-146,-187,-56,-243,-130,...,-161,-48,-62,-58,-76,-154,-79,-55,-59,-131
1,-200,-248,-23,-153,-36,-74,-187,-43,-218,-177,...,-215,-531,-198,-217,-98,-136,-118,-44,-114,-126
2,41,262,-7,17,-141,170,312,43,-163,-28,...,-46,-124,-5,63,-153,49,-30,12,23,-50
3,328,295,142,276,252,174,142,177,182,266,...,146,431,141,95,237,180,68,129,146,211
4,-224,-226,-233,-211,-201,-32,114,-116,-289,-170,...,-172,-496,-256,-191,-215,-257,-110,-108,-171,-206


CREATING TRANSPOSE OF BOTH THE DATASETS

In [66]:
xtrain = train.T
xtest = test.T

In [67]:
print(xtrain.shape)
xtrain.head()

(38, 7129)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7119,7120,7121,7122,7123,7124,7125,7126,7127,7128
1,-214,-153,-58,88,-295,-558,199,-176,252,206,...,185,511,-125,389,-37,793,329,36,191,-37
2,-139,-73,-1,283,-264,-400,-330,-168,101,74,...,169,837,-36,442,-17,782,295,11,76,-14
3,-76,-49,-307,309,-376,-650,33,-367,206,-215,...,315,1199,33,168,52,1138,777,41,228,-41
4,-135,-114,265,12,-419,-585,158,-253,49,31,...,240,835,218,174,-110,627,170,-50,126,-91
5,-106,-125,-76,168,-230,-284,4,-122,70,252,...,156,649,57,504,-26,250,314,14,56,-25


In [68]:
print(xtest.shape)
xtest.head()

(34, 7129)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7119,7120,7121,7122,7123,7124,7125,7126,7127,7128
39,-342,-200,41,328,-224,-427,-656,-292,137,-144,...,277,1023,67,214,-135,1074,475,48,168,-70
40,-87,-248,262,295,-226,-493,367,-452,194,162,...,83,529,-295,352,-67,67,263,-33,-33,-21
41,-62,-23,-7,142,-233,-284,-167,-97,-12,-70,...,129,383,46,104,15,245,164,84,100,-18
42,22,-153,17,276,-211,-250,55,-141,0,500,...,413,399,16,558,24,893,297,6,1971,-42
43,86,-36,-141,252,-201,-384,-420,-197,-60,-468,...,341,91,-84,615,-52,1235,9,7,1545,-81


In [69]:
x_train =xtrain.reset_index(drop=True)
y_train = y[y.patient <= 38].reset_index(drop=True)


x_test = xtest.reset_index(drop=True)
y_test = y[y.patient > 38].reset_index(drop=True)

# SVM

In [70]:
sv = SVC(C= 0.1, decision_function_shape= 'ovo', gamma= 1, kernel= 'linear')
sv.fit(x_train, y_train.iloc[:,1])

SVC(C=0.1, decision_function_shape='ovo', gamma=1, kernel='linear')

In [71]:
y_pred_sv = sv.predict(x_test)
print(accuracy_score(y_test.iloc[:,1], y_pred_sv))

0.9705882352941176


In [72]:
svm_param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001, 0.00001, 10], "kernel": ["linear", "rbf", "poly"], "decision_function_shape" : ["ovo", "ovr"]} 

# Create SVM grid search classifier
svm_grid = GridSearchCV(SVC(), svm_param_grid, cv=3)

# Train the classifier
svm_grid.fit(x_train, y_train.iloc[:,1])

print("Best Parameters:\n", svm_grid.best_params_)

# Select best svc
best_svc = svm_grid.best_estimator_

# Make predictions using the optimised parameters
svm_pred = best_svc.predict(x_test)

print('SVM accuracy:', round(accuracy_score(y_test.iloc[:,1], svm_pred), 3))

Best Parameters:
 {'C': 0.1, 'decision_function_shape': 'ovo', 'gamma': 1, 'kernel': 'linear'}
SVM accuracy: 0.971


# RANDOM FOREST CLASSIFIER

In [73]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train.iloc[:,1])

RandomForestClassifier()

In [74]:
y_pred_rf = dt.predict(x_test)
print(accuracy_score(y_test.iloc[:,1], y_pred_rf))

0.9117647058823529
