In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings("ignore")

## Reading the data

In [2]:
data = pd.read_csv("gliomadataset/TCGA_GBM_LGG_Mutations_all.csv", header=None, na_values="?")
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,Grade,Project,Case_ID,Gender,Age_at_diagnosis,Primary_Diagnosis,Race,IDH1,TP53,ATRX,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
1,LGG,TCGA-LGG,TCGA-DU-8164,Male,51 years 108 days,"Oligodendroglioma, NOS",white,MUTATED,NOT_MUTATED,NOT_MUTATED,...,MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
2,LGG,TCGA-LGG,TCGA-QH-A6CY,Male,38 years 261 days,Mixed glioma,white,MUTATED,NOT_MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
3,LGG,TCGA-LGG,TCGA-HW-A5KM,Male,35 years 62 days,"Astrocytoma, NOS",white,MUTATED,MUTATED,MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
4,LGG,TCGA-LGG,TCGA-E1-A7YE,Female,32 years 283 days,"Astrocytoma, anaplastic",white,MUTATED,MUTATED,MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,MUTATED,NOT_MUTATED


In [3]:
dataframe = pd.DataFrame(data)
dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,Grade,Project,Case_ID,Gender,Age_at_diagnosis,Primary_Diagnosis,Race,IDH1,TP53,ATRX,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
1,LGG,TCGA-LGG,TCGA-DU-8164,Male,51 years 108 days,"Oligodendroglioma, NOS",white,MUTATED,NOT_MUTATED,NOT_MUTATED,...,MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
2,LGG,TCGA-LGG,TCGA-QH-A6CY,Male,38 years 261 days,Mixed glioma,white,MUTATED,NOT_MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
3,LGG,TCGA-LGG,TCGA-HW-A5KM,Male,35 years 62 days,"Astrocytoma, NOS",white,MUTATED,MUTATED,MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
4,LGG,TCGA-LGG,TCGA-E1-A7YE,Female,32 years 283 days,"Astrocytoma, anaplastic",white,MUTATED,MUTATED,MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,MUTATED,NOT_MUTATED


## Handeling with missing values

In [4]:
dataframe.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
dtype: int64

In [5]:
dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,Grade,Project,Case_ID,Gender,Age_at_diagnosis,Primary_Diagnosis,Race,IDH1,TP53,ATRX,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
1,LGG,TCGA-LGG,TCGA-DU-8164,Male,51 years 108 days,"Oligodendroglioma, NOS",white,MUTATED,NOT_MUTATED,NOT_MUTATED,...,MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
2,LGG,TCGA-LGG,TCGA-QH-A6CY,Male,38 years 261 days,Mixed glioma,white,MUTATED,NOT_MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
3,LGG,TCGA-LGG,TCGA-HW-A5KM,Male,35 years 62 days,"Astrocytoma, NOS",white,MUTATED,MUTATED,MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
4,LGG,TCGA-LGG,TCGA-E1-A7YE,Female,32 years 283 days,"Astrocytoma, anaplastic",white,MUTATED,MUTATED,MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,MUTATED,NOT_MUTATED


## Label Encoding 

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
le = LabelEncoder()

In [8]:
cols=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]
for i in cols:
    dataframe[i] = le.fit_transform(dataframe[i])
dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,1,0,0,2,838,7,1,0,2,0,...,0,2,1,0,0,2,0,0,0,2
1,2,2,481,3,410,5,6,1,1,2,...,1,1,2,2,2,1,2,2,2,1
2,2,2,733,3,223,4,6,1,1,2,...,2,1,2,2,2,1,2,2,2,1
3,2,2,693,3,174,1,6,1,0,1,...,2,1,2,2,2,1,2,2,2,1
4,2,2,519,1,126,2,6,1,0,1,...,2,1,2,2,2,1,2,2,1,1


In [9]:
dataframe.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
count,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,...,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0
mean,1.15759,1.577057,431.0,2.152955,417.232908,3.366165,5.755504,1.517961,0.590962,1.742758,...,1.943221,0.95365,1.910776,1.964079,1.965238,0.968714,1.966396,1.971031,1.971031,0.975666
std,0.98749,0.496654,249.270937,0.995238,242.70315,1.333267,0.771754,0.502282,0.494294,0.440012,...,0.23651,0.215808,0.411733,0.192331,0.189508,0.180728,0.186634,0.174592,0.174592,0.161522
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,215.5,1.0,208.5,3.0,6.0,1.0,0.0,1.0,...,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0
50%,2.0,2.0,431.0,3.0,418.0,3.0,6.0,2.0,1.0,2.0,...,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0
75%,2.0,2.0,646.5,3.0,627.5,4.0,6.0,2.0,1.0,2.0,...,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0
max,2.0,2.0,862.0,3.0,838.0,7.0,6.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [10]:
target = dataframe[0]
data = dataframe.drop(columns=[0])
data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,17,18,19,20,21,22,23,24,25,26
0,0,0,2,838,7,1,0,2,0,2,...,0,2,1,0,0,2,0,0,0,2
1,2,481,3,410,5,6,1,1,2,1,...,1,1,2,2,2,1,2,2,2,1
2,2,733,3,223,4,6,1,1,2,1,...,2,1,2,2,2,1,2,2,2,1
3,2,693,3,174,1,6,1,0,1,1,...,2,1,2,2,2,1,2,2,2,1
4,2,519,1,126,2,6,1,0,1,1,...,2,1,2,2,2,1,2,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
858,1,234,1,803,3,6,2,1,2,0,...,2,1,2,2,2,1,2,2,2,1
859,1,197,3,831,3,6,2,0,2,0,...,2,1,2,2,2,1,2,2,2,1
860,1,267,1,802,3,6,2,0,2,0,...,2,1,2,2,2,1,2,2,2,1
861,1,301,3,632,3,6,2,0,2,1,...,2,0,2,2,2,1,2,2,2,1


## Spliting the data into training and testing

In [11]:
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size = 0.3)

In [12]:
x_train.shape

(604, 26)

In [13]:
y_train.shape

(604,)

## Preproccessing and Pipelining

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

In [15]:
pipe1 = Pipeline([('sc', StandardScaler()), ('pca', PCA()), ('SVM', SVC())])
pipe2 = Pipeline([('sc', StandardScaler()), ('pca', PCA()), ("DSC Tree", DecisionTreeClassifier(criterion='gini', max_depth=3))])
pipe3 = Pipeline([('sc', StandardScaler()), ('pca', PCA()),  ("perceptron", Perceptron(penalty="l2"))])
pipe4 = Pipeline([('sc', StandardScaler()), ('pca', PCA()),  ("GussianNB", GaussianNB())])
pipe5 = Pipeline([('sc', StandardScaler()), ('pca', PCA()),  ("LogisticRegression", LogisticRegression())])
pipe1.fit(x_train, y_train)
pipe2.fit(x_train,y_train)
pipe3.fit(x_train,y_train)
pipe4.fit(x_train,y_train)
pipe5.fit(x_train,y_train)

In [16]:
print("Trainimg Accuracy with SVM", pipe1.score(x_train, y_train))
print("Testing Accuracy with SVM ", pipe1.score(x_test,y_test))
print("\n")
print("Trainimg Accuracy with Decision TREE", pipe2.score(x_train, y_train))
print("Testing Accuracy with Decision TREE", pipe2.score(x_test,y_test))
print("\n")
print("Trainimg Accuracy with PERCEPTRON", pipe3.score(x_train, y_train))
print("Testing Accuracy with PERCEPTRON", pipe3.score(x_test,y_test))
print("\n")
print("Trainimg Accuracy with GussianNB", pipe4.score(x_train, y_train))
print("Testing Accuracy with GussianNB", pipe4.score(x_test,y_test))
print("\n")
print("Trainimg Accuracy with Logistic Regression", pipe4.score(x_train, y_train))
print("Testing Accuracy with Logistic Regression", pipe4.score(x_test,y_test))

Trainimg Accuracy with SVM 1.0
Testing Accuracy with SVM  0.9922779922779923


Trainimg Accuracy with Decision TREE 0.9867549668874173
Testing Accuracy with Decision TREE 0.9498069498069498


Trainimg Accuracy with PERCEPTRON 1.0
Testing Accuracy with PERCEPTRON 1.0


Trainimg Accuracy with GussianNB 0.9470198675496688
Testing Accuracy with GussianNB 0.9343629343629344


Trainimg Accuracy with Logistic Regression 0.9470198675496688
Testing Accuracy with Logistic Regression 0.9343629343629344


In [17]:
from sklearn.model_selection import cross_validate
import numpy as np

In [18]:


for i in range(2,10):
    print(i, end=" " )
    cv = cross_validate(pipe1, data, target, cv=i)
    print(np.average(cv['test_score']))
# for i in range(2,10):
#     print(i)
#     cv = cross_validate(pipe2, data, target, cv=i)
#     print(np.average(cv['test_score']))

2 0.9907353699407064
3 0.9895752677764872
4 0.9895725667527993
5 0.9907380024196801
6 0.9907326469826471
7 0.9918980180585216
8 0.9918873312564902
9 0.9907285575048734


In [19]:
cv = cross_validate(pipe1, data, target, cv=10)
print(cv, end=" ") #fit_time, test_time, score_time

{'fit_time': array([0.01709175, 0.02330279, 0.02458286, 0.02269816, 0.0221405 ,
       0.02254295, 0.02117777, 0.02416801, 0.02253604, 0.01950693]), 'score_time': array([0.00299788, 0.00400043, 0.00400066, 0.00300026, 0.00299931,
       0.00300217, 0.00299954, 0.00299931, 0.00299788, 0.00299525]), 'test_score': array([0.96551724, 1.        , 0.98850575, 0.97674419, 1.        ,
       1.        , 1.        , 0.98837209, 1.        , 0.98837209])} 

In [20]:
print(cv['test_score'], end=" ")

[0.96551724 1.         0.98850575 0.97674419 1.         1.
 1.         0.98837209 1.         0.98837209] 

In [21]:
print(np.average(cv['test_score']))

0.9907511360598772


## Hyper parameter Tunning

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
logistic_grid_values = {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10, 100]}
svm_grid_values = {'kernel': ['linear', 'poly'], 'C': [0.01, 0.1, 1, 10, 100]}

In [24]:
gs1 = GridSearchCV(LogisticRegression(), param_grid=logistic_grid_values, scoring='accuracy', cv=2)
gs2 = GridSearchCV(SVC(), param_grid=svm_grid_values, scoring='accuracy', cv=2)

In [25]:
gs1.fit(data,target)
gs2.fit(data,target)

In [26]:
print("Logistic Regression Best parameter", gs1.best_params_)
print("SVM Best parameter",gs2.best_params_)
print("Logistic Regression Best Score",gs1.best_score_)
print("SVM Best Score", gs2.best_score_)

Logistic Regression Best parameter {'C': 10, 'penalty': 'l2'}
SVM Best parameter {'C': 1, 'kernel': 'linear'}
Logistic Regression Best Score 0.9953623141703188
SVM Best Score 0.9988425925925926
