In [1]:
from itertools import combinations
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [2]:
# Read the TCGA info-with-grade CSV into a DataFrame; the bare trailing
# expression makes the notebook render a preview of the loaded table.
df = pd.read_csv(filepath_or_buffer="datasets/TCGA_InfoWithGrade.csv")
df

Unnamed: 0,Grade,Gender,Age_at_diagnosis,Race,IDH1,TP53,ATRX,PTEN,EGFR,CIC,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
0,0,0,51.30,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,38.72,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,35.17,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,32.78,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,31.51,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,1,1,77.89,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
835,1,0,85.18,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
836,1,1,77.49,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
837,1,0,63.33,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [3]:
# Separate the target from the predictors: `y` is an independent one-column
# frame holding "Grade", and `X` is every remaining column of `df`.
y = df["Grade"].to_frame().copy()
X = df.drop(columns="Grade")

In [4]:
# Hold out 20% of the rows for evaluation. The label frame is flattened to a
# 1-D array first, and the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.values.ravel(),
    random_state=42,
    test_size=0.2,
)

In [5]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test rows never influence the scaling.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [47]:
# Base estimators. Every estimator that accepts a random_state shares the same
# seed so repeated runs of this cell give the same scores.
seed = 42
lr = LogisticRegression(random_state=seed)
# probability=True makes SVC expose class probabilities, which the
# soft-voting ensembles below are configured to use.
svm = SVC(random_state=seed, probability=True)
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=seed)
adaboost = AdaBoostClassifier(random_state=seed)

classifiers = [('lr', lr), ('svm', svm), ('knn', knn), ('rf', rf), ('adaboost', adaboost)]

# One soft-voting ensemble for every combination of at least three base
# classifiers, up to and including the ensemble that uses all of them.
voting_classifiers = [
    VotingClassifier(estimators=list(subset), voting='soft')
    for n in range(3, len(classifiers) + 1)
    for subset in combinations(classifiers, n)
]

# Collect one record per ensemble and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
result_columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
result_records = []

for vc in voting_classifiers:
    vc.fit(X_train, y_train)
    y_pred = vc.predict(X_test)
    result_records.append({
        # Label each ensemble by the short names of its member estimators.
        'Classifier': ', '.join(name for name, _ in vc.estimators),
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
    })

# Passing `columns` keeps the expected header even if no ensemble was evaluated.
results_df = pd.DataFrame(result_records, columns=result_columns)



In [51]:
# Display the per-ensemble metrics table with floats formatted at precision 3.
results_df.style.format(precision=3)

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1 Score
0,"lr, svm, knn",0.857,0.816,0.899,0.855
1,"lr, svm, rf",0.875,0.83,0.924,0.874
2,"lr, svm, adaboost",0.863,0.818,0.911,0.862
3,"lr, knn, rf",0.869,0.828,0.911,0.867
4,"lr, knn, adaboost",0.857,0.816,0.899,0.855
5,"lr, rf, adaboost",0.869,0.82,0.924,0.869
6,"svm, knn, rf",0.857,0.831,0.873,0.852
7,"svm, knn, adaboost",0.851,0.814,0.886,0.848
8,"svm, rf, adaboost",0.857,0.816,0.899,0.855
9,"knn, rf, adaboost",0.851,0.829,0.861,0.845
