# Multi-omics Enabled Sample Mislabeling Correction Challenge

This notebook uses a random forest classifier in an attempt to detect sample misclassifications.

Details about this challenge: https://precision.fda.gov/challenges

## Solution

Import libraries

In [717]:
import os
import sys
import getopt
import re
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn import preprocessing

Load data

In [1188]:
# Clinical annotations, indexed by sample name.
labels = pd.read_csv("challenge_data/train_cli.tsv", sep="\t", index_col="sample")
# Summary table that carries a per-sample `mismatch` flag (used to pick training rows).
misClassified = pd.read_csv("challenge_data/sum_tab_1.csv", sep=",")
# Protein matrix, transposed on load so that it can be indexed by sample name.
proteins = pd.read_csv("challenge_data/train_pro.tsv", sep="\t").T
# Fill gaps with each column's median, then discard any column that still
# contains missing values after that fill.
proteins = proteins.fillna(proteins.median()).dropna(axis='columns')

Select only rows which were correctly classified (matches) for machine learning

In [1189]:
# Keep only the samples whose `mismatch` flag is 0, i.e. labels we trust for training.
matches = list(misClassified.loc[misClassified["mismatch"] == 0, "sample"])
# Feature rows and label rows for exactly those samples, in the same order.
x, y = proteins.loc[matches], labels.loc[matches]

Classification function, any classifier can be supplied

In [1190]:
def _encode_labels(y_train, y_test, column):
    """Binary-encode one label column for both splits with a shared mapping.

    The encoder is fitted on the training labels only and then applied to the
    test labels, so a test split that happens to miss a class is still encoded
    with the same label -> integer mapping as the training split.
    """
    encoder = preprocessing.LabelBinarizer()
    train_codes = encoder.fit_transform(y_train.loc[:, column]).ravel()
    test_codes = encoder.transform(y_test.loc[:, column]).ravel()
    return train_codes, test_codes


def _fit_and_report(clf, title, x_train, x_test, target_train, target_test):
    """Fit `clf` on the training split and print its train and test accuracy."""
    clf.fit(x_train, target_train)
    print(title + " train accuracy:", accuracy_score(target_train, clf.predict(x_train)))
    print(title + " test accuracy:", accuracy_score(target_test, clf.predict(x_test)))


def classify(x, y, clf):
    """Train `clf` separately for gender and for MSI and print the accuracies.

    The data is split 70/30 with a fixed seed, so repeated calls use the same
    train and test rows.

    Parameters
    ----------
    x : feature table whose rows correspond to the rows of `y`.
    y : DataFrame with a "gender" and an "msi" column.
    clf : estimator exposing `fit` and `predict`. It is fitted in place, first
        on gender and then on MSI, so it is left fitted on MSI on return.

    Returns
    -------
    None. The four accuracy figures are printed.

    Notes
    -----
    The encoded labels are flattened with `ravel()`, which only yields one
    value per sample when each label column is binary.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, shuffle=True, random_state=100
    )
    # One independent model per target: gender first, then MSI.
    for title, column in (("Gender", "gender"), ("Msi", "msi")):
        target_train, target_test = _encode_labels(y_train, y_test, column)
        _fit_and_report(clf, title, x_train, x_test, target_train, target_test)

Train classifiers. TODO: Figure out best parameters for the forest


### SVM

* It seems that a high penalty needs to be set for the SVM; otherwise it assigns the more frequent label (female and low MSI) to everything.

In [1191]:
# High C on purpose: see the note above about the majority-label collapse.
classify(
    x,
    y,
    SVC(
        C=100,
        kernel="rbf",
        gamma="scale",
        probability=True,
    ),
)

('Gender train accuracy:', 1.0)
('Gender test accuracy:', 0.6190476190476191)
('Msi train accuracy:', 1.0)
('Msi train accuracy:', 1.0)


### Random Forest

In [1192]:
# Seed the forest so the reported accuracies are reproducible across runs
# (same seed value as the train/test split inside classify).
classify(x, y, RandomForestClassifier(n_estimators=10, random_state=100))

('Gender train accuracy:', 0.9787234042553191)
('Gender test accuracy:', 0.5238095238095238)
('Msi train accuracy:', 1.0)
('Msi train accuracy:', 0.9523809523809523)


### How to Combine?

Msi seems to be much better indicator than gender. How do we take this into account?

* MSI does not match --> Mismatch label, no matter what gender says
* MSI matching, gender mismatch - what do we do?
* I propose to calculate a confidence score and use it in gender.


### Confidence Score
* Instead of reporting just label, show model's confidence score. This can help us decide in case of gender misclassification

In [1193]:
clf = SVC(C=100, kernel="rbf", gamma="scale", probability=True)
# Alternative model to try:
#clf = RandomForestClassifier(n_estimators=100)

# Same split parameters as classify(), so the rows match the scores reported above.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=True, random_state=100)

# Define the encoder here so this cell runs on a fresh kernel. It is fitted on
# the training labels only and then applied to the test labels, so both splits
# share one label -> integer mapping.
lb = preprocessing.LabelBinarizer()
y_gender_train = lb.fit_transform(y_train.loc[:,"gender"]).ravel()
y_gender_test = lb.transform(y_test.loc[:,"gender"]).ravel()
y_msi_train = lb.fit_transform(y_train.loc[:,"msi"]).ravel()
y_msi_test = lb.transform(y_test.loc[:,"msi"]).ravel()

# NOTE: with probability=True, SVC.predict_proba comes from a separate
# calibration step and may disagree with SVC.predict on some samples.
clf.fit(x_train, y_gender_train)
y_gender_predict = clf.predict(x_test)
print("Gender test accuracy:", accuracy_score(y_gender_test, y_gender_predict))
probs = clf.predict_proba(x_test)
# One line per test sample: class probabilities next to the true encoded label.
for sample_probs, true_label in zip(probs, y_gender_test):
    print(sample_probs, true_label)
print()

clf.fit(x_train, y_msi_train)
y_msi_predict = clf.predict(x_test)
print("Msi test accuracy:", accuracy_score(y_msi_test, y_msi_predict))
probs = clf.predict_proba(x_test)
for sample_probs, true_label in zip(probs, y_msi_test):
    print(sample_probs, true_label)

('Gender test accuracy:', 0.6190476190476191)
(array([0.68390427, 0.31609573]), 1)
(array([0.68974806, 0.31025194]), 0)
(array([0.68500562, 0.31499438]), 1)
(array([0.68634387, 0.31365613]), 0)
(array([0.65061566, 0.34938434]), 1)
(array([0.55734399, 0.44265601]), 1)
(array([0.7734738, 0.2265262]), 1)
(array([0.67698393, 0.32301607]), 0)
(array([0.6724305, 0.3275695]), 1)
(array([0.71805197, 0.28194803]), 0)
(array([0.74711107, 0.25288893]), 1)
(array([0.68475497, 0.31524503]), 0)
(array([0.76718322, 0.23281678]), 0)
(array([0.66820209, 0.33179791]), 0)
(array([0.72051824, 0.27948176]), 0)
(array([0.66275485, 0.33724515]), 0)
(array([0.80011816, 0.19988184]), 1)
(array([0.67741045, 0.32258955]), 1)
(array([0.73041421, 0.26958579]), 0)
(array([0.61357665, 0.38642335]), 1)
(array([0.72724355, 0.27275645]), 0)
()
('Msi test accuracy:', 1.0)
(array([0.14827599, 0.85172401]), 1)
(array([0.03930947, 0.96069053]), 1)
(array([0.15016489, 0.84983511]), 1)
(array([0.08736752, 0.91263248]), 1)
(a