# Task 3

In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, auc, roc_auc_score
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('https://raw.githubusercontent.com/s-a-nersisyan/HSE_bioinformatics_2021/master/seminar15/BRCA_pam50.tsv', index_col = 0, sep = '\t')
df1 = df.loc[(df['Subtype'] == 'Luminal A') | (df['Subtype'] == 'Luminal B')]

X1 = df1.iloc[:,:-1].to_numpy()
y1 = df1['Subtype'].to_numpy()
clf = SVC(kernel = 'linear', class_weight = 'balanced')

def classification(X, y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 5, stratify = y)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    arr = confusion_matrix(y_test, y_pred)
    print('Accuracy score: ', accuracy_score(y_test, y_pred))
    print('Confusion matrix: ', arr)
    print('True Positive Rate = ', (arr[1,1]/(arr[1,1] + arr[1,0])))
    print('True Negative Rate = ', (arr[0,0]/(arr[0,0] + arr[0,1])))
    
    le = LabelEncoder() #otherwise roc auc score cannot be counted
    y_pred = le.fit_transform(y_pred)
    y_test = le.fit_transform(y_test)
    
    print('ROC AUC = ', (roc_auc_score(y_test, y_pred)))
    
classification(X1, y1)

Accuracy score:  0.9583333333333334
Confusion matrix:  [[79  4]
 [ 1 36]]
True Positive Rate =  0.972972972972973
True Negative Rate =  0.9518072289156626
ROC AUC =  0.9623901009443179


In [2]:
#counting coefficients

df1 = df.iloc[:, :50]
df1.loc['Weights'] = clf.coef_[0]

In [3]:
df2 = df1.sort_values(by = 'Weights', axis = 1, ascending = False).iloc[:,:5]
df2.loc['Weights'] = np.abs(df2.loc['Weights'])

print(df2.columns)
df2 = pd.concat([df2, df['Subtype']], axis = 1)

Index(['UBE2T', 'UBE2C', 'RRM2', 'MELK', 'MYBL2'], dtype='object')


In [4]:
df2 = df2.iloc[:-1, :]
df2 = df2.loc[(df2['Subtype'] == 'Luminal A') | (df2['Subtype'] == 'Luminal B')]

X2 = df2.iloc[:,:5].to_numpy()
y2 = df2['Subtype'].to_numpy()

classification(X2, y2)

Accuracy score:  0.8583333333333333
Confusion matrix:  [[68 15]
 [ 2 35]]
True Positive Rate =  0.9459459459459459
True Negative Rate =  0.8192771084337349
ROC AUC =  0.8826115271898405
