In [5]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, plot_roc_curve, balanced_accuracy_score, accuracy_score
from sklearn.linear_model import LogisticRegression, LassoCV, LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/s-a-nersisyan/HSE_bioinformatics_2021/master/seminar18/BRCA_pam50.tsv', sep = '\t', index_col = 0)
df = df.loc[(df['Subtype'] == 'Luminal A') | (df['Subtype'] == 'Luminal B')] #оставляем только нужные типы 
df

Unnamed: 0,UBE2T,BIRC5,NUF2,CDC6,CCNB1,TYMS,MYBL2,CEP55,MELK,NDC80,...,NAT1,FOXA1,BLVRA,MMP11,GPR160,FGFR4,GRB7,TMEM45B,ERBB2,Subtype
TCGA-A1-A0SD-01A,3.016158,2.514871,1.509129,1.354016,3.685035,2.125453,3.047474,2.333704,1.996853,1.171520,...,4.905805,6.677118,5.044878,5.631087,3.942169,3.404177,3.655783,2.757977,5.999041,Luminal A
TCGA-A1-A0SE-01A,4.626989,3.289047,2.102190,2.116975,4.525954,2.906470,3.798966,2.745027,2.451070,2.070499,...,6.053451,6.690114,5.843557,5.504526,3.881691,1.038253,3.156081,1.355837,5.585759,Luminal A
TCGA-A1-A0SF-01A,3.417782,2.262844,1.569301,1.800667,3.823694,2.843952,3.807282,2.141323,2.082400,1.693568,...,6.883599,6.280921,5.705410,3.511572,4.374803,2.299735,3.054917,1.315680,5.215699,Luminal A
TCGA-A1-A0SG-01A,2.948244,2.298689,1.116282,1.552142,3.582144,2.886328,2.697313,1.968233,1.662241,1.280947,...,1.814486,6.284076,4.622443,6.845350,3.381032,3.586285,2.845107,3.359571,5.786065,Luminal A
TCGA-A1-A0SH-01A,3.097068,2.659636,0.762293,1.326867,3.256827,2.353642,2.396729,2.103346,1.477519,1.173324,...,3.020659,6.237971,5.467481,6.715036,5.146230,4.216788,2.829156,2.732181,6.553140,Luminal A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-GM-A2DL-01A,4.279134,3.879382,2.518115,1.460530,4.977908,3.594853,4.385759,3.567513,3.161674,1.688465,...,3.005505,6.540118,5.935218,8.588987,3.979697,1.891074,1.991623,2.532752,4.186391,Luminal A
TCGA-GM-A2DM-01A,2.238649,1.804489,0.741287,1.041073,2.985514,1.839953,1.190147,1.380188,1.131089,0.952764,...,1.633269,7.083743,5.864256,2.892541,3.150784,0.017051,3.092225,2.646148,4.517829,Luminal A
TCGA-GM-A2DN-01A,4.172296,3.371600,2.965499,2.474122,3.908066,3.763788,3.240319,2.608317,2.470973,2.372499,...,4.685897,6.203535,4.365116,3.841470,3.653086,0.720639,2.811856,1.028073,4.976873,Luminal A
TCGA-GM-A2DO-01A,3.755078,4.027775,2.623690,2.253348,4.228186,3.913628,3.822350,2.746687,2.541812,2.905382,...,4.579657,4.756974,5.265363,4.559312,2.579594,0.848374,2.902589,1.118081,4.262969,Luminal B


In [3]:
X = df.iloc[:, :-1].to_numpy()
y = df.loc[:,"Subtype"].to_numpy()

In [9]:
#линейный SVM
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=17)
model = SVC(kernel="linear", class_weight='balanced') #линейное ядро
model.fit(X_train, y_train) #тренировка
y_pred = model.predict(X_test) #проверка тестовой выборки
print('The balanced accuracy score is', balanced_accuracy_score(y_test, y_pred))


The balanced accuracy score is 0.8862389193752638


In [12]:
#линейный SVM, построенный на топ-2 признаках
df1 = df.iloc[:, :-1]
df1.loc['Weights'] = model.coef_[0]
df1.loc['Weights'] = np.abs(df1.loc['Weights'])
df1 = df1.sort_values(by='Weights', axis=1, ascending=False).iloc[:-1, :2]
print(df1.columns)

X = df1.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=17)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(balanced_accuracy_score(y_pred, y_test))

Index(['BAG1', 'BIRC5'], dtype='object')
0.743820224719101


In [11]:
#логистическая регрессия
lr = LogisticRegression(class_weight = 'balanced', C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train, y_train)
print(lr.coef_)
y_pred = lr.predict(X_test)
print('The balanced accuracy score is', balanced_accuracy_score(y_pred, y_test))

[[ 0.          0.          0.          0.          0.          0.
   0.27089103  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.         -0.36141867  0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.        ]]
The balanced accuracy score is 0.7992673992673993


In [7]:
np.random.seed(17)
lr = LogisticRegression(penalty = 'none')

a = 0
for i in range(10000):
    
    X = np.random.normal(loc=0, size=(20, 2))
    y = np.array([0]*10 + [1]*10)
    lr.fit(X, y)

    y_pred = lr.predict(X)
    if accuracy_score(y, y_pred) == 1:
        a += 1

print(a / 10000)

0.0002
