In [5]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif

from multiprocessing import Pool

import warnings
# Install a global "ignore" filter: with no category or module argument this
# silences every warning raised for the rest of the kernel session.
# NOTE(review): this also hides diagnostics from the model fits in later cells
# (e.g. solver convergence or deprecation warnings) — consider narrowing the
# filter to specific categories once the analysis is stable.
warnings.filterwarnings('ignore')

import plotly.graph_objects as go

In [6]:
# Number of top-scoring features kept by the univariate (ANOVA F-test) filter.
N_SELECTED_FEATURES = 3422

# Load the matrix and transpose it so that every row is one sample. After the
# transpose the former header sits in the index; reset_index() moves it into the
# first column, and the first row then holds the feature identifiers.
brca = pd.read_csv("BRCA_original.tsv", sep = "\t").T.reset_index()

# Promote the first row to column labels. The first column becomes 'Ensembl_ID'
# and contains one sample identifier per row.
brca.columns = brca.iloc[0]

# Binary target: 1 when the sample identifier ends in "11A", 0 otherwise.
# NOTE(review): presumably these are TCGA barcodes in which sample-type code "11"
# marks normal tissue; identifiers with a different vial letter (e.g. "11B")
# are labelled 0 by this rule — confirm that this is intended.
brca['Y'] = brca['Ensembl_ID'].apply(lambda x:1 if x[-3:] == "11A" else 0)

# Row 0 is the former header row, not a sample.
brca = brca.drop([0])

# Build the feature-only view once; it is needed both for the numeric matrix and
# for mapping the selector's mask back to column names.
feature_frame = brca.drop(columns = ['Ensembl_ID', 'Y'])

y = brca['Y'].to_numpy()
# Fail fast when the labelling rule did not produce both classes: every model
# fitted afterwards would be meaningless.
if set(y.tolist()) != {0, 1}:
    raise ValueError("Expected both classes 0 and 1 in y; check the sample identifiers in 'Ensembl_ID'.")

# The transposed frame is object-typed, so request floats explicitly instead of
# relying on the scaler to convert the array.
X = feature_frame.to_numpy(dtype = float)
scaler = StandardScaler()
X = scaler.fit_transform(X)
selector = SelectKBest(f_classif, k=N_SELECTED_FEATURES).fit(X, y)
X = selector.transform(X)

# Column names of the features that survived the filter, in matrix order.
selected_columns = feature_frame.columns[selector.get_support()].to_list()

# Lookup table used later to attach gene names to feature ids (merged on 'id').
gene_name_df = pd.read_csv("brca_gene_name.txt", sep = "\t")

In [7]:
# Fit a single class-weighted decision tree on the selected features; only its
# feature importances are used in this cell.
clf = DecisionTreeClassifier(random_state = 42, class_weight = 'balanced')
clf.fit(X, y)

# Read the importances once, keep the features with a strictly positive score,
# and order them from most to least important.
dt_importances = clf.feature_importances_
features_from_decision_tree = sorted(
    ((feature_id, score) for feature_id, score in zip(selected_columns, dt_importances) if score > 0),
    key = lambda pair: -pair[1],
)

dt_df = pd.DataFrame(features_from_decision_tree, columns = ['id', 'importance_score'])
# Left join: features without an entry in gene_name_df are kept.
dt_df = pd.merge(dt_df, gene_name_df, how = 'left', on = 'id')
dt_df.to_csv("decision_tree_importance_gene_brca.csv")

In [8]:
# Fit an AdaBoost ensemble (1000 boosting rounds, fixed seed) on the selected
# features; only its feature importances are used in this cell.
clf = AdaBoostClassifier(n_estimators = 1000, random_state = 42)
clf.fit(X, y)

# Read the importances once, keep the features with a strictly positive score,
# and order them from most to least important.
adaboost_importances = clf.feature_importances_
features_from_adaboost = sorted(
    ((feature_id, score) for feature_id, score in zip(selected_columns, adaboost_importances) if score > 0),
    key = lambda pair: -pair[1],
)

adaboost_df = pd.DataFrame(features_from_adaboost, columns = ['id', 'importance_score'])
# Left join: features without an entry in gene_name_df are kept.
adaboost_df = pd.merge(adaboost_df, gene_name_df, how = 'left', on = 'id')
adaboost_df.to_csv("adaboost_importance_gene_brca.csv")

In [32]:
# Gene names present in both importance tables. Both tables come from a left
# merge, so rows without a matching gene name hold a missing value; drop those
# first so a missing value can never be reported as a shared gene.
set(adaboost_df['gene'].dropna()) & set(dt_df['gene'].dropna())

{'MAGED1'}

In [9]:
# Fit a logistic-regression model (library defaults, fixed seed) on the selected
# features, then display the fitted coefficient array as the cell output.
clf = LogisticRegression(random_state=42).fit(X, y)
clf.coef_

array([[-0.00649354,  0.00291677, -0.00286064, ...,  0.00172979,
         0.00504998,  0.0116035 ]])