In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, roc_curve
from sklearn.ensemble import RandomForestClassifier

Import data with the regions with already known oncogenes, get rid of unnecessary columns

In [2]:
all_data = pd.read_csv("../../Data/Processed_data/Selection_of_regions_for_ML.csv")

In [3]:
data = all_data.iloc[:, 8:]

Overall look of the table

In [4]:
data

Unnamed: 0,is_oncogene,rank_expr_fold_change,rank_TF,rank_median.CRISPR,rank_mean.CRISPR,rank_min.CRISPR,rank_kinase,rank_pubmed_mean,rank_GO_terms,rank_PPI,SUM_RANK
0,0,0.21,0.5,0.05,0.05,0.26,0.53,0.39,0.50,0.50,2.99
1,0,0.63,0.5,0.16,0.16,0.11,0.53,0.26,0.50,0.50,3.35
2,0,0.32,0.5,0.11,0.11,0.05,0.53,0.26,0.50,0.50,2.88
3,0,0.16,0.5,0.58,0.58,0.79,0.53,1.00,0.50,0.50,5.14
4,0,0.47,0.5,0.68,0.68,0.53,0.53,0.74,0.50,0.50,5.13
...,...,...,...,...,...,...,...,...,...,...,...
3752,0,0.56,1.0,0.33,0.44,0.56,0.50,0.78,0.56,0.56,5.29
3753,0,0.67,0.5,0.78,0.78,0.78,0.50,0.50,0.56,0.56,5.63
3754,0,0.78,0.5,0.44,0.33,0.33,0.50,0.33,0.56,0.56,4.33
3755,0,0.89,0.5,0.89,0.89,0.89,0.50,0.50,0.56,0.56,6.18


Target

In [5]:
all_data[["is_oncogene"]]

Unnamed: 0,is_oncogene
0,0
1,0
2,0
3,0
4,0
...,...
3752,0
3753,0
3754,0
3755,0


Some columns have non-numerical type

Total amount of row with NA out of total is high, but still lets remove them for the simplicity

In [6]:
data.isna().any(axis=1).sum()

0

In [7]:
data.shape

(3757, 11)

Split data into train and test (cnsider using stratify)

In [8]:
my_y = data[['is_oncogene']].values.astype("int8")

In [9]:
my_x = data.drop('is_oncogene', axis=1).values

In [10]:
my_x

array([[0.21, 0.5 , 0.05, ..., 0.5 , 0.5 , 2.99],
       [0.63, 0.5 , 0.16, ..., 0.5 , 0.5 , 3.35],
       [0.32, 0.5 , 0.11, ..., 0.5 , 0.5 , 2.88],
       ...,
       [0.78, 0.5 , 0.44, ..., 0.56, 0.56, 4.33],
       [0.89, 0.5 , 0.89, ..., 0.56, 0.56, 6.18],
       [1.  , 0.5 , 1.  , ..., 0.56, 0.56, 6.29]])

In [11]:
my_y = np.ravel(my_y)

In [15]:
import xgboost
import catboost

from sklearn.ensemble import (ExtraTreesClassifier, VotingClassifier)
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score

In [13]:
SEED = 42
rf = RandomForestClassifier(n_estimators=300, random_state=SEED)
etc = ExtraTreesClassifier(n_estimators=300, random_state=SEED)
cat = catboost.CatBoostClassifier(verbose=0, random_seed=SEED)
xgb_rf = xgboost.XGBRFClassifier(random_state=SEED)
nb = GaussianNB()

In [16]:
base_models_new = [("RF", rf), ("ETC", etc), ("CAT", cat), ("XGB_RF", xgb_rf),
                   ("NB", nb)]

cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=SEED)
voting_soft = VotingClassifier(base_models_new, voting='soft')
score = cross_val_score(voting_soft, my_x, my_y, cv=cv, scoring="recall", n_jobs=-1)

In [17]:
score.mean()

0.8305258859684942

In [18]:
X_train, X_test, y_train, y_test = train_test_split(my_x, my_y, test_size=0.30, random_state=SEED) 

In [19]:
voting_soft.fit(X_train, y_train)

In [20]:
y_pred = voting_soft.predict(X_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy (how many objects got correct labels) is \n{accuracy}")
print()
precision = precision_score(y_test, y_pred)
print(f"Precision (how many oncogenes are actuall oncognes within predicted ones) is \n{precision}")
print()
recall = recall_score(y_test, y_pred)
print(f"Recall (how many oncogenes out of all oncogenes we got right) is \n{recall}")
print()
f1_score_result = f1_score(y_test, y_pred)
print(f"F1_score is \n{f1_score_result}")
print()
roc_auc = roc_auc_score(y_test, voting_soft.predict_proba(X_test)[:,1])
print(f"ROC-AUC (performance of model over multiple thresholds in our predicted probabilities) is \n{roc_auc}")

Accuracy (how many objects got correct labels) is 
0.9858156028368794

Precision (how many oncogenes are actuall oncognes within predicted ones) is 
0.9411764705882353

Recall (how many oncogenes out of all oncogenes we got right) is 
0.8421052631578947

F1_score is 
0.8888888888888888

ROC-AUC (performance of model over multiple thresholds in our predicted probabilities) is 
0.9911822093255954


In [22]:
new_data = pd.read_csv("../../Data/Processed_data/Selection_of_regions_without_known_oncognes.csv")

In [23]:
new_data

Unnamed: 0.1,Unnamed: 0,sample,chr,startpos,endpos,nMajor,region_length,gene_name,rank_expr_fold_change,rank_TF,rank_median.CRISPR,rank_mean.CRISPR,rank_min.CRISPR,rank_kinase,rank_pubmed_mean,rank_GO_terms,rank_PPI,SUM_RANK,group_num
0,1,TCGA-85-A4JB,8,31254,1060634,20,1029380,OR4F21,0.14,0.50,0.86,0.86,0.86,0.57,0.21,0.57,0.57,5.14,785
1,2,TCGA-85-A4JB,8,31254,1060634,20,1029380,RP11-585F1.10,1.00,0.50,1.00,1.00,1.00,0.57,0.21,0.57,0.57,6.42,785
2,3,TCGA-85-A4JB,8,31254,1060634,20,1029380,ZNF596,0.86,1.00,0.43,0.43,0.43,0.57,0.57,0.57,0.57,5.43,785
3,4,TCGA-85-A4JB,8,31254,1060634,20,1029380,FBXO25,0.64,0.50,0.64,0.64,0.64,0.57,0.93,0.57,0.57,5.70,785
4,5,TCGA-E9-A1N9,8,31254,627181,17,595927,OR4F21,0.17,0.50,0.83,0.83,0.83,0.58,0.25,0.58,0.58,5.15,2468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20954,20955,TCGA-ER-A197,1,242566798,243006007,19,439209,PLD5,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,9.00,2594
20955,20956,TCGA-DX-AB2H,1,244460667,244920820,15,460153,C1orf100,0.50,0.62,0.50,0.50,0.50,0.62,0.50,0.62,0.62,4.98,2321
20956,20957,TCGA-DX-AB2H,1,244460667,244920820,15,460153,ADSS,0.75,0.62,0.75,0.75,0.75,0.62,0.75,0.62,0.62,6.23,2321
20957,20958,TCGA-DX-AB2H,1,244460667,244920820,15,460153,C1orf101,0.25,0.62,1.00,1.00,1.00,0.62,0.25,0.62,0.62,5.98,2321


In [24]:
data_to_predict = new_data.iloc[:, 8:18].values

In [25]:
my_pred_proba = voting_soft.predict_proba(data_to_predict)

In [26]:
first_values = my_pred_proba[:, 1]

In [27]:
new_data['is_onco_prediction_proba_ensemble'] = first_values

In [28]:
gene_list_proba_09 = new_data[new_data['is_onco_prediction_proba_ensemble'] >= 0.9][['gene_name', 'is_onco_prediction_proba_ensemble']]

In [29]:
gene_list_proba_09.dropna(inplace = True)

In [30]:
gene_list_proba_09.to_csv('Prediction_by_ensemble.csv', index=False)