In [1]:
# In case you do not have following packages installed, uncomment instalisation.

import pandas as pd
import numpy as np
import os
import glob
import functools
from pathlib import Path
import matplotlib.pyplot as plt

#!pip install dask[complete];
# you need to run these in case dask gives you error, it might need update.
#!pip install --upgrade pandas "dask[complete]"
#python -m pip install "dask[dataframe]" --upgrade
#import dask.dataframe as dd

from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, accuracy_score
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_recall_curve

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.inspection import permutation_importance       

from sklearn.feature_selection import SelectFromModel
from sklearn.utils import class_weight

#!pip3 install xgboost
#from xgboost import XGBClassifier

#!pip install conorm
#import conorm # for tmm normalisation

#!pip3 install pydeseq2 or pip install pydeseq2
#from pydeseq2.dds import DeseqDataSet
#from pydeseq2.ds import DeseqStats
#from pydeseq2.utils import load_example_data



#to install R :
#conda install -c r r-irkernel

#to install a library from R
#!pip install library edgeR
# pip install rpy2

### Data for Imaging of Parkinson ###

In [2]:
path3 = Path("/home/znazari/data/open_proteomic/")


In [38]:
ensemble = pd.read_csv(path3/'Ensemble_GeneID_TranscriptID_Symbol_mart_export.txt',delimiter='\t')
gin = ensemble[['Gene name','Gene stable ID']]
gin

Unnamed: 0,Gene name,Gene stable ID
0,MT-TF,ENSG00000210049
1,MT-RNR1,ENSG00000211459
2,MT-TV,ENSG00000210077
3,MT-RNR2,ENSG00000210082
4,MT-TL1,ENSG00000209082
...,...,...
274613,,ENSG00000284882
274614,,ENSG00000284882
274615,,ENSG00000289881
274616,,ENSG00000289881


In [35]:
ensemble

Unnamed: 0,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Gene name,Transcript name,HGNC symbol
0,ENSG00000210049,ENSG00000210049.1,ENST00000387314,ENST00000387314.1,MT-TF,MT-TF-201,MT-TF
1,ENSG00000211459,ENSG00000211459.2,ENST00000389680,ENST00000389680.2,MT-RNR1,MT-RNR1-201,MT-RNR1
2,ENSG00000210077,ENSG00000210077.1,ENST00000387342,ENST00000387342.1,MT-TV,MT-TV-201,MT-TV
3,ENSG00000210082,ENSG00000210082.2,ENST00000387347,ENST00000387347.2,MT-RNR2,MT-RNR2-201,MT-RNR2
4,ENSG00000209082,ENSG00000209082.1,ENST00000386347,ENST00000386347.1,MT-TL1,MT-TL1-201,MT-TL1
...,...,...,...,...,...,...,...
274613,ENSG00000284882,ENSG00000284882.2,ENST00000646567,ENST00000646567.2,,,
274614,ENSG00000284882,ENSG00000284882.2,ENST00000644618,ENST00000644618.1,,,
274615,ENSG00000289881,ENSG00000289881.1,ENST00000701339,ENST00000701339.1,,,
274616,ENSG00000289881,ENSG00000289881.1,ENST00000701824,ENST00000701824.1,,,


In [4]:
proteomic_annotation = pd.read_csv(path3/"PPMI_Project_151_pqtl_Analysis_Annotations_20210210.csv",delimiter=',')
symb = proteomic_annotation[['SOMA_SEQ_ID','TARGET_GENE_SYMBOL']]
symb

Unnamed: 0,SOMA_SEQ_ID,TARGET_GENE_SYMBOL
0,10000-28_3,CRYBB2
1,10001-7_3,RAF1
2,10003-15_3,ZNF41
3,10006-25_3,ELK1
4,10008-43_3,GUCA1A
...,...,...
5188,9993-11_3,ZNF264
5189,9994-217_3,ATP4B
5190,9995-6_3,DUT
5191,9997-12_3,UBXN4


In [5]:
# Specify the base file name and path
base_file_name = "Project_151_pQTL_in_CSF_{}_of_7_Batch_Corrected_.csv"

# Number of files
num_files = 7

# List to store DataFrames
dfs = []

# Loop through the file indices and read each file
for file_index in range(1, num_files + 1):
    file_name = base_file_name.format(file_index)
    file_path = path3 / file_name
    
    # Check if the file exists before attempting to read it
    if file_path.is_file():
        # Read the CSV file and append it to the list
        df = pd.read_csv(file_path, delimiter=',')
        dfs.append(df)
    else:
        print(f"File {file_name} not found.")


In [8]:
# Concatenate all DataFrames into a single DataFrame
result_df = pd.concat(dfs, ignore_index=True)

# Filter out patients diagnosed as Prodromal
result_df = result_df[result_df['COHORT'] != 'Prodromal']

# Extract the two columns TESTNAME and PATNO
result_subset = result_df[['TESTNAME', 'PATNO']]

# Pivot the DataFrame to get the desired format
#result_pivot = result_subset.pivot_table(index='TESTNAME', columns='PATNO', aggfunc=len, fill_value=0)


In [12]:
unique_result_subset = result_subset.drop_duplicates(subset=['TESTNAME'])

In [19]:
#IMPORTANT TESTNAME: are the main names associated with CSF:
unique_result_subset.shape

(4785, 2)

In [25]:
common_values_symb = symb[symb['SOMA_SEQ_ID'].isin(unique_result_subset['TESTNAME'])]


In [27]:
symb_gene= common_values_symb.drop_duplicates(subset=['SOMA_SEQ_ID'])

In [30]:
symb_genesymb= symb_gene.reset_index(drop=True)

In [31]:
symb_genesymb

Unnamed: 0,SOMA_SEQ_ID,TARGET_GENE_SYMBOL
0,10000-28_3,CRYBB2
1,10001-7_3,RAF1
2,10003-15_3,ZNF41
3,10006-25_3,ELK1
4,10008-43_3,GUCA1A
...,...,...
4780,9993-11_3,ZNF264
4781,9994-217_3,ATP4B
4782,9995-6_3,DUT
4783,9997-12_3,UBXN4


In [40]:
gene_ensmble_genesyb = gin[gin['Gene name'].isin(symb_genesymb['TARGET_GENE_SYMBOL'])]


In [41]:
gene_ensmble_genesyb.shape

(42633, 2)

In [44]:
unique_ensg = gene_ensmble_genesyb.drop_duplicates(subset=['Gene name']).reset_index(drop=True)

In [45]:
unique_ensg

Unnamed: 0,Gene name,Gene stable ID
0,INTS3,ENSG00000262826
1,KIR3DL2,ENSG00000273735
2,KIR3DL3,ENSG00000276433
3,PTCHD3,ENSG00000276595
4,KIR2DL4,ENSG00000276779
...,...,...
3996,BTG2,ENSG00000159388
3997,KCNAB2,ENSG00000069424
3998,NUP210L,ENSG00000143552
3999,NTRK1,ENSG00000198400


In [46]:
# Loading the already preprocessed data:

path2 = Path("/home/znazari/data") # where the output data will be saved at the end.

# with all the filtered genes:

ir3_rna_step_vst =  pd.read_csv(path2/'mydata_Log_CPM_filtered_bact_sex_effect_removed_RIN_covariate.txt',delimiter='\t' )

diagnosis = pd.read_csv(path2/'patients_HC_PK_diagnosis.csv')

# mapping diagnosis to zero and one.
diagnosis['COHORT_DEFINITION'] = diagnosis['COHORT_DEFINITION'].map({'Healthy Control': 0, "Parkinson's Disease": 1})

# X: feature matrix, y: the target variable

X=ir3_rna_step_vst.T
y=diagnosis['COHORT_DEFINITION']

In [49]:
ir3_rna_step_vst

Unnamed: 0,3000,3001,3002,3003,3004,3006,3007,3008,3009,3010,...,4121,4122,4123,4124,4125,4126,4135,4136,4139,41410
ENSG00000000419,3.845119,5.213262,4.837730,5.101270,4.974751,4.825708,5.011214,5.229956,5.174531,5.284571,...,4.451699,5.408345,5.247058,4.584573,4.398598,5.449524,5.015613,3.987076,5.496642,4.963686
ENSG00000000457,5.648852,6.202708,5.624314,6.067816,6.092646,6.223112,6.003440,6.010872,6.272480,6.285266,...,5.863215,6.422896,6.368654,6.114092,5.798945,6.383039,6.290690,5.179048,6.293745,6.058590
ENSG00000000460,3.817007,4.095830,4.012751,4.262379,3.969580,4.374808,4.417553,4.331596,4.675491,4.348990,...,3.957820,4.518421,4.424952,4.145217,4.194468,4.628795,4.747158,3.569694,4.025577,4.268701
ENSG00000000938,8.463275,8.810743,8.368607,8.466923,8.927912,9.043402,8.681277,8.235545,8.067889,8.631065,...,8.416943,9.104202,8.838484,9.193292,8.217039,8.772340,8.497247,8.612951,8.695999,8.676493
ENSG00000000971,1.944928,3.384911,2.522713,4.085667,3.693875,2.544924,3.968554,4.085922,4.811027,4.093739,...,2.209538,2.390307,3.974330,2.196031,2.196303,3.910535,4.403068,2.359305,2.445675,3.267041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000285219,2.721748,1.638440,3.771521,1.812421,2.549122,2.058582,2.125234,1.765596,2.069582,1.764143,...,2.495753,1.830501,1.218611,2.165035,3.126425,1.943526,0.648852,2.959793,2.179546,2.136740
ENSG00000285230,2.322872,2.517009,2.599599,2.386956,2.851771,2.737816,2.894924,2.388526,2.458358,2.695723,...,2.169890,2.481177,2.779666,2.643108,2.415556,2.438769,2.464313,2.474065,2.487454,2.544368
ENSG00000285280,4.541140,1.709681,2.196385,2.727775,2.760335,2.204436,2.655007,2.044956,1.865374,1.846651,...,2.344181,2.573584,2.253876,2.459218,3.058924,2.916741,2.501802,2.977444,2.196541,2.727432
ENSG00000285399,5.099339,5.353763,4.708896,5.048748,5.349433,5.567314,5.192088,5.066864,5.211345,5.326240,...,5.208116,5.197208,5.421681,5.656614,5.370812,5.384910,5.832674,4.774830,4.800063,5.287198


In [50]:
filtered_df = ir3_rna_step_vst[ir3_rna_step_vst.index.isin(unique_ensg['Gene stable ID'])]

In [51]:
filtered_df

Unnamed: 0,3000,3001,3002,3003,3004,3006,3007,3008,3009,3010,...,4121,4122,4123,4124,4125,4126,4135,4136,4139,41410
ENSG00000000938,8.463275,8.810743,8.368607,8.466923,8.927912,9.043402,8.681277,8.235545,8.067889,8.631065,...,8.416943,9.104202,8.838484,9.193292,8.217039,8.772340,8.497247,8.612951,8.695999,8.676493
ENSG00000000971,1.944928,3.384911,2.522713,4.085667,3.693875,2.544924,3.968554,4.085922,4.811027,4.093739,...,2.209538,2.390307,3.974330,2.196031,2.196303,3.910535,4.403068,2.359305,2.445675,3.267041
ENSG00000002330,3.571424,3.959793,3.388754,4.085483,3.756835,3.986339,3.665170,3.716128,4.060661,4.147625,...,3.368043,4.429167,3.945994,4.056662,2.965991,4.581369,4.673969,3.146942,4.272373,3.885313
ENSG00000002549,5.872790,5.908892,6.004722,6.073372,5.541370,7.682882,5.899073,5.657070,6.353882,5.949753,...,6.898440,6.502385,6.639824,5.425128,5.561972,6.668685,7.166978,6.111543,6.510586,6.325302
ENSG00000002587,0.139701,1.312890,1.736479,0.976318,1.218935,0.709119,1.053174,1.422313,2.073881,1.777967,...,1.751712,0.780928,1.562588,0.983353,1.199543,0.721738,-0.725075,1.494510,1.673137,0.951384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000271303,4.141592,5.154501,3.158052,5.093705,3.283577,5.812244,4.497693,4.390843,3.943871,4.750242,...,2.731588,4.644567,4.741196,4.482349,3.986768,5.829626,5.499389,3.893179,5.252354,4.670282
ENSG00000271605,3.253041,3.349505,4.194189,4.647757,3.770106,4.032242,3.725165,4.014289,3.882068,3.648119,...,4.301260,3.829374,4.916163,4.849648,3.918818,3.755242,4.052355,3.537703,4.062805,3.979138
ENSG00000272325,6.689842,6.335426,6.208001,6.495175,6.384606,6.810942,6.646111,6.402878,6.308627,6.737337,...,6.560920,6.703202,6.730188,6.621639,6.583015,6.837237,6.500973,6.102476,6.663363,6.564332
ENSG00000272886,5.031533,5.461262,4.897880,5.414562,5.042777,5.492468,5.444711,5.371691,5.520976,5.315246,...,5.277152,5.400093,5.523296,5.426866,5.269029,5.388664,5.499137,4.822674,5.094705,5.293258


In [52]:
X=filtered_df.T
y=diagnosis['COHORT_DEFINITION']

In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

#  Initialize and train the XGBoost model
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

#  Make predictions on the testing set
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

# Calculate AU-ROC score
au_roc_score = roc_auc_score(y_test, y_pred_proba)

# Print the AU-ROC score
print(f'AU-ROC Score: {au_roc_score}')

AU-ROC Score: 0.565281461833186


In [3]:
# Sample DataFrames
df1 = pd.DataFrame({'Column1': [1, 2, 3, 4, 5], 'Data1': ['A', 'B', 'C', 'D', 'E']})

df2 = pd.DataFrame({'Column2': [3, 4, 5, 6, 7], 'Data2': ['X', 'Y', 'Z', 'W', 'K']})

# Using merge to find the intersection
intersection = pd.merge(df1, df2, left_on='Column1', right_on='Column2', how='inner')

print(intersection)


   Column1 Data1  Column2 Data2
0        3     C        3     X
1        4     D        4     Y
2        5     E        5     Z


# compare between merge and isin:
in the merge we actually want to have the data from the both pandas dataframe files but when we use isin
we basiccaclly want from one dataframe those that are in common with the other one.

In [4]:
df1

Unnamed: 0,Column1,Data1
0,1,A
1,2,B
2,3,C
3,4,D
4,5,E


In [5]:
df2

Unnamed: 0,Column2,Data2
0,3,X
1,4,Y
2,5,Z
3,6,W
4,7,K


In [6]:
common_values2 = df1[df1['Column1'].isin(df2['Column2'])]


In [7]:
common_values2

Unnamed: 0,Column1,Data1
2,3,C
3,4,D
4,5,E


In [None]:
pd.set_option('display.max_rows', None)

In [None]:
pd.reset_option('display.max_rows')