### 1- Importing Libraries

In [35]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

###  2- Importing all files

In [36]:
Peptides_data = pd.read_csv('train_peptides.csv')
Proteins_data = pd.read_csv('train_proteins.csv')
supplemental_clinical_data = pd.read_csv('supplemental_clinical_data.csv')
train_clinical_data = pd.read_csv('train_clinical_data.csv')


### 3- Discovering and cleaning Train Clinical Data 

In [37]:
train_clinical_data.head()
train_clinical_data.isnull().sum()
train_clinical_data.isnull().any()
train_clinical_data[train_clinical_data['updrs_1'].isnull()]
train_clinical_data[train_clinical_data['updrs_2'].isnull()]
train_clinical_data[train_clinical_data['updrs_3'].isnull()]
train_clinical_data[train_clinical_data['updrs_4'].isnull()]

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,
1,55_3,55,3,10.0,7.0,25.0,,
2,55_6,55,6,8.0,10.0,34.0,,
13,942_0,942,0,3.0,2.0,20.0,,
14,942_3,942,3,7.0,2.0,17.0,,
...,...,...,...,...,...,...,...,...
2590,64674_24,64674,24,12.0,12.0,26.0,,
2591,64674_30,64674,30,12.0,18.0,48.0,,
2600,65043_0,65043,0,2.0,6.0,16.0,,
2601,65043_3,65043,3,2.0,11.0,19.0,,


In [38]:
#Replacing by mean for each patient for all UPDRS1-3

mean_by_patient_updrs1 = train_clinical_data.groupby('patient_id')['updrs_1'].mean().to_dict()
train_clinical_data['updrs_1'] = train_clinical_data.apply(lambda row: mean_by_patient_updrs1[row['patient_id']] if pd.isna(row['updrs_1']) else row['updrs_1'], axis=1)

mean_by_patient_updrs2 = train_clinical_data.groupby('patient_id')['updrs_2'].mean().to_dict()
train_clinical_data['updrs_2'] = train_clinical_data.apply(lambda row: mean_by_patient_updrs2[row['patient_id']] if pd.isna(row['updrs_2']) else row['updrs_2'], axis=1)

mean_by_patient_updrs3 = train_clinical_data.groupby('patient_id')['updrs_3'].mean().to_dict()
train_clinical_data['updrs_3'] = train_clinical_data.apply(lambda row: mean_by_patient_updrs3[row['patient_id']] if pd.isna(row['updrs_3']) else row['updrs_3'], axis=1)

# for updrs_4 it's very tricky to replace by the mean since most patients don't have any data measured so we are dropping the rows with missing udprs4

train_clinical_data = train_clinical_data[train_clinical_data["updrs_4"].notna()]

#upd23b_clinical_state_on_medication indicates if patient is on medication when mesauring updrs4 levels, which also not very wise nor accurate to replace by the mode. so we are dropping it 

train_clinical_data = train_clinical_data[train_clinical_data["upd23b_clinical_state_on_medication"].notna()]

In [39]:
#final check of our train_clinical_data
train_clinical_data.isnull().sum()
train_clinical_data.shape

(1266, 8)

### 4- Discovering and cleaning Supplemental Clinical Data 

In [40]:
supplemental_clinical_data.head()
supplemental_clinical_data.isnull().sum()
supplemental_clinical_data.isnull().any()
supplemental_clinical_data[supplemental_clinical_data['updrs_1'].isnull()]
supplemental_clinical_data[supplemental_clinical_data['updrs_2'].isnull()]
supplemental_clinical_data[supplemental_clinical_data['updrs_3'].isnull()]
supplemental_clinical_data[supplemental_clinical_data['updrs_4'].isnull()]

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
4,155_0,155,0,,,0.0,,
12,573_5,573,5,,,24.0,,
13,673_0,673,0,5.0,1.0,18.0,,
14,673_6,673,6,3.0,2.0,14.0,,
15,673_12,673,12,6.0,3.0,20.0,,
...,...,...,...,...,...,...,...,...
2210,65110_5,65110,5,,,14.0,,
2211,65290_0,65290,0,4.0,14.0,18.0,,On
2212,65290_6,65290,6,1.0,12.0,22.0,,On
2218,65382_0,65382,0,,,0.0,,


In [41]:
supplemental_clinical_data.isnull().sum()


visit_id                                  0
patient_id                                0
visit_month                               0
updrs_1                                 213
updrs_2                                 214
updrs_3                                   5
updrs_4                                 928
upd23b_clinical_state_on_medication    1101
dtype: int64

In [42]:

#Replacing by mean for each patient for all UPDRS1-3


Mean_UPDRS1 = supplemental_clinical_data.groupby("patient_id")["updrs_1"].mean()
Mean_UPDRS2 = supplemental_clinical_data.groupby("patient_id")["updrs_2"].mean()
Mean_UPDRS3 = supplemental_clinical_data.groupby("patient_id")["updrs_3"].mean()
Mean_UPDRS4 = supplemental_clinical_data.groupby("patient_id")["updrs_4"].mean()

supplemental_clinical_data['updrs_1'] = supplemental_clinical_data['patient_id'].map(Mean_UPDRS1)
supplemental_clinical_data['updrs_2'] = supplemental_clinical_data['patient_id'].map(Mean_UPDRS2)
supplemental_clinical_data['updrs_3'] = supplemental_clinical_data['patient_id'].map(Mean_UPDRS3)

# for updrs_4 it's very tricky to replace by the mean since most patients don't have any data measured so we are dropping the rows with missing udprs4

supplemental_clinical_data = supplemental_clinical_data[supplemental_clinical_data["updrs_4"].notna()]

#upd23b_clinical_state_on_medication indicates if patient is on medication when mesauring updrs4 levels, which also not very wise nor accurate to replace by the mode. so we are dropping it 

supplemental_clinical_data = supplemental_clinical_data[supplemental_clinical_data["upd23b_clinical_state_on_medication"].notna()]

In [43]:
supplemental_clinical_data.isnull().sum()
supplemental_clinical_data.shape

(819, 8)

### 5- Merging Train Clinical Data and Supplemental Clinical Data

In [44]:
Final_clinical_data = pd.concat([train_clinical_data, supplemental_clinical_data])
Patient_nbr_clinical= len(Final_clinical_data['patient_id'].unique())
Final_clinical_data.sort_values(['patient_id', 'visit_id'], ascending=[True, True])

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
4,55_12,55,12,10.0,10.0,41.0,0.0,On
5,55_18,55,18,7.0,13.0,38.0,0.0,On
6,55_24,55,24,16.0,9.0,49.0,0.0,On
7,55_30,55,30,14.0,13.0,49.0,0.0,On
8,55_36,55,36,17.0,18.0,51.0,0.0,On
...,...,...,...,...,...,...,...,...
2603,65043_9,65043,9,2.0,7.0,11.0,0.0,On
2213,65290_12,65290,12,3.0,13.8,16.0,0.0,On
2214,65290_24,65290,24,3.0,13.8,16.0,2.0,On
2215,65290_30,65290,30,3.0,13.8,16.0,0.0,On


In [45]:
Patient_nbr_clinical

562

### 6- Discovering Train Peptides 

In [46]:
Peptides_data.isnull().sum()
Peptides_data.head()
print(Peptides_data.nunique())

visit_id              1113
visit_month             15
patient_id             248
UniProt                227
Peptide                968
PeptideAbundance    738931
dtype: int64


In [47]:
Peptides_data.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7


In [48]:
len(Peptides_data)

981834

In [49]:
Peptides_data['visit_month'].unique() 

array([  0,   3,   6,  12,  18,  24,  30,  36,  48,  54,  60,  72,  84,
        96, 108], dtype=int64)

In [50]:
Peptides_data['UniProt'].value_counts() #227 UniProt
Peptides_data['Peptide'].value_counts() #968 Peptide
Peptides_data['PeptideAbundance'].value_counts()

127056.0    10
144004.0    10
109591.0    10
103890.0     9
116278.0     9
            ..
21198.0      1
62987.2      1
182618.0     1
30224.1      1
12825.9      1
Name: PeptideAbundance, Length: 738931, dtype: int64

In [51]:
Peptides_pivot = Peptides_data.pivot(index=['visit_id', 'UniProt'], columns='Peptide', values='PeptideAbundance')

In [52]:
Peptides_pivot #unfortunately unlike NPX, we can't replace the missing values with O 

Unnamed: 0_level_0,Peptide,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,ADDKETC(UniMod_4)FAEEGK,ADDKETC(UniMod_4)FAEEGKK,ADDLGKGGNEESTKTGNAGSR,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
visit_id,UniProt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10053_0,O00391,,,,,,,,,,,...,,,,,,,,,,
10053_0,O00533,,,,,,,,,,,...,,,,,,,,,,
10053_0,O14773,,,,,,,,,,,...,,,,,,,,,,
10053_0,O14791,,,,,,,,,,,...,,,,,,,,,,
10053_0,O15240,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942_6,Q9UHG2,,,,,,,,,,,...,,,,,,,,,,
942_6,Q9UKV8,,,,,,,,,,,...,,,,,,,,,,
942_6,Q9UNU6,,,,,,,,,,,...,,,,,,,,,,
942_6,Q9Y646,,,,,,,,,,,...,,,,,,,,,,


In [53]:
Peptides_pivot.loc[('55_0', ), :].shape

Peptides_pivot.loc[('55_0', ), :].isnull().sum()

Peptide
AADDTWEPFASGK                                       218
AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K    218
AAFTEC(UniMod_4)C(UniMod_4)QAADK                    218
AANEVSSADVK                                         219
AATGEC(UniMod_4)TATVGKR                             218
                                                   ... 
YVNKEIQNAVNGVK                                      218
YWGVASFLQK                                          218
YYC(UniMod_4)FQGNQFLR                               218
YYTYLIMNK                                           218
YYWGGQYTWDMAK                                       218
Length: 968, dtype: int64

##### unfortunately unlike NPX in proteins , we can't replace the PeptideAbundance missing values by 0 , it's not advisable without careful consideration and doing so may introduce significant bias and incorrect information into your data (NOT SURE VERIFYING THIS WITH MEDICS)

In [54]:
Peptides_pivot.fillna(0)

Unnamed: 0_level_0,Peptide,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,ADDKETC(UniMod_4)FAEEGK,ADDKETC(UniMod_4)FAEEGKK,ADDLGKGGNEESTKTGNAGSR,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
visit_id,UniProt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10053_0,O00391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10053_0,O00533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10053_0,O14773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10053_0,O14791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10053_0,O15240,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942_6,Q9UHG2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942_6,Q9UKV8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942_6,Q9UNU6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942_6,Q9Y646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
Peptides_pivot.shape # same length as my protein data --> on peut faire un croisement des tabelaux

(232741, 968)

In [56]:
Peptides_pivot.isnull().sum()

Peptide
AADDTWEPFASGK                                       231672
AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K    231692
AAFTEC(UniMod_4)C(UniMod_4)QAADK                    231634
AANEVSSADVK                                         231832
AATGEC(UniMod_4)TATVGKR                             231827
                                                     ...  
YVNKEIQNAVNGVK                                      231630
YWGVASFLQK                                          231652
YYC(UniMod_4)FQGNQFLR                               231636
YYTYLIMNK                                           231711
YYWGGQYTWDMAK                                       231876
Length: 968, dtype: int64

### 7- Disocvering the Train protein

In [57]:
Proteins_data.isnull().sum()
Proteins_data.head()
print(Proteins_data.nunique())

visit_id         1113
visit_month        15
patient_id        248
UniProt           227
NPX            218795
dtype: int64


In [58]:
print(Proteins_data.nunique())

visit_id         1113
visit_month        15
patient_id        248
UniProt           227
NPX            218795
dtype: int64


In [59]:
len(Proteins_data)
    

232741

In [60]:
Proteins_data['UniProt'].value_counts() # 227 uniprot

P01024    1113
P05090    1113
P01011    1113
P01023    1113
Q92520    1113
          ... 
Q6UX71     661
P01780     654
Q562R1     616
Q99832     606
Q99829     489
Name: UniProt, Length: 227, dtype: int64

In [61]:
Rim BoughanmiProteins_data['NPX'].value_counts() #few first values are redudant and the rest are unique

SyntaxError: invalid syntax (293905595.py, line 1)

In [None]:
RimPatientID = Proteins_data['patient_id'].unique()
PatientID

In [None]:
len(PatientID) #We have 248 Patient

In [None]:
Visits = Proteins_data['visit_month'].unique()
Visits
len(Visits)

In [None]:
VisitsID = Proteins_data['visit_id'].unique()

In [None]:
len(VisitsID)

In [None]:
Proteins_pivot = Proteins_data.pivot(index=['visit_id'], columns='UniProt', values='NPX')

Proteins_pivot.head()

In [None]:
Proteins_pivot.fillna(0) #Replace by Zero because of the protein indicator is NaN -> 0 value measured

In [None]:
Proteins_pivot.plot()
plt.show()

In [None]:
#Plot the first ID to see how NPX levels are for each Uniprot for the first few visits for Patient_10053
Proteins_pivot.loc["10053_0"].plot(c="pink")
plt.title('Patient 10053, visit0 NPX/UniProt')
plt.ylabel('NPX Levels')


In [None]:
Rim#Second visit month 12 --> he skipped month 3 and 6
Proteins_pivot.loc["10053_12"].plot(c="pink")
plt.title('Patient 10053, visit12 NPX/UniProt')
plt.ylabel('NPX Levels')

In [None]:
#Testing the coorelation for UPDRS levels and NPX 

Final_clinical_data['Total_UPDRS'] = Final_clinical_data[['updrs_1','updrs_2','updrs_3', 'updrs_4']].sum(axis=1)


#Testing the coorelation levels for UPDRS and NPX 

In [None]:
Final_clinical_data.i['Total_UPDERS'].drop()

In [None]:
Final_clinical_data

In [None]:
Last_visit_of_each_patient = Final_clinical_data.copy()
Last_visit_of_each_patient = Last_visit_of_each_patient.drop_duplicates(subset=["patient_id"], keep='last')

In [None]:
Last_visit_of_each_patient

In [None]:
UPDRS = np.array(Last_visit_of_each_patient['Total_UPDRS'])
patient_ID = np.array(Last_visit_of_each_patient['patient_id'])

In [None]:
UPDRS

### 7- Merging Train proteins and Train pepetides based on the shared UniProt

In [None]:
plt.scatter(UPDRS)

In [None]:
Boughanmi

### 8- Calculate the mean fo each updr and divide patients by clusters and divide them by clusters 

### 9- Heat map the levels and indntify and coorelation between the UPDRS -> define UPDRS indicators

### 10- Check for each cluster how Peptide levels evolve -> define Peptide indiciator 

### 11-  Check for each cluster how Proteins levels evolve -> define Protein indiciator 

### 12- Calculate the coorelation for All indicators and establish a relationship

### 13- ML models testing 