### 1- Importing Libraries

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

: 

###  2- Importing all files

In [3]:
Peptides_data = pd.read_csv('train_peptides.csv')
Proteins_data = pd.read_csv('train_proteins.csv')
supplemental_clinical_data = pd.read_csv('supplemental_clinical_data.csv')
train_clinical_data = pd.read_csv('train_clinical_data.csv')


### 3- Discovering and cleaning Train Clinical Data 

In [4]:
train_clinical_data.head()


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,
1,55_3,55,3,10.0,7.0,25.0,,
2,55_6,55,6,8.0,10.0,34.0,,
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


In [5]:
train_clinical_data.isnull().sum()


visit_id                                  0
patient_id                                0
visit_month                               0
updrs_1                                   1
updrs_2                                   2
updrs_3                                  25
updrs_4                                1038
upd23b_clinical_state_on_medication    1327
dtype: int64

In [6]:
train_clinical_data.shape

(2615, 8)

###### Filling missing values in the updrs_1, updrs_2, and updrs_3 columns involved a process that ensured accurate imputation while considering each patient's data sequence. The function fill_missing_updrs123() was created to systematically handle missing values within patient groups. This function first performed forward-fill and backward-fill operations within each patient's data to replace missing values with the nearest available measurements before and after. By incorporating this function for each UPDRS column, we were able to maintain data integrity by utilizing the available information while accounting for temporal sequencing within patients' visits. The temporary columns created during this process were then dropped to maintain the cleanliness and organization of the data. However a different approach would be implemented for updrs_4

In [7]:


def fill_missing_updrs_123(data, updrs_column):
    # Create a temporary filled column within each patient group
    filled_column = f'{updrs_column}_filled'
    data[filled_column] = data.groupby('patient_id')[updrs_column].ffill()
    data[filled_column] = data.groupby('patient_id')[filled_column].bfill()
    data[updrs_column] = data[updrs_column].fillna(data[filled_column])
    
    # Dropping the temporary filled column
    data.drop(columns=filled_column, inplace=True)
    return data

# Filling missing values for UPDRS_1
train_clinical_data = fill_missing_updrs_123(train_clinical_data, 'updrs_1')

# Filling missing values for UPDRS_2
train_clinical_data = fill_missing_updrs_123(train_clinical_data, 'updrs_2')

# Filling missing values for UPDRS_3
train_clinical_data = fill_missing_updrs_123(train_clinical_data, 'updrs_3')


In [8]:
train_clinical_data.isnull().sum()

visit_id                                  0
patient_id                                0
visit_month                               0
updrs_1                                   0
updrs_2                                   0
updrs_3                                   0
updrs_4                                1038
upd23b_clinical_state_on_medication    1327
dtype: int64

###### In addressing missing values within the updrs_4 column, a two-fold strategy was implemented. First, a binary indicator column, 'updrs_4_missing', was created to distinctly mark the initially missing values. This indicator serves as a valuable flag, indicating whether the patient's updrs_4 level was genuinely absent or started as 0. Subsequently, the missing values in updrs_4 were replaced with zeros. Even in cases where a patient's updrs_4 level might be naturally 0, the 'updrs_4_missing' column distinguishes between inherent zero levels and missing data. This dual approach not only addresses the missing values but also acknowledges both inherent zero levels and absent measurements.

In [9]:

# Creating a binary indicator column for missing UPDRS_4 values
train_clinical_data['updrs_4_missing'] = train_clinical_data['updrs_4'].isnull().astype(int)

# Filling missing values in UPDRS_4 with zeros
train_clinical_data['updrs_4'] = train_clinical_data['updrs_4'].fillna(0)


In [10]:
 train_clinical_data['clinical_state_on_medication'] =  train_clinical_data['upd23b_clinical_state_on_medication']

In [11]:

mode_per_patient = train_clinical_data.groupby('patient_id')['clinical_state_on_medication'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None).reset_index()

# Merging the mode values back to the original dataframe
train_clinical_data = pd.merge(train_clinical_data, mode_per_patient, on='patient_id', how='left', suffixes=('', '_mode'))

# Filling missing values in 'clinical_state_on_medication' with the calculated modes
train_clinical_data['clinical_state_on_medication'] = train_clinical_data['clinical_state_on_medication'].fillna(train_clinical_data['clinical_state_on_medication_mode'])

# Dropping the temporary mode column
train_clinical_data.drop(columns='clinical_state_on_medication_mode', inplace=True)


In [12]:
del(train_clinical_data["upd23b_clinical_state_on_medication"])

In [13]:
train_clinical_data.isnull().sum()

visit_id                          0
patient_id                        0
visit_month                       0
updrs_1                           0
updrs_2                           0
updrs_3                           0
updrs_4                           0
updrs_4_missing                   0
clinical_state_on_medication    732
dtype: int64

###### filling missing values with 'unknown'

In [14]:
train_clinical_data['clinical_state_on_medication'].fillna('Unknown', inplace=True)


In [15]:
train_clinical_data.isna().sum()

visit_id                        0
patient_id                      0
visit_month                     0
updrs_1                         0
updrs_2                         0
updrs_3                         0
updrs_4                         0
updrs_4_missing                 0
clinical_state_on_medication    0
dtype: int64

In [16]:
train_clinical_data.shape

(2615, 9)

In [17]:
train_clinical_data.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,updrs_4_missing,clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,0.0,1,On
1,55_3,55,3,10.0,7.0,25.0,0.0,1,On
2,55_6,55,6,8.0,10.0,34.0,0.0,1,On
3,55_9,55,9,8.0,9.0,30.0,0.0,0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,0,On


### 4- Discovering and cleaning Supplemental Clinical Data 

######  supplemental_clinical_data is same as train_clinical_data so we will be foolowing the same claning strategy before we merge them into one data frame

In [18]:
supplemental_clinical_data.head()


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,35_0,35,0,5.0,3.0,16.0,0.0,
1,35_36,35,36,6.0,4.0,20.0,0.0,
2,75_0,75,0,4.0,6.0,26.0,0.0,
3,75_36,75,36,1.0,8.0,38.0,0.0,On
4,155_0,155,0,,,0.0,,


In [19]:
supplemental_clinical_data.isnull().sum()


visit_id                                  0
patient_id                                0
visit_month                               0
updrs_1                                 213
updrs_2                                 214
updrs_3                                   5
updrs_4                                 928
upd23b_clinical_state_on_medication    1101
dtype: int64

In [20]:

# Filling missing values for UPDRS_1
supplemental_clinical_data = fill_missing_updrs_123(supplemental_clinical_data, 'updrs_1')

# Filling missing values for UPDRS_2
supplemental_clinical_data = fill_missing_updrs_123(supplemental_clinical_data, 'updrs_2')

# Filling missing values for UPDRS_3
supplemental_clinical_data = fill_missing_updrs_123(supplemental_clinical_data, 'updrs_3')


In [21]:
supplemental_clinical_data.isna().sum()

visit_id                                  0
patient_id                                0
visit_month                               0
updrs_1                                  96
updrs_2                                  96
updrs_3                                   3
updrs_4                                 928
upd23b_clinical_state_on_medication    1101
dtype: int64

In [22]:
# Displaying the rows with missing values in 'clinical_state_on_medication'
missing_values_mask = supplemental_clinical_data['updrs_1'].isnull()
rows_with_missing_values = supplemental_clinical_data[missing_values_mask]
print(rows_with_missing_values)


     visit_id  patient_id  visit_month  updrs_1  updrs_2  updrs_3  updrs_4  \
4       155_0         155            0      NaN      NaN      0.0      NaN   
27      889_0         889            0      NaN      NaN      0.0      NaN   
65     1663_0        1663            0      NaN      NaN      0.0      NaN   
85     2222_0        2222            0      NaN      NaN      0.0      NaN   
86     2331_0        2331            0      NaN      NaN      1.0      NaN   
...       ...         ...          ...      ...      ...      ...      ...   
2127  63278_0       63278            0      NaN      NaN      4.0      NaN   
2130  63335_0       63335            0      NaN      NaN      7.0      NaN   
2136  63479_0       63479            0      NaN      NaN      6.0      NaN   
2148  63774_0       63774            0      NaN      NaN      0.0      NaN   
2218  65382_0       65382            0      NaN      NaN      0.0      NaN   

     upd23b_clinical_state_on_medication  
4                   

###### these patients have most of their updrs level missing so it would be more accurate to just drop their correspening rows

In [23]:
supplemental_clinical_data.dropna(subset=['updrs_1', 'updrs_2', 'updrs_3'], inplace=True)


In [None]:
supplemental_clinical_data.isna().sum()

In [24]:
supplemental_clinical_data['updrs_4_missing'] = supplemental_clinical_data['updrs_4'].isnull().astype(int)
supplemental_clinical_data['updrs_4'] = supplemental_clinical_data['updrs_4'].fillna(0)

In [25]:
supplemental_clinical_data.isna().sum()

visit_id                                  0
patient_id                                0
visit_month                               0
updrs_1                                   0
updrs_2                                   0
updrs_3                                   0
updrs_4                                   0
upd23b_clinical_state_on_medication    1004
updrs_4_missing                           0
dtype: int64

In [28]:
supplemental_clinical_data['clinical_state_on_medication'] =  supplemental_clinical_data['upd23b_clinical_state_on_medication']

In [29]:
del(supplemental_clinical_data["upd23b_clinical_state_on_medication"])

In [30]:

mode_per_patient1 = supplemental_clinical_data.groupby('patient_id')['clinical_state_on_medication'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None).reset_index()

# Merging the mode values back to the original dataframe
supplemental_clinical_data = pd.merge(supplemental_clinical_data, mode_per_patient1, on='patient_id', how='left', suffixes=('', '_mode'))

# Filling missing values in 'clinical_state_on_medication' with the calculated modes
supplemental_clinical_data['clinical_state_on_medication'] = supplemental_clinical_data['clinical_state_on_medication'].fillna(supplemental_clinical_data['clinical_state_on_medication_mode'])

# Dropping the temporary mode column
supplemental_clinical_data.drop(columns='clinical_state_on_medication_mode', inplace=True)


In [31]:
supplemental_clinical_data.isna().sum()

visit_id                          0
patient_id                        0
visit_month                       0
updrs_1                           0
updrs_2                           0
updrs_3                           0
updrs_4                           0
updrs_4_missing                   0
clinical_state_on_medication    469
dtype: int64

In [32]:
supplemental_clinical_data['clinical_state_on_medication'].fillna('Unknown', inplace=True)


In [33]:
supplemental_clinical_data.isna().sum()

visit_id                        0
patient_id                      0
visit_month                     0
updrs_1                         0
updrs_2                         0
updrs_3                         0
updrs_4                         0
updrs_4_missing                 0
clinical_state_on_medication    0
dtype: int64

In [36]:
supplemental_clinical_data.shape

(2125, 9)

### 5- Merging Train Clinical Data and Supplemental Clinical Data

###### Here we are going to combine both of our files (train_clinical_data and suppelemental_clinical_data) into one file

In [34]:
Final_clinical_data = pd.concat([train_clinical_data, supplemental_clinical_data])
Final_clinical_data.drop_duplicates(keep=False, inplace=True)

In [35]:
Final_clinical_data.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,updrs_4_missing,clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,0.0,1,On
1,55_3,55,3,10.0,7.0,25.0,0.0,1,On
2,55_6,55,6,8.0,10.0,34.0,0.0,1,On
3,55_9,55,9,8.0,9.0,30.0,0.0,0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,0,On


In [37]:
Final_clinical_data.shape

(4740, 9)

In [40]:
Final_clinical_data.duplicated().any()

False

In [41]:
Final_clinical_data.isna().sum()

visit_id                        0
patient_id                      0
visit_month                     0
updrs_1                         0
updrs_2                         0
updrs_3                         0
updrs_4                         0
updrs_4_missing                 0
clinical_state_on_medication    0
dtype: int64

###### Our first dataframe is clean and set for usage

### 6- Discovering Train Peptides 

In [42]:
Peptides_data.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7


In [44]:
Peptides_data.shape

(981834, 6)

In [46]:

summed_peptide_data = Peptides_data.groupby(['visit_id'])['PeptideAbundance'].sum().reset_index()


In [47]:
summed_peptide_data

Unnamed: 0,visit_id,PeptideAbundance
0,10053_0,4.713352e+08
1,10053_12,4.666371e+08
2,10053_18,5.220732e+08
3,10138_12,6.515605e+08
4,10138_24,6.720870e+08
...,...,...
1108,8699_24,6.618368e+08
1109,942_12,5.540194e+08
1110,942_24,5.677032e+08
1111,942_48,5.979831e+08


In [49]:
from scipy import stats
Summary_stats_peptides = summed_peptide_data.groupby(['visit_id'])['PeptideAbundance'].describe()
print(summary_stats)


NameError: name 'summary_stats' is not defined

In [None]:
Peptides_data['visit_month'].unique() 

In [None]:
Peptides_data['UniProt'].value_counts() #227 UniProt
Peptides_data['Peptide'].value_counts() #968 Peptide
Peptides_data['PeptideAbundance'].value_counts()

In [None]:
Peptides_pivot = Peptides_data.pivot(index=['visit_id', 'UniProt'], columns='Peptide', values='PeptideAbundance')

In [None]:
Peptides_pivot #unfortunately unlike NPX, we can't replace the missing values with O 

In [None]:
Peptides_pivot.loc[('55_0', ), :].shape

Peptides_pivot.loc[('55_0', ), :].isnull().sum()

##### unfortunately unlike NPX in proteins , we can't replace the PeptideAbundance missing values by 0 , it's not advisable without careful consideration and doing so may introduce significant bias and incorrect information into your data (NOT SURE VERIFYING THIS WITH MEDICS)

In [None]:
Peptides_pivot.fillna(0)

In [None]:
Peptides_pivot.shape # same length as my protein data --> on peut faire un croisement des tabelaux

In [None]:
Peptides_pivot.isnull().sum()

### 7- Disocvering the Train protein

In [None]:
Proteins_data.isnull().sum()
Proteins_data.head()
print(Proteins_data.nunique())

In [None]:
print(Proteins_data.nunique())

In [None]:
len(Proteins_data)
    

In [None]:
Proteins_data['UniProt'].value_counts() # 227 uniprot

In [None]:
Proteins_data['NPX'].value_counts() #few first values are redudant and the rest are unique

In [None]:
PatientID = Proteins_data['patient_id'].unique()
PatientID

In [None]:
len(PatientID) #We have 248 Patient

In [None]:
Visits = Proteins_data['visit_month'].unique()
Visits
len(Visits)

In [None]:
VisitsID = Proteins_data['visit_id'].unique()

In [None]:
len(VisitsID)

In [None]:
Proteins_pivot = Proteins_data.pivot(index=['visit_id'], columns='UniProt', values='NPX')

Proteins_pivot.head()

In [None]:
Proteins_pivot.fillna(0) #Replace by Zero because of the protein indicator is NaN -> 0 value measured

In [None]:
Proteins_pivot.plot()
plt.show()

In [None]:
#Plot the first ID to see how NPX levels are for each Uniprot for the first few visits for Patient_10053
Proteins_pivot.loc["10053_0"].plot(c="pink")
plt.title('Patient 10053, visit0 NPX/UniProt')
plt.ylabel('NPX Levels')


In [None]:
#Second visit month 12 --> he skipped month 3 and 6
Proteins_pivot.loc["10053_12"].plot(c="pink")
plt.title('Patient 10053, visit12 NPX/UniProt')
plt.ylabel('NPX Levels')

### 7- Merging Train proteins and Train pepetides based on the shared UniProt

### 8- Calculate the mean fo each updr and divide patients by clusters and divide them by clusters 

### 9- Heat map the levels and indntify and coorelation between the UPDRS -> define UPDRS indicators

### 10- Check for each cluster how Peptide levels evolve -> define Peptide indiciator 

### 11-  Check for each cluster how Proteins levels evolve -> define Protein indiciator 

### 12- Calculate the coorelation for All indicators and establish a relationship

### 13- ML models testing 