 #  RNA-Seq Data Preprocessing STEP I


-------

- We keep only individuals with a diagnosis of Healthy Control or Parkinson's Disease.
- We remove patients who carry mutations in the SNCA, GBA, or LRRK2 genes, as well as patients taking dopaminergic drugs.
- We remove the duplicated gene IDs, i.e. the Ensembl genes carrying the suffix _PAR_Y together with their X-chromosome counterparts.
- We only keep genes that are either in the list of 19393 protein-coding genes or in the list of 5874 long non-coding RNAs (lncRNAs) that we obtained from the official HGNC repository (date: 31-Jan-2024).
- We filter out genes with low expression levels, retaining only those genes that exhibit more than five counts in a minimum of 10% of the individuals.

In [1]:
import pandas as pd
from pathlib import Path
from datetime import datetime

In [2]:
# Working data directory: prepared inputs are read from here and the outputs
# of this notebook are saved here.
path2 = Path("/home/znazari/data")

# Directory holding the PPMI subject-characteristics tables.
path3 = Path("/scratch/znazari/PPMI_ver_sep2022/study_data/Subject_Characteristics/")

In [3]:
# Load the main count table (gene IDs vs. individuals) and use the 'Geneid'
# column as the row index.
read_ir3_counts = pd.read_csv(path2 / "matrix_ir3_counts_bl.csv").set_index('Geneid')

In [4]:
# Participant status table, read without a header so that the first physical
# row can be promoted to the column names explicitly.
diago = pd.read_csv(path3 / "Participant_Status.csv", header=None)
diago1 = (
    diago.iloc[1:]
    .rename(columns=diago.iloc[0])
    .reset_index(drop=True)
)

# Participants whose cohort is either Healthy Control or Parkinson's Disease,
# and their patient numbers.
selected_diagnosis_pd_hc = diago1.loc[
    diago1['COHORT_DEFINITION'].isin(['Healthy Control', "Parkinson's Disease"])
]
pd_hc = selected_diagnosis_pd_hc['PATNO']

# Count matrix restricted to the columns (individuals) listed in pd_hc.
filtered_df = read_ir3_counts.loc[:, read_ir3_counts.columns.isin(pd_hc)]

In [6]:
# Patients with gene mutations and/or using dopaminergic drugs. Their IDs sit
# in the column labelled '0' and are cast to strings before being used as
# column labels (presumably because the count-matrix columns are strings).
union_drugs_mutations = pd.read_csv(path2 / 'union_drugs_mutations.csv', index_col=0)
s_union_drugs_mutations = union_drugs_mutations['0']
s_union_drugs_mutations_str = s_union_drugs_mutations.astype(str)

# Drop those patients' columns; IDs that are not present in the matrix are
# skipped instead of raising.
filtered_df_drug = filtered_df.drop(s_union_drugs_mutations_str, axis=1, errors='ignore')

# Independent copy whose index is modified later on.
filtered_df_drug_c = filtered_df_drug.copy()

## Genes with  _PAR_Y and their X counterpart

The suffix _PAR_Y in Ensembl ID refers to the pseudoautosomal region (PAR) of the Y chromosome (which can only be found in males). The PAR is a region of the Y chromosome that shares homology with the X chromosome, and it is the only part of the Y chromosome that recombines with the X chromosome during meiosis.

For simplicity, we exclude genes associated with the Y chromosome (_PAR_Y genes), as well as their X chromosome counterparts. Alternatively, for females we could simply drop the _PAR_Y entries, since they are irrelevant for them, and for males we could add the counts of the _PAR_Y genes to those of their X counterparts, considering that quantification should encompass the total value of these isoforms.

In [8]:
# Version-less gene IDs: keep only the part before the first '.', so that a
# gene and its "_PAR_Y" twin (same ID, different suffix) share one label.
versionless_ids = filtered_df_drug.index.str.split('.').str[0]
filtered_df_drug_c.index = versionless_ids

# Row mask: True for every row whose version-less ID occurs more than once.
# keep=False flags ALL members of a duplicated group (the _PAR_Y row and its
# counterpart), not only the later occurrences.
is_duplicated_gene = versionless_ids.duplicated(keep=False)

# Version-less IDs that are repeated (second and later occurrences only)
duplicated_indices = filtered_df_drug_c.index[filtered_df_drug_c.index.duplicated()]

# Rows of the version-less table that belong to a duplicated ID, sorted by ID
new_df = filtered_df_drug_c.loc[is_duplicated_gene].sort_index()

# Their indices, also as a plain list
gene_id_duplicated = new_df.index
list_duplicated_genes = list(gene_id_duplicated)

# The same rows in the versioned table. They are selected with the positional
# mask rather than with str.contains('|'.join(list_duplicated_genes)): with no
# duplicates the joined pattern is '' and matches every row, which would drop
# the whole matrix, and a regex substring search is looser than an exact match.
matching_rows = filtered_df_drug.loc[is_duplicated_gene]

# Versioned IDs of the genes that are removed
Genes_list_duplicated = list(matching_rows.index)

# Count matrix without the _PAR_Y genes and their counterparts
filtered_df_druggg = filtered_df_drug.loc[~is_duplicated_gene]

## Save complete list of Gene IDs with and withOUT versions

In [14]:
# Two parallel columns: the gene IDs as they appear in the count matrix and
# the same IDs taken from the copy whose index had the version removed.
gene_id_columns = {
    'Gene_IDs_with_version': filtered_df_drug.index,
    'Gene_IDs_withOUT_version': filtered_df_drug_c.index,
}
all_IR3_counts_gene_IDs = pd.DataFrame(gene_id_columns)

# Write the complete list of gene IDs (with and without versions), without
# the row index.
gene_ids_csv = path2 / "all_IR3_counts_gene_IDs.csv"
all_IR3_counts_gene_IDs.to_csv(gene_ids_csv, index=False)

## Ensembl gene IDs corresponding to protein-coding and long non-coding RNA genes

We obtained a complete list of long non-coding RNA and protein-coding gene symbols from the official HGNC repository (date: 31-Jan-2024), which were then converted to Ensembl gene IDs using a table from BioMart (date: 31-Jan-2024). Only expression data corresponding to long non-coding RNA and protein-coding genes (24694 Ensembl gene IDs in total) were selected for further analysis.

In [11]:
# Tab-separated reference tables used to keep only protein-coding and long
# non-coding RNA genes.

# Protein-coding gene symbols
protein_coding = pd.read_csv(
    "../External_data/HGNC_protein_coding_genes_31Jan2024.txt",
    sep='\t',
)

# Long non-coding RNA gene symbols
non_protein_codying = pd.read_csv(
    "../External_data/HGNC_lncRNA_non_coding_genes_31Jan2024.txt",
    sep='\t',
)

# Lookup table to convert gene symbols to Ensembl gene IDs
dictionary = pd.read_csv(
    "../External_data/HGNC_BioMart_symbol_name_EnsembleGeneID_31Jan2024.txt",
    sep='\t',
)

# Ensembl gene IDs of the protein-coding and long non-coding RNA genes
intersection_genes = pd.read_csv(
    "../External_data/Intersection_genes_IR3_counts_gene_IDs_with_HGNC_lncRNA_Protein_Coding_04feb2024.txt",
    sep='\t',
)

In [12]:
# Work on a copy so that the versioned index of filtered_df_druggg is kept.
full_list_genes_hc_pd = filtered_df_druggg.copy()

# Strip the version: keep only the part of each ID before the first '.'
full_list_genes_hc_pd.index = full_list_genes_hc_pd.index.str.split('.').str[0]

# Ensembl gene IDs of the protein-coding and long non-coding RNA genes
protein_long_non_ptotein = intersection_genes["Ensemble_Gene_ID"].tolist()

# Keep only the rows whose (version-less) gene ID is in that list
full_ensemblgene = full_list_genes_hc_pd.loc[
    full_list_genes_hc_pd.index.isin(protein_long_non_ptotein)
]

## Lowly expressed genes removal
We filter out genes with low expression levels, retaining only those genes that exhibit more than five counts in a minimum of 10% of the individuals.

In [19]:
# Minimum fraction of individuals in which a gene must exceed five counts
threshold = 0.1

# Per gene: number of individuals with more than five counts, divided by the
# total number of individuals (columns)
n_individuals = full_ensemblgene.shape[1]
gene_percentages = full_ensemblgene.gt(5).sum(axis=1) / n_individuals

# IDs of the genes that reach the threshold (these are the genes that are kept)
filtered_genes = gene_percentages.loc[gene_percentages.ge(threshold)].index

# Count matrix restricted to the retained genes
highly_expressed_genes = full_ensemblgene.loc[filtered_genes]

In [20]:
# Display the count matrix restricted to the genes kept by the low-expression filter
highly_expressed_genes

Unnamed: 0_level_0,3000,3001,3002,3003,3004,3008,3010,3011,3012,3013,...,4075,4076,4079,4081,4091,4102,4108,4115,4136,4139
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,40,13,87,11,27,24,14,35,20,35,...,60,22,25,18,43,20,7,8,16,15
ENSG00000000005,4,0,28,2,10,0,2,0,0,2,...,21,1,0,0,19,0,1,0,5,1
ENSG00000000419,563,815,879,855,1194,980,1185,1446,672,1048,...,492,528,712,687,468,855,555,628,426,754
ENSG00000000457,1869,1510,1438,1593,2418,1607,2210,2702,1573,2573,...,923,1160,1647,1808,1223,1586,1271,1378,1037,1390
ENSG00000000460,512,367,460,444,581,488,605,922,515,834,...,291,398,540,530,356,395,438,510,343,291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000285844,23,2,28,8,34,6,11,7,14,21,...,34,28,8,3,39,7,5,4,13,10
ENSG00000285869,0,0,16,0,7,1,0,0,0,0,...,23,6,0,0,8,0,0,0,4,2
ENSG00000285967,1043,1291,1173,963,1609,1300,1729,1950,1188,2077,...,670,776,1038,1141,784,796,890,1081,700,784
ENSG00000285972,7,2,7,1,3,0,4,1,1,4,...,5,10,0,0,14,2,0,0,8,2


In [83]:
# Stamp the notebook with the local calendar date of this run
current_date = datetime.today().date()

# Report it
print("Last update :", current_date)

Last update : 2024-02-06


In [17]:
import pandas as pd

# Toy example for checking the low-expression filter on a tiny table:
# one row per gene, one column per patient.
data = {
    'GeneID': ['GeneA', 'GeneB', 'GeneC'],
    'Patient1': [3, 7, 10],
    'Patient2': [1, 8, 12],
    'Patient3': [2, 5, 9],
    'Patient4': [2, 5, 9],
}

# Build the table and index it by 'GeneID' in one step
df = pd.DataFrame(data).set_index('GeneID')

# Fraction of patients in which each gene has more than five counts
n_patients = df.shape[1]
gene_percentages = df.gt(5).sum(axis=1).div(n_patients)

# Display the result
gene_percentages


GeneID
GeneA    0.0
GeneB    0.5
GeneC    1.0
dtype: float64

In [18]:
# Display the toy count table
df

Unnamed: 0_level_0,Patient1,Patient2,Patient3,Patient4
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GeneA,3,1,2,2
GeneB,7,8,5,5
GeneC,10,12,9,9


In [29]:
# Per row (gene): number of columns (patients) with more than five counts
(df > 5).sum(axis=1)

GeneID
GeneA    0
GeneB    2
GeneC    4
dtype: int64

In [30]:
# Display the toy count table again
df

Unnamed: 0_level_0,Patient1,Patient2,Patient3,Patient4
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GeneA,3,1,2,2
GeneB,7,8,5,5
GeneC,10,12,9,9


In [32]:
# Number of rows in the toy table
df.shape[0]

3

In [20]:
import pandas as pd

# Apply the threshold filter to the toy table, reusing the per-gene fractions
# in gene_percentages (computed row-wise, i.e. one value per gene).

# Minimum fraction of patients required for a gene to be kept
threshold = 0.1

# IDs of the genes whose fraction reaches the threshold
filtered_genes = gene_percentages.loc[gene_percentages.ge(threshold)].index

# Toy table restricted to those genes
df_filtered = df.loc[filtered_genes]


In [19]:
# Display the IDs of the genes that reached the threshold
filtered_genes

Index(['GeneB', 'GeneC'], dtype='object', name='GeneID')

In [21]:
# Display the toy table restricted to the genes that reached the threshold
df_filtered

Unnamed: 0_level_0,Patient1,Patient2,Patient3
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GeneB,7,8,5
GeneC,10,12,9


In [None]:
#rna_filtered.to_csv(path2/'ir3_rna_step1.csv', index=True)
# combine the target as well

In [None]:
# Read a transcriptomic matrix, transpose it, and correlate every gene with
# the 0/1 diagnosis using Pearson's r. Keep the genes whose absolute
# correlation is above the 90th percentile (the top 10%), then append the
# PD/CTR diagnosis as a last column named "Class".

import os
import sys
import csv
import pandas
import numpy
import scipy
import scipy.stats  # pearsonr lives in the stats subpackage; import it explicitly


working_path="D:\\disco_H\\bandi_grants\\Regione_Lazio_FILAS_2016_Confessore\\E_LIFE_submitted_13dic2016\\tesi_laurea\\Zainab_Nazari\\PPMI_analysis\\R_limma"

# Every file name below is relative, so it is resolved inside working_path.
os.chdir(working_path)

name_input_file="mydata_TMM_Norm_Log2_CPM_filtered_batch_sex_effect_removed_RIN_covariate_06feb2024.txt"
name_output_file_T="mydata_TMM_Norm_Log2_CPM_filtered_batch_sex_effect_removed_RIN_covariate_T_06feb2024.txt"
name_output_file_T_Class="mydata_TMM_Norm_Log2_CPM_filt_batch_sex_RIN_covariate_T_06feb2024_Class_PD_CTR.txt"
name_factor_file="factor_ir3_rna_step1_preprocessing_06feb2024.txt"

# Factor table (first column used as index); "Diagnosis" is renamed to "Class".
myfactor = pandas.read_csv(name_factor_file, sep="\t", header=0, index_col=0)
myfactor2 = myfactor.rename(columns={"Diagnosis": "Class"})

# Expression matrix. After transposition the first row holds the gene names;
# it is split off and used as the column header.
df_1 = pandas.read_table(name_input_file, header=0)
df_1_trasp = df_1.T
df_1_trasp_header = df_1_trasp.iloc[0, :]
df_1_trasp = df_1_trasp.iloc[1:, ]
df_1_trasp.columns = df_1_trasp_header

# ---- Correlate every gene with the 0/1 diagnosis ----

# Convert once, outside the loop: the transposed frame is object-typed because
# it originally mixed gene names with numbers.
expression_values = df_1_trasp.to_numpy(dtype=float)
# NOTE(review): pearsonr pairs values by position, so the rows of the factor
# table are assumed to be in the same sample order as df_1_trasp — confirm.
diagnosis_0_1 = myfactor2["Diagnosis_CTR_0_PD_1"].to_numpy(dtype=float)

# Collect one row per gene and build the table in a single step instead of
# writing into a pre-allocated frame cell by cell.
pearson_rows = []
for rr, gene_name in enumerate(df_1_trasp.columns):
    corr, pval = scipy.stats.pearsonr(expression_values[:, rr], diagnosis_0_1)
    pearson_rows.append((gene_name, corr, abs(corr), pval))

out_Pearson_table = pandas.DataFrame(
    pearson_rows,
    columns=["Gene", "Pearson_corr", "Absolute_Pearson_corr", "Pearson_pval"],
)

out_Pearson_table.to_csv("correl_GX_to_0_1_diagnosis.txt", sep="\t", header=True)

# 90th percentile of the absolute correlations; genes strictly above it are kept.
percentile_90pc_abs_Pearson = numpy.percentile(out_Pearson_table["Absolute_Pearson_corr"], 90.0)

bb = list(out_Pearson_table["Absolute_Pearson_corr"] > percentile_90pc_abs_Pearson)
df_1_trasp_top_10pc_abs_Pearson = df_1_trasp.loc[:, bb]   # top 10% most correlated with diagnosis

# Explicit copy: adding "Class" must not also modify the gene-only selection.
df_1_trasp_top_10pc_abs_Pearson_Class = df_1_trasp_top_10pc_abs_Pearson.copy()

# Append the diagnosis labels as the last column.
df_1_trasp_top_10pc_abs_Pearson_Class.insert(
    df_1_trasp_top_10pc_abs_Pearson_Class.shape[1],
    "Class",
    list(myfactor2["Class"]),
)

df_1_trasp_top_10pc_abs_Pearson_Class.to_csv(name_output_file_T_Class, sep="\t", header=True)