 #  RNA-Seq Data Preprocessing Part I

-------

- We keep only individuals with a diagnosis of Healthy Control or Parkinson's Disease.
- We remove patients who carry a mutation in any of these genes (SNCA, GBA, LRRK2) and/or who are taking dopaminergic drugs.
- We remove the duplicated gene IDs, i.e. the Ensembl genes that carry the suffix _PAR_Y together with their X-chromosome counterparts.
- We only keep genes that are either in the list of 19393 protein-coding genes or in the list of 5874 long non-coding RNAs (lncRNAs) that we obtained from the official HGNC repository (date: 31-Jan-2024).
- We filter out genes with low expression levels, retaining only those genes that exhibit more than five counts in a minimum of 10% of the individuals.

In [1]:
import pandas as pd
from pathlib import Path
from datetime import datetime

In [2]:
# Working directory: intermediate input tables are read from here and all results are written back to it.
path2 = Path("/home/znazari/data")

# Directory holding the PPMI subject-characteristics tables (e.g. Participant_Status.csv).
path3 = Path("/scratch/znazari/PPMI_ver_sep2022/study_data/Subject_Characteristics/")

In [3]:
# Load the count matrix (one row per gene ID, one column per individual)
read_ir3_counts = pd.read_csv(path2 / "matrix_ir3_counts_bl.csv")

# Use the 'Geneid' column as the row index
read_ir3_counts = read_ir3_counts.set_index('Geneid')

In [4]:
# Load the participant status table; header=None keeps the header line as the first data row
diago = pd.read_csv(path3 / "Participant_Status.csv", header=None)

# Promote that first row to column names, then discard it and renumber the rows
header_row = diago.iloc[0]
diago1 = (
    diago.rename(columns=header_row)
    .drop(diago.index[0])
    .reset_index(drop=True)
)

# Keep only participants whose cohort is Healthy Control or Parkinson's Disease
cohorts_of_interest = ['Healthy Control', "Parkinson's Disease"]
is_pd_or_hc = diago1['COHORT_DEFINITION'].isin(cohorts_of_interest)
selected_diagnosis_pd_hc = diago1[is_pd_or_hc]
pd_hc = selected_diagnosis_pd_hc['PATNO']

# Restrict the count matrix to the columns (participants) that appear in that PATNO list
keep_columns = read_ir3_counts.columns.isin(pd_hc)
filtered_df = read_ir3_counts.loc[:, keep_columns]

In [6]:
# Load the list of patients that carry a gene mutation and/or use dopaminergic drugs
union_drugs_mutations = pd.read_csv(path2 / 'union_drugs_mutations.csv', index_col=0)

# The identifiers live in the column named '0'; cast them to str before using them as column labels
s_union_drugs_mutations = union_drugs_mutations['0']
s_union_drugs_mutations_str = s_union_drugs_mutations.astype(str)

# Drop those patients' columns; identifiers that are not present in the matrix are silently skipped
filtered_df_drug = filtered_df.drop(
    columns=s_union_drugs_mutations_str,
    errors='ignore',
)

# Independent copy whose index is later replaced by version-less gene IDs
filtered_df_drug_c = filtered_df_drug.copy()

## Genes with  _PAR_Y and their X counterpart

The suffix _PAR_Y in Ensembl ID refers to the pseudoautosomal region (PAR) of the Y chromosome (which can only be found in males). The PAR is a region of the Y chromosome that shares homology with the X chromosome, and it is the only part of the Y chromosome that recombines with the X chromosome during meiosis.

For simplicity, we exclude the genes associated with the Y chromosome (_PAR_Y genes) as well as their X chromosome counterparts. Alternatively, for females we could remove the _PAR_Y genes, as they are irrelevant for them, and for males we could add the counts of the _PAR_Y genes to those of their X counterparts, considering that quantification should encompass the total value of these isoforms.

In [8]:
# Strip the version suffix (everything after the first '.') from the gene IDs of the copy.
# A gene and its _PAR_Y twin (e.g. 'ENSG....14' and 'ENSG....14_PAR_Y') then share one ID.
filtered_df_drug_c.index = filtered_df_drug.index.str.split('.').str[0]

# Version-less IDs that occur more than once (second and later occurrences only)
duplicated_indices = filtered_df_drug_c.index[filtered_df_drug_c.index.duplicated()]

# Boolean mask over the rows: True for EVERY row whose version-less ID is shared with
# another row (keep=False flags all copies, i.e. both the X gene and its _PAR_Y twin).
# A positional mask is used instead of a regex built with '|'.join(...): with no
# duplicates that pattern would be the empty string, which matches every row and
# would therefore discard the whole matrix.
is_duplicated_row = filtered_df_drug_c.index.duplicated(keep=False)

# Rows of the version-less copy that belong to a duplicated gene, sorted by gene ID
new_df = filtered_df_drug_c.loc[is_duplicated_row].sort_index()

# Their (version-less) gene IDs, as an Index and as a plain list
gene_id_duplicated = new_df.index
list_duplicated_genes = list(gene_id_duplicated)

# The same rows in the original matrix, i.e. with full IDs including version and _PAR_Y suffix
matching_rows = filtered_df_drug.loc[is_duplicated_row]

# Full (versioned) IDs of the genes that are removed
Genes_list_duplicated = list(matching_rows.index)

# Count matrix without the _PAR_Y genes and their X counterparts
filtered_df_druggg = filtered_df_drug.loc[~is_duplicated_row]

## Save complete list of Gene IDs with and withOUT versions

In [14]:
# Pair every versioned gene ID with its version-less form, row by row
gene_id_columns = {
    'Gene_IDs_with_version': filtered_df_drug.index,
    'Gene_IDs_withOUT_version': filtered_df_drug_c.index,
}
all_IR3_counts_gene_IDs = pd.DataFrame(gene_id_columns)

# Write the lookup table to disk without the positional row index
all_IR3_counts_gene_IDs.to_csv(path2 / "all_IR3_counts_gene_IDs.csv", index=False)

## Ensembl gene IDs corresponding to protein-coding and long non-coding RNA genes

We obtained a complete list of long non-coding RNA and protein-coding gene symbols from the official HGNC repository (date: 31-Jan-2024), which were then converted to Ensembl gene IDs using a table from BioMart (date: 31-Jan-2024). Only expression data corresponding to long non-coding RNA and protein-coding genes (24694 Ensembl gene IDs in total) were selected for further analysis.

In [11]:
# Load the external tab-separated reference tables used to select
# protein-coding and long non-coding RNA (lncRNA) genes.

# HGNC protein-coding gene symbols
protein_coding = pd.read_csv(
    "../External_data/HGNC_protein_coding_genes_31Jan2024.txt", sep="\t"
)

# HGNC long non-coding RNA gene symbols
non_protein_codying = pd.read_csv(
    "../External_data/HGNC_lncRNA_non_coding_genes_31Jan2024.txt", sep="\t"
)

# BioMart table mapping gene symbols to Ensembl gene IDs
dictionary = pd.read_csv(
    "../External_data/HGNC_BioMart_symbol_name_EnsembleGeneID_31Jan2024.txt", sep="\t"
)

# Ensembl gene IDs of the protein-coding and lncRNA genes that also occur in the IR3 count matrix
intersection_genes = pd.read_csv(
    "../External_data/Intersection_genes_IR3_counts_gene_IDs_with_HGNC_lncRNA_Protein_Coding_04feb2024.txt",
    sep="\t",
)

In [12]:
# Work on a copy so the versioned index of filtered_df_druggg stays untouched
full_list_genes_hc_pd = filtered_df_druggg.copy()

# Replace the index by the gene IDs without their version suffix
versionless_ids = full_list_genes_hc_pd.index.str.split('.').str[0]
full_list_genes_hc_pd.index = versionless_ids

# Ensembl IDs of the protein-coding and long non-coding RNA genes to retain
protein_long_non_ptotein = intersection_genes["Ensemble_Gene_ID"].tolist()

# Keep only the rows whose gene ID is in that list
in_gene_set = full_list_genes_hc_pd.index.isin(protein_long_non_ptotein)
full_ensemblgene = full_list_genes_hc_pd.loc[in_gene_set]

## Lowly expressed genes removal
We filter out genes with low expression levels, retaining only those genes that exhibit more than five counts in a minimum of 10% of the individuals.

In [19]:
# Fraction of individuals (columns) in which each gene has more than five counts
n_individuals = full_ensemblgene.shape[1]
gene_percentages = full_ensemblgene.gt(5).sum(axis=1) / n_individuals

# Minimum fraction of individuals required for a gene to be retained
threshold = 0.1

# IDs of the genes that reach the threshold
passes_threshold = gene_percentages >= threshold
filtered_genes = gene_percentages.index[passes_threshold]

# Keep only those genes in the count matrix
highly_expressed_genes = full_ensemblgene.loc[filtered_genes]

In [21]:
# Persist the filtered count matrix; the gene IDs (row index) are written as the first column
output_file = path2 / 'ir3_rna_step1_preprocessing.csv'
highly_expressed_genes.to_csv(output_file, index=True)

In [83]:
# Record the day on which the notebook was last executed
right_now = datetime.now()
current_date = right_now.date()

# Report it at the end of the notebook
print("Last update :", current_date)

Last update : 2024-02-06
