 #  RNA-Seq Data Preprocessing STEP I


-------

- We keep only individuals with a diagnosis of Healthy Control or Parkinson's Disease.
- We remove patients who carry one of these gene mutations (SNCA, GBA, LRRK2) and patients who are taking dopaminergic drugs.
- We remove the duplicated gene IDs, i.e. genes that have a _PAR_Y entry, together with their X-chromosome counterpart.
- We keep only the genes in the intersection of counts and quants that are protein-coding genes or non-protein-coding lncRNA genes.
--------
- We remove lowly expressed genes by keeping only genes that have more than five counts in at least 10% of the individuals.

In [1]:
import pandas as pd
from pathlib import Path
from datetime import datetime

In [2]:
# Working data directory: input tables are read from here and the
# processed output is saved here at the end.
path2 = Path("/home/znazari/data")

# Directory of the PPMI subject-characteristics tables
# (Participant_Status.csv is read from here).
path3 = Path("/scratch/znazari/PPMI_ver_sep2022/study_data/Subject_Characteristics/")

In [3]:
# Load the main table of gene IDs versus individuals and use the
# 'Geneid' column as the row index.
read_ir3_counts = pd.read_csv(path2 / "matrix_ir3_counts_bl.csv").set_index('Geneid')

In [4]:
# Load the participant-status table without a header so that the first
# row (the column names) can be promoted to the header explicitly.
diago = pd.read_csv(path3 / "Participant_Status.csv", header=None)
diago1 = (
    diago.iloc[1:]                      # every row except the header row
    .rename(columns=diago.iloc[0])      # header row supplies the column names
    .reset_index(drop=True)
)

# Restrict to participants whose cohort is healthy control or Parkinson's disease.
selected_diagnosis_pd_hc = diago1.loc[
    diago1['COHORT_DEFINITION'].isin(['Healthy Control', "Parkinson's Disease"])
]
pd_hc = selected_diagnosis_pd_hc['PATNO']

# Keep only the count-matrix columns (individuals) whose ID is in that selection.
filtered_df = read_ir3_counts.loc[:, read_ir3_counts.columns.isin(pd_hc)]

In [5]:
# Load the list of patients with gene mutations or who use dopaminergic drugs.
union_drugs_mutations = pd.read_csv(path2 / 'union_drugs_mutations.csv', index_col=0)
s_union_drugs_mutations = union_drugs_mutations['0']

# Cast the IDs to str before using them as column labels to drop.
s_union_drugs_mutations_str = s_union_drugs_mutations.astype(str)

# Drop those patients' columns; IDs that are not present in the count
# matrix are skipped silently (errors='ignore').
filtered_df_drug = filtered_df.drop(
    s_union_drugs_mutations_str, axis="columns", errors='ignore'
)

## Complete list of Gene IDs with and withOUT versions

In [6]:
# Recompute the patient-filtered matrix (mutation carriers and dopaminergic
# drug users removed), so that this cell starts from versioned gene IDs.
filtered_df_drug = filtered_df.drop(
    s_union_drugs_mutations_str, axis="columns", errors='ignore'
)

# Snapshot that keeps the versioned gene IDs.
with_version = filtered_df_drug.copy()

# `without_version` is the same object as `filtered_df_drug` (not a copy):
# stripping everything after the first dot (the gene-ID version) updates both.
without_version = filtered_df_drug
without_version.index = without_version.index.str.split('.').str.get(0)

# Side-by-side mapping of versioned and version-less gene IDs.
all_IR3_counts_gene_IDs = pd.DataFrame(
    {
        'Gene_IDs_with_version': with_version.index,
        'Gene_IDs_withOUT_version': without_version.index,
    }
)

all_IR3_counts_gene_IDs.to_csv(path2 / "all_IR3_counts_gene_IDs.csv", index=False)

## Genes with its _PAR_Y counterparts

We exclude genes associated with the Y chromosome in males, as well as their X chromosome counterparts, to eliminate genes that are specific to one sex.


The suffix _PAR_Y in Ensembl ID refers to the pseudoautosomal region (PAR) of the Y chromosome. The PAR is a region of the Y chromosome that shares homology with the X chromosome, and it is the only part of the Y chromosome that recombines with the X chromosome during meiosis.

If we analyze genes exclusively in males, we aggregate the counts for the X and PAR_Y transcripts, since quantification should encompass the total of these isoforms.

In [7]:
# Flag every row whose gene ID occurs more than once in the index.
# keep=False marks all occurrences (not only the repeats), so each row is
# selected exactly once even if an ID appears three or more times.
# NOTE(review): presumably these are the X / _PAR_Y pairs that collapse onto
# the same ID once the version suffix has been stripped -- confirm on the data.
is_duplicated = filtered_df_drug.index.duplicated(keep=False)

# Rows of the duplicated genes, sorted by gene ID
new_df = filtered_df_drug.loc[is_duplicated].sort_index()

# Get the indices
gene_id_duplicated = new_df.index

# Make a list out of indices (one entry per row, so each ID is repeated)
list_duplicated_genes = list(gene_id_duplicated)

# Rows of the main table that belong to a duplicated gene ID.
# Exact label matching is used instead of a '|'-joined regex: with no
# duplicates the joined pattern would be the empty string, which matches
# every row, and a regex also matches IDs that merely contain another ID.
matching_rows = filtered_df_drug[filtered_df_drug.index.isin(list_duplicated_genes)]

# Make a list of genes that have duplicated Y chromosomes as well
Genes_list_duplicated = list(matching_rows.index)

# Remove every occurrence of the duplicated genes from the main table
filtered_df_drug_duplicate_indices = filtered_df_drug.loc[~is_duplicated]

In [None]:
# Load the tab-separated reference tables used to keep only protein-coding
# and lncRNA (non-coding) genes.
# NOTE(review): judging by the file names, these are HGNC gene lists and their
# precomputed intersection with the IR3 count gene IDs -- confirm.
protein_coding = pd.read_csv(
    path2 / 'HGNC_protein_coding_genes_31Jan2024.txt', sep='\t'
)
non_protein_codying = pd.read_csv(
    path2 / 'HGNC_lncRNA_non_coding_genes_31Jan2024.txt', sep='\t'
)
intersection_genes = pd.read_csv(
    path2 / 'Intersection_genes_IR3_counts_gene_IDs_with_HGNC_lncRNA_Protein_Coding_31Jan2024.txt',
    sep='\t',
)

In [None]:
#rna_filtered.to_csv(path2/'ir3_rna_step1.csv', index=True)
# combine the target as well

In [87]:
# Date (without time of day) on which the notebook was last run
current_date = datetime.now().date()

# Report it as the notebook's last-update stamp
print(f"Last update : {current_date}")

Last update : 2024-02-02


In [28]:
import pandas as pd

# Toy example of the low-expression filter: a small count table with one
# row per gene and one column per patient (replace with the real data).
data = {
    'GeneID': ['GeneA', 'GeneB', 'GeneC'],
    'Patient1': [3, 7, 10],
    'Patient2': [1, 8, 12],
    'Patient3': [2, 5, 9],
    'Patient4': [2, 5, 9],
}

# Build the table and index it by gene ID in one step
df = pd.DataFrame(data).set_index('GeneID')

# Fraction of patients in which each gene has more than five counts:
# count the patients above five per row, then divide by the number of patients.
gene_percentages = df.gt(5).sum(axis="columns") / len(df.columns)

# Display the result
gene_percentages


GeneID
GeneA    0.0
GeneB    0.5
GeneC    1.0
dtype: float64

In [29]:
# Per gene (row), the number of patients whose count is greater than five.
(df > 5).sum(axis=1)

GeneID
GeneA    0
GeneB    2
GeneC    4
dtype: int64

In [30]:
# Display the toy count table.
df

Unnamed: 0_level_0,Patient1,Patient2,Patient3,Patient4
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GeneA,3,1,2,2
GeneB,7,8,5,5
GeneC,10,12,9,9


In [32]:
# Number of rows in the table, i.e. the number of genes.
df.shape[0]

3

In [20]:
import pandas as pd

# Minimum fraction of patients in which a gene must have more than five
# counts in order to be kept.
threshold = 0.1

# IDs of the genes whose fraction reaches the threshold (these are kept).
filtered_genes = gene_percentages.loc[
    lambda fractions: fractions >= threshold
].index

# Keep only the rows of the retained genes.
df_filtered = df.loc[filtered_genes, :]


In [19]:
# Display the IDs of the genes that passed the threshold.
filtered_genes

Index(['GeneB', 'GeneC'], dtype='object', name='GeneID')

In [21]:
# Display the count table restricted to the retained genes.
df_filtered

Unnamed: 0_level_0,Patient1,Patient2,Patient3
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GeneB,7,8,5
GeneC,10,12,9
