 # <span style="color:#8B4513;"> RNA-Seq Data Preprocessing STEP I </span>

- We keep only individuals with a diagnosis of Healthy Control or Parkinson's Disease.
- We remove patients who carry any of these gene mutations: SNCA, GBA, LRRK2, as well as patients taking dopaminergic drugs.
- We remove the duplicated gene IDs, which are also lowly expressed.
- 0000
- We only keep genes in the intersection of the counts and quants gene IDs with the protein-coding and non-protein-coding lncRNA genes.
- We remove lowly expressed genes by keeping only genes that had more than five counts in at least 10% of the individuals, which left us with 21,273 genes.

In [1]:
import pandas as pd
import os
from pathlib import Path

In [2]:
# Directory holding the subject-characteristics tables (the participant
# status file is read from here).
path3 = Path("/scratch/znazari/PPMI_ver_sep2022/study_data/Subject_Characteristics/")

# Working data directory: intermediate tables are read from here and the
# output data will be saved here at the end.
path2 = Path("/home/znazari/data")

In [3]:
# Load the main table of gene IDs vs. individuals and use the 'Geneid'
# column as the row index.
read_ir3_counts = pd.read_csv(path2 / "matrix_ir3_counts_bl.csv").set_index('Geneid')

In [4]:
# Load the participant status table, which contains the diagnosis.
# The file is read without a header and its first row is then promoted to
# column names, so that row is dropped from the data afterwards.
# NOTE(review): reading with header=None presumably keeps PATNO as strings so
# that it can match the column labels of the count matrix -- confirm.
diago = pd.read_csv(path3 / "Participant_Status.csv", header=None)
header_row = diago.iloc[0]
diago1 = (
    diago.rename(columns=header_row)
    .drop(diago.index[0])
    .reset_index(drop=True)
)

# Keep only participants whose cohort is Parkinson's disease or healthy control.
is_pd_or_hc = diago1['COHORT_DEFINITION'].isin(['Healthy Control', "Parkinson's Disease"])
selected_diagnosis_pd_hc = diago1[is_pd_or_hc]
pd_hc = selected_diagnosis_pd_hc['PATNO']

# Restrict the count matrix to the columns of those participants.
keep_columns = read_ir3_counts.columns.isin(pd_hc)
filtered_df = read_ir3_counts.loc[:, keep_columns]

In [5]:
# Load the list of patients to exclude: carriers of the gene mutations or
# users of dopaminergic drugs (the union of both groups).
union_drugs_mutations = pd.read_csv(path2 / 'union_drugs_mutations.csv', index_col=0)

# The patient IDs are stored in the column labelled '0'; they are cast to str
# before being used as column labels to drop.
s_union_drugs_mutations = union_drugs_mutations['0']
s_union_drugs_mutations_str = s_union_drugs_mutations.astype(str)

# Drop the excluded patients' columns; IDs that are not present in the
# matrix are skipped silently (errors='ignore').
filtered_df_drug = filtered_df.drop(
    s_union_drugs_mutations_str,
    axis='columns',
    errors='ignore',
)

In [None]:
# Remove duplicated genes

# We need to find the duplicated genes, take the mean of their values, and keep that mean
# under a new entry named by the gene ID without its version; all other versions are then removed.

# Since we do not consider sex biases in this study, we remove genes on the Y chromosome,
# as they can be found only in males.
# I need to find out which gene IDs belong to the Y chromosome and then remove them from the list.

In [12]:
# Strip everything after the first dot (.) from the gene IDs, i.e. remove the
# version (and any suffix that follows it).
# NOTE: this rewrites the index of read_ir3_counts itself, so every later use
# of that table sees the version-less IDs.
read_ir3_counts.index = read_ir3_counts.index.str.split('.').str[0]

# Gene IDs that occur more than once after stripping. `.unique()` keeps each
# ID once even when it occurs three or more times; without it the `.loc`
# lookup below would return the rows of such an ID several times over.
duplicated_indices = read_ir3_counts.index[read_ir3_counts.index.duplicated()].unique()

# All rows (every copy) of the duplicated gene IDs, as an independent frame so
# that adding a column does not touch read_ir3_counts.
new_df = read_ir3_counts.loc[duplicated_indices].copy()

# Total count per row across all columns.
new_df['sum'] = new_df.sum(axis=1)

# Order the duplicated genes from highest to lowest total count.
new_df = new_df.sort_values(by='sum', ascending=False)

In [14]:
# Order the duplicated-gene table by gene ID and collect its IDs as a plain
# Python list (each ID appears once per row it has in the table).
new_df = new_df.sort_index()
gene_id_duplicated = new_df.index
list_duplicated_genes = gene_id_duplicated.tolist()

In [17]:
# Find the rows of the filtered matrix whose gene ID, once the version is
# stripped, is one of the duplicated gene IDs. The stripping is the same one
# that produced list_duplicated_genes, so the comparison is exact. An exact
# membership test is used instead of a '|'-joined regex: that regex matched
# by substring, repeated every ID, and became the empty pattern (matching
# every row) when the list was empty.
duplicated_gene_ids = set(list_duplicated_genes)
base_gene_ids = filtered_df_drug.index.str.split('.').str[0]
matching_rows = filtered_df_drug[base_gene_ids.isin(duplicated_gene_ids)]

# Display the matching (versioned) gene IDs.
list(matching_rows.index)

['ENSG00000002586.19',
 'ENSG00000002586.19_PAR_Y',
 'ENSG00000124333.15',
 'ENSG00000124333.15_PAR_Y',
 'ENSG00000124334.17',
 'ENSG00000124334.17_PAR_Y',
 'ENSG00000167393.17',
 'ENSG00000167393.17_PAR_Y',
 'ENSG00000168939.11',
 'ENSG00000168939.11_PAR_Y',
 'ENSG00000169084.13',
 'ENSG00000169084.13_PAR_Y',
 'ENSG00000169093.15',
 'ENSG00000169093.15_PAR_Y',
 'ENSG00000169100.13',
 'ENSG00000169100.13_PAR_Y',
 'ENSG00000178605.13',
 'ENSG00000178605.13_PAR_Y',
 'ENSG00000182162.10',
 'ENSG00000182162.10_PAR_Y',
 'ENSG00000182378.13',
 'ENSG00000182378.13_PAR_Y',
 'ENSG00000182484.15',
 'ENSG00000182484.15_PAR_Y',
 'ENSG00000185203.12',
 'ENSG00000185203.12_PAR_Y',
 'ENSG00000185291.11',
 'ENSG00000185291.11_PAR_Y',
 'ENSG00000185960.14',
 'ENSG00000185960.14_PAR_Y',
 'ENSG00000196433.12',
 'ENSG00000196433.12_PAR_Y',
 'ENSG00000197976.11',
 'ENSG00000197976.11_PAR_Y',
 'ENSG00000198223.16',
 'ENSG00000198223.16_PAR_Y',
 'ENSG00000205755.11',
 'ENSG00000205755.11_PAR_Y',
 'ENSG000002

In [None]:
# Select the rows (genes) whose total count across all columns is zero.
# The totals are computed here because read_ir3_counts has no 'sum' column
# (that column was only added to new_df), so indexing it raised a KeyError.
row_totals = read_ir3_counts.sum(axis=1)

# NOTE(review): this rebinds read_ir3_counts to the zero-sum rows only. To
# remove those rows instead, keep the rows where `row_totals != 0`.
read_ir3_counts = read_ir3_counts[row_totals == 0]

read_ir3_counts.shape

In [None]:
# Load the gene lists used to keep only protein-coding and non-protein-coding
# lncRNA genes. All files are tab-separated; the first line of the two HGNC
# files is skipped.
protein_coding = pd.read_csv(
    path2 / 'HGNC_protein_coding_genes_31Jan2024.txt',
    sep='\t',
    skiprows=1,
)
non_protein_codying = pd.read_csv(
    path2 / 'HGNC_lncRNA_non_coding_genes_31Jan2024.txt',
    sep='\t',
    skiprows=1,
)

# All gene IDs of the IR3 counts table, and their intersection with the HGNC
# lncRNA / protein-coding lists.
all_genes = pd.read_csv(path2 / 'IR3_counts_all_gene_IDs.txt', sep='\t')
intersection_genes = pd.read_csv(
    path2 / 'Intersection_genes_IR3_counts_gene_IDs_with_HGNC_lncRNA_Protein_Coding_31Jan2024.txt',
    sep='\t',
)

In [None]:
#rna_filtered.to_csv(path2/'ir3_rna_step1.csv', index=True)
# combine the target as well