# Table of Ensembl Gene IDs VS  Patient Numbers


- Retrieving data from the 'IR3/counts' folder involves extracting the final column (counts) associated with each patient during their baseline (BL) visit.

- Data from Project 133 RNA Sequencing Feature Counts/TPM (IR3/B38/Phases 1-2, version 2021-04-02

In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os
import glob
import functools
from pathlib import Path
import time
from datetime import datetime

In [2]:
# Note that the counts file in the IR3 is around 152 G, and the files are located in scratch area.

path1=Path("/scratch/znazari/PPMI_ver_sep2022/RNA_Seq_data/star_ir3/counts/")
path2 = Path("/home/znazari/data") # where the output data will be saved at the end.
path3=Path("/scratch/znazari/PPMI_ver_sep2022/study_data/Subject_Characteristics/")


<a id="matrixcreation"></a>
## Matrix of Gene IDs and Counts for Patients
 Loading the data from IR3/counts folder and extracting the column (counts) of each patient file for their BL visit.

In [3]:
# Get all file names in the folder
all_files = [file.name for file in path1.glob('*')]

# Filter the files that contain "BL" in their names
bl_files2 = [file for file in all_files if "BL" in file]

#Convert to dataframe
bl_files =pd.DataFrame(bl_files2)

# We define a function where we can take the second phrase seperated by dot. The second phrase 
# is the patient ID. So with this functin we want to get the patient IDs from their file's name
def function_names(fname):
    tokens=fname.split('.')
    return tokens[1]

# we create a list with the name of the each patients.
bl_list = [function_names(bl_files.iloc[i][0]) for i in range(len(bl_files))]

start_time = time.time()

# here we read all the files with with base visit(BL) from the counts folder (where we have all the files
# for all the patients and all the visit).
list_bl_files = [dd.read_csv(path1/bl_files.iloc[i][0],skiprows=1,delimiter='\t') for i in range(len(bl_files))]


# we get th last columns of each file in the list
last_columns = [ddf.iloc[:, -1:] for ddf in list_bl_files]

# concatinating the list of the columns in a single file.
single_file = dd.concat(last_columns, axis=1, ignore_unknown_divisions=True)

# we change the name of the each columns with the patient numbers.
single_file.columns = bl_list

# we get the Geneid column and convert it to dask dataframe
pd_tmp_file = list_bl_files[3].compute()
geneid = pd_tmp_file['Geneid']
ddf_geneid = dd.from_pandas(geneid, npartitions=1)

# here we set the Geneid column as the index of the matrix.
ddf_new_index = single_file.set_index(ddf_geneid)

# converting to pandas data frame and saving.
ir3_counts = ddf_new_index.compute()
ir3_counts.to_csv(path2/"matrix_ir3_counts_bl.csv")

end_time = time.time()

execution_time = end_time - start_time
print(f"Execution Time: {execution_time} seconds")

Execution Time: 1029.6724038124084 seconds


In [4]:
ir3_counts

Unnamed: 0_level_0,3385,3514,54854,55098,3390,3867,3776,3803,4058,3203,...,3322,54578,4025,3465,3518,42018,41662,3821,3162,3668
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.14,20,23,14,22,20,47,6,25,13,13,...,9,13,11,17,13,44,10,11,14,34
ENSG00000000005.5,4,6,2,0,0,12,1,3,3,1,...,0,1,0,0,0,2,0,0,0,0
ENSG00000000419.12,1818,223,514,725,777,1114,278,837,376,552,...,694,715,1051,1044,678,1623,339,431,501,1687
ENSG00000000457.13,3175,472,1463,2203,1370,1521,692,1830,843,1601,...,1657,1852,1956,2087,1550,3217,923,1265,1316,2324
ENSG00000000460.16,1080,215,514,668,417,351,202,952,344,516,...,581,466,502,710,586,820,208,411,393,837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000285990.1,2,4,0,0,0,5,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
ENSG00000285991.1,3,8,0,0,1,4,0,0,0,0,...,0,5,1,0,2,2,0,1,0,5
ENSG00000285992.1,3,7,2,0,0,2,0,2,2,1,...,0,0,0,0,1,0,0,0,0,3
ENSG00000285993.1,3,4,2,0,4,1,0,3,4,0,...,0,0,0,1,0,2,0,0,0,0


In [5]:
# Get the current date
current_date = datetime.now().date()

# Print the current date
print("Last update :", current_date)

Last update : 2024-02-08
