In [1]:
### Choosing genes for analysis ###

### Citations of tools ###
# Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. https://doi.org/10.1093/bioinformatics/btac836

In [None]:
### Syntax suggestions (reference snippets only; this cell is not executed) ###

# wt_probabilities = calculate_probs(np.asarray(df.query("Strain=='WT'")["Count"]))
# msh2_probabilities = calculate_probs(np.asarray(df.query("Strain=='MSH2'")["Count"]))

In [2]:
import pandas as pd
import gget
import json

In [3]:
# Show every row whenever a frame is displayed in this notebook.
pd.set_option('display.max_rows', None)

# Data file downloaded from https://www.encodeproject.org/experiments/ENCSR792OIJ/
# Accession ENCFF928NYA under processed data (tsv file)
# Data added 2020-11-04
columns = ['gene_id', 'TPM', 'FPKM']

# `usecols` parses only the columns that are needed and raises if one of them
# is absent from the file; re-wrapping the full table with
# pd.DataFrame(..., columns=...) would instead silently fill a missing column
# with NaN. The trailing [columns] pins the column order to the list above.
df = pd.read_table("ENCFF928NYA.tsv", usecols=columns)[columns]

In [4]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,gene_id,TPM,FPKM
0,10904,0.0,0.0
1,12954,0.0,0.0
2,12956,0.0,0.0
3,12958,0.0,0.0
4,12960,0.0,0.0


In [13]:
# Keep only the genes whose TPM is non-zero.
expressed_df = df.query("TPM!=0.0")

# High band: TPM strictly above high_thresh.
high_thresh = 200
high_thresh_df = expressed_df.query("TPM>@high_thresh")

# Low band:    lowest_thresh <= TPM < low_thresh
# Lowest band: TPM < lowest_thresh
# The lower bound of the low band is inclusive so that a gene whose TPM is
# exactly lowest_thresh lands in the low band instead of being dropped from
# both frames. Genes with low_thresh <= TPM <= high_thresh are not placed in
# any of these frames.
low_thresh = 10
lowest_thresh = 5
low_thresh_df = expressed_df.query("TPM>=@lowest_thresh & TPM<@low_thresh")
lowest_thresh_df = expressed_df.query("TPM<@lowest_thresh")

In [14]:
# Preview the first five rows that have a non-zero TPM.
expressed_df.head(n=5)

Unnamed: 0,gene_id,TPM,FPKM
46,13023,41.07,49.61
47,13024,16.31,19.7
465,30958,43.49,52.53
471,30964,52.81,63.79
649,ENSG00000000003.14,0.05,0.06


In [15]:
# Row labels picked by hand after looking through the entire dataframe; they
# were chosen so that the selection covers varying expression levels.
high_rows = [
    941, 1965, 10644, 17808, 24929, 48356,
    53485, 54599, 55026, 55115, 57853,
]
low_rows = [657, 926]
lowest_rows = [649, 652]

# Single flat list: high rows first, then low, then lowest.
chosen_rows = [*high_rows, *low_rows, *lowest_rows]

In [17]:
# Pull the hand-picked rows (selected by index label) into their own frame.
chosen_rows_df = df.loc[chosen_rows].copy()

# Gene IDs as a plain list, in the same order as the rows of chosen_rows_df.
# Reading the column directly replaces a row-by-row iterrows() loop.
chosen_rows_gene_ids = chosen_rows_df['gene_id'].tolist()

# Display every selected row.
chosen_rows_df.head(n=len(chosen_rows_df.index))

Unnamed: 0,gene_id,TPM,FPKM
941,ENSG00000011304.19,342.64,413.86
1965,ENSG00000075624.14,1567.7,1893.59
10644,ENSG00000156508.17,5000.06,6039.46
17808,ENSG00000196565.14,4169.72,5036.51
24929,ENSG00000222328.1,7046.27,8511.03
48356,ENSG00000263934.4,13928.59,16824.04
53485,ENSG00000274012.1,94300.79,113903.82
54599,ENSG00000276168.1,149831.97,180978.68
55026,ENSG00000277027.1,23024.05,27810.24
55115,ENSG00000277209.1,43982.92,53125.98


In [8]:
# Look up the selected gene IDs with gget. The IDs carry version suffixes; per
# the log message gget emits for this call, the returned information is linked
# to the latest Ensembl ID version.
res = gget.info(
    chosen_rows_gene_ids,
    json=True,
    uniprot=False,
)

Tue Jun 27 15:36:49 2023 INFO We noticed that you passed a version number with your Ensembl ID.
Please note that gget info will always return information linked to the latest Ensembl ID version (see 'ensembl_id').


In [18]:
# Echo every ID returned by gget together with its Ensembl gene name.
for key in res.keys():
    print(key, res[key]['ensembl_gene_name'])

# Build the list of gene names in the same order as chosen_rows_gene_ids.
# Each ID is looked up exactly (first as given, then with its ".<version>"
# suffix removed) rather than by prefix, so an ID that merely starts with
# another ID can never pick up the wrong name. IDs that are absent from the
# gget result keep a None placeholder.
ordered_gene_names = []
for gene_id in chosen_rows_gene_ids:
    gene_info = res.get(gene_id)
    if gene_info is None:
        gene_info = res.get(gene_id.split('.', 1)[0])
    ordered_gene_names.append(
        gene_info['ensembl_gene_name'] if gene_info is not None else None
    )

print(ordered_gene_names)

ENSG00000011304 PTBP1
ENSG00000075624 ACTB
ENSG00000156508 EEF1A1
ENSG00000196565 HBG2
ENSG00000222328 RNU2-2P
ENSG00000263934 SNORD3A
ENSG00000274012 RN7SL2
ENSG00000276168 RN7SL1
ENSG00000277027 RMRP
ENSG00000277209 RPPH1
ENSG00000283293 RN7SK
ENSG00000001084 GCLC
ENSG00000011021 CLCN6
ENSG00000000003 TSPAN6
ENSG00000000457 SCYL3
['PTBP1', 'ACTB', 'EEF1A1', 'HBG2', 'RNU2-2P', 'SNORD3A', 'RN7SL2', 'RN7SL1', 'RMRP', 'RPPH1', 'RN7SK', 'GCLC', 'CLCN6', 'TSPAN6', 'SCYL3']


In [19]:
# Add the gene names as the second column of chosen_rows_df.
# insert() mutates the frame in place, so the column is dropped first if it is
# already there: re-running this cell then replaces the column instead of
# silently appending a duplicate of it.
gene_name_column = "gene name (ensembl)"
if gene_name_column in chosen_rows_df.columns:
    chosen_rows_df = chosen_rows_df.drop(columns=gene_name_column)
chosen_rows_df.insert(1, gene_name_column, ordered_gene_names)

In [20]:
# Display every selected row, now including the gene-name column.
chosen_rows_df.head(len(chosen_rows_df))

Unnamed: 0,gene_id,gene name (ensembl),TPM,FPKM
941,ENSG00000011304.19,PTBP1,342.64,413.86
1965,ENSG00000075624.14,ACTB,1567.7,1893.59
10644,ENSG00000156508.17,EEF1A1,5000.06,6039.46
17808,ENSG00000196565.14,HBG2,4169.72,5036.51
24929,ENSG00000222328.1,RNU2-2P,7046.27,8511.03
48356,ENSG00000263934.4,SNORD3A,13928.59,16824.04
53485,ENSG00000274012.1,RN7SL2,94300.79,113903.82
54599,ENSG00000276168.1,RN7SL1,149831.97,180978.68
55026,ENSG00000277027.1,RMRP,23024.05,27810.24
55115,ENSG00000277209.1,RPPH1,43982.92,53125.98
