In [48]:
### Choosing genes for analysis ###

### Citations of tools ###
# gget: Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. https://doi.org/10.1093/bioinformatics/btac836
# pybiomart: https://github.com/jrderuiter/pybiomart

In [49]:
import pandas as pd
import gget
import json

In [50]:
# data file downloaded from https://www.encodeproject.org/experiments/ENCSR792OIJ/
# Accession ENCFF928NYA under processed data (tsv file) 
# Data added 2020-11-04
df = pd.read_table("ENCFF928NYA.tsv")
columns = ['gene_id', 'TPM', 'FPKM']
df = pd.DataFrame(df, columns=columns)

In [51]:
df.head()

Unnamed: 0,gene_id,TPM,FPKM
0,10904,0.0,0.0
1,12954,0.0,0.0
2,12956,0.0,0.0
3,12958,0.0,0.0
4,12960,0.0,0.0


In [52]:
expressed_df = df.query("TPM!=0.0")

high_thresh = 200
high_thresh_df = expressed_df.query("TPM>@high_thresh")

low_thresh = 10
low_thresh_df = expressed_df.query("TPM>5 & TPM<@low_thresh")
lowest_thresh_df = expressed_df.query("TPM<5")

In [53]:
expressed_df.head()

Unnamed: 0,gene_id,TPM,FPKM
46,13023,41.07,49.61
47,13024,16.31,19.7
465,30958,43.49,52.53
471,30964,52.81,63.79
649,ENSG00000000003.14,0.05,0.06


In [54]:
# rows were selected by looking at the entire dataframe and were chosen for varying expression levels
high_rows = [941, 1965, 10644, 17808, 24929, 48356, 53485, 54599, 55026, 55115, 57853]
low_rows = [657, 926]
lowest_rows = [649, 652]

chosen_rows = high_rows + low_rows + lowest_rows

In [10]:
chosen_rows_df = pd.DataFrame(df.loc[chosen_rows])

chosen_rows_gene_ids = []
for index, row in chosen_rows_df.iterrows():
    chosen_rows_gene_ids.append(row['gene_id'])
    
chosen_rows_df.head(n=len(chosen_rows_df.index))

Unnamed: 0,gene_id,TPM,FPKM
941,ENSG00000011304.19,342.64,413.86
1965,ENSG00000075624.14,1567.7,1893.59
10644,ENSG00000156508.17,5000.06,6039.46
17808,ENSG00000196565.14,4169.72,5036.51
24929,ENSG00000222328.1,7046.27,8511.03
48356,ENSG00000263934.4,13928.59,16824.04
53485,ENSG00000274012.1,94300.79,113903.82
54599,ENSG00000276168.1,149831.97,180978.68
55026,ENSG00000277027.1,23024.05,27810.24
55115,ENSG00000277209.1,43982.92,53125.98


In [56]:
### METHOD USING GGET ###

In [11]:
res = gget.info(chosen_rows_gene_ids, json=True, uniprot=False)

Tue Jun 27 20:57:56 2023 INFO We noticed that you passed a version number with your Ensembl ID.
Please note that gget info will always return information linked to the latest Ensembl ID version (see 'ensembl_id').


In [57]:
ordered_gene_names = [None] * len(chosen_rows_gene_ids)

for key in res.keys():
    for i in range(len(chosen_rows_gene_ids)):
        if chosen_rows_gene_ids[i].startswith(key):
            ordered_gene_names[i] = res[key]['ensembl_gene_name']

In [58]:
chosen_rows_df.insert(1, "gene name (ensembl)", ordered_gene_names, True)
chosen_rows_df.head(n=len(chosen_rows_df.index))

Unnamed: 0,gene_id,gene name (ensembl),gene name (ensembl).1,TPM,FPKM
941,ENSG00000011304.19,PTBP1,PTBP1,342.64,413.86
1965,ENSG00000075624.14,ACTB,ACTB,1567.7,1893.59
10644,ENSG00000156508.17,EEF1A1,EEF1A1,5000.06,6039.46
17808,ENSG00000196565.14,HBG2,HBG2,4169.72,5036.51
24929,ENSG00000222328.1,RNU2-2P,RNU2-2P,7046.27,8511.03
48356,ENSG00000263934.4,SNORD3A,SNORD3A,13928.59,16824.04
53485,ENSG00000274012.1,RN7SL2,RN7SL2,94300.79,113903.82
54599,ENSG00000276168.1,RN7SL1,RN7SL1,149831.97,180978.68
55026,ENSG00000277027.1,RMRP,RMRP,23024.05,27810.24
55115,ENSG00000277209.1,RPPH1,RPPH1,43982.92,53125.98


In [62]:
# However, we need the RefSeq ID beginning in NM_ to use PaintSHOP, and gget doesn't let us do that. 
# So, the following method uses pybiomart which uses the ensembl API and MANE which correlates RefSeq IDs to Ensembl IDs.

In [61]:
from pybiomart import Dataset

dataset = Dataset(name='hsapiens_gene_ensembl',
                  host='http://www.ensembl.org')

In [66]:
refseq_results = []

# Gets the gene name and the RefSeq ID associated with the canonical (most common) transcript
for i in chosen_rows_gene_ids:
    gene_id = i.split(".")[0] # remove the version number
    result = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name', 'external_transcript_name', 'transcript_mane_select', 'transcript_is_canonical', 'ensembl_transcript_id'],
                  filters={'link_ensembl_gene_id': [gene_id], 'transcript_is_canonical' : True})
    refseq_results.append(result)

In [67]:
refseq_df = pd.concat(refseq_results, ignore_index = True)
refseq_df['Ensembl Canonical'] = refseq_df['Ensembl Canonical'].astype(bool)  
print(refseq_df.to_string())
refseq_df.head(n=len(refseq_df.index))

     Gene stable ID Gene name Transcript name RefSeq match transcript (MANE Select)  Ensembl Canonical Transcript stable ID
0   ENSG00000011304     PTBP1       PTBP1-203                           NM_002819.5               True      ENST00000356948
1   ENSG00000075624      ACTB        ACTB-217                           NM_001101.5               True      ENST00000646664
2   ENSG00000156508    EEF1A1      EEF1A1-201                           NM_001402.6               True      ENST00000309268
3   ENSG00000196565      HBG2        HBG2-201                           NM_000184.3               True      ENST00000336906
4   ENSG00000222328   RNU2-2P     RNU2-2P-201                                   NaN               True      ENST00000410396
5   ENSG00000263934   SNORD3A     SNORD3A-201                                   NaN               True      ENST00000584923
6   ENSG00000274012    RN7SL2      RN7SL2-201                                   NaN               True      ENST00000490232
7   ENSG

Unnamed: 0,Gene stable ID,Gene name,Transcript name,RefSeq match transcript (MANE Select),Ensembl Canonical,Transcript stable ID
0,ENSG00000011304,PTBP1,PTBP1-203,NM_002819.5,True,ENST00000356948
1,ENSG00000075624,ACTB,ACTB-217,NM_001101.5,True,ENST00000646664
2,ENSG00000156508,EEF1A1,EEF1A1-201,NM_001402.6,True,ENST00000309268
3,ENSG00000196565,HBG2,HBG2-201,NM_000184.3,True,ENST00000336906
4,ENSG00000222328,RNU2-2P,RNU2-2P-201,,True,ENST00000410396
5,ENSG00000263934,SNORD3A,SNORD3A-201,,True,ENST00000584923
6,ENSG00000274012,RN7SL2,RN7SL2-201,,True,ENST00000490232
7,ENSG00000276168,RN7SL1,RN7SL1-201,,True,ENST00000618786
8,ENSG00000277027,RMRP,RMRP-201,,True,ENST00000363046
9,ENSG00000277209,RPPH1,RPPH1-201,,True,ENST00000516869


In [68]:
# Note that several of the genes have no matching RefSeq ID. This is because they are not mRNA, but other kinds of RNA (i.e. snRNA or rRNA). Ensembl doesn't already have the RefSeq IDs available for this.

In [69]:
# Export to file to use in PaintSHOP
f = open("refseqs_sample.txt", "w")
for index, row in refseq_df.iterrows():
    if str(row['RefSeq match transcript (MANE Select)']) != 'nan':
        f.write(str(row['RefSeq match transcript (MANE Select)']) + '\n')
f.close()

In [70]:
# Then we use PaintSHOP to make probes: https://doi.org/10.1038/s41592-021-01187-3