In [1]:
### Choosing genes for analysis ###

### Citations of tools ###
# gget: Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. https://doi.org/10.1093/bioinformatics/btac836
# pybiomart: https://github.com/jrderuiter/pybiomart

In [85]:
import pandas as pd
pd.set_option('display.max_columns', None)
import gget
import json

In [54]:
# data file downloaded from https://www.encodeproject.org/experiments/ENCSR792OIJ/
# Accession ENCFF928NYA under processed data (tsv file) 
# Data added 2020-11-04
df = pd.read_table("ENCFF928NYA.tsv")
columns = ['gene_id', 'TPM', 'FPKM', 'transcript_id(s)']
df = pd.DataFrame(df, columns=columns)

In [55]:
df.head()

Unnamed: 0,gene_id,TPM,FPKM,transcript_id(s)
0,10904,0.0,0.0,10904
1,12954,0.0,0.0,12954
2,12956,0.0,0.0,12956
3,12958,0.0,0.0,12958
4,12960,0.0,0.0,12960


In [68]:
# visualizations to choose genes which was done manually -- will be useful for automation

# expressed_df = df.query("TPM!=0.0")

# high_thresh = 200
# high_thresh_df = expressed_df.query("TPM>@high_thresh")

# low_thresh = 10
# low_thresh_df = expressed_df.query("TPM>5 & TPM<@low_thresh")
# lowest_thresh_df = expressed_df.query("TPM<5")


In [57]:
# print(high_thresh_df.to_string())
#print(len(mid_thresh_df.index))

In [58]:
# df.query("gene_id=='ENSG00000111640.14'")

In [59]:
expressed_df.head()

Unnamed: 0,gene_id,TPM,FPKM
46,13023,41.07,49.61
47,13024,16.31,19.7
465,30958,43.49,52.53
471,30964,52.81,63.79
649,ENSG00000000003.14,0.05,0.06


In [60]:
# rows were selected by looking at the entire dataframe and were chosen for varying expression levels
high_rows = [941, 1965, 4142, 4563, 10644, 17808, 24929, 48356, 53485, 54599, 55026, 55115, 57853]
low_rows = [657, 926]
lowest_rows = [649, 652, 1418, 21010]

chosen_rows = high_rows + low_rows + lowest_rows

In [69]:
chosen_rows_df = pd.DataFrame(df.loc[chosen_rows])

chosen_rows_gene_ids = []
for index, row in chosen_rows_df.iterrows():
    chosen_rows_gene_ids.append(row['gene_id'])
    
chosen_rows_df.head(n=len(chosen_rows_df.index))

Unnamed: 0,gene_id,TPM,FPKM,transcript_id(s)
941,ENSG00000011304.19,342.64,413.86,"ENST00000349038.8,ENST00000350092.8,ENST000003..."
1965,ENSG00000075624.14,1567.7,1893.59,"ENST00000331789.10,ENST00000414620.1,ENST00000..."
4142,ENSG00000108107.14,1401.16,1692.44,"ENST00000344063.6,ENST00000426763.3,ENST000004..."
4563,ENSG00000111640.14,1270.9,1535.09,"ENST00000229239.9,ENST00000396856.5,ENST000003..."
10644,ENSG00000156508.17,5000.06,6039.46,"ENST00000309268.10,ENST00000316292.13,ENST0000..."
17808,ENSG00000196565.14,4169.72,5036.51,"ENST00000336906.5,ENST00000380252.6,ENST000004..."
24929,ENSG00000222328.1,7046.27,8511.03,ENST00000410396.1
48356,ENSG00000263934.4,13928.59,16824.04,"ENST00000584923.1,ENST00000620232.1,ENST000006..."
53485,ENSG00000274012.1,94300.79,113903.82,ENST00000490232.3
54599,ENSG00000276168.1,149831.97,180978.68,ENST00000618786.1


In [10]:
### METHOD USING GGET ###

In [11]:
res = gget.info(chosen_rows_gene_ids, json=True, uniprot=False)

Fri Jun 30 03:35:01 2023 INFO We noticed that you passed a version number with your Ensembl ID.
Please note that gget info will always return information linked to the latest Ensembl ID version (see 'ensembl_id').


In [12]:
ordered_gene_names = [None] * len(chosen_rows_gene_ids)

for key in res.keys():
    for i in range(len(chosen_rows_gene_ids)):
        if chosen_rows_gene_ids[i].startswith(key):
            ordered_gene_names[i] = res[key]['ensembl_gene_name']

In [13]:
chosen_rows_df.insert(1, "gene name (ensembl)", ordered_gene_names, True)
chosen_rows_df.head(n=len(chosen_rows_df.index))

Unnamed: 0,gene_id,gene name (ensembl),TPM,FPKM
941,ENSG00000011304.19,PTBP1,342.64,413.86
1965,ENSG00000075624.14,ACTB,1567.7,1893.59
4142,ENSG00000108107.14,RPL28,1401.16,1692.44
10644,ENSG00000156508.17,EEF1A1,5000.06,6039.46
17808,ENSG00000196565.14,HBG2,4169.72,5036.51
24929,ENSG00000222328.1,RNU2-2P,7046.27,8511.03
48356,ENSG00000263934.4,SNORD3A,13928.59,16824.04
53485,ENSG00000274012.1,RN7SL2,94300.79,113903.82
54599,ENSG00000276168.1,RN7SL1,149831.97,180978.68
55026,ENSG00000277027.1,RMRP,23024.05,27810.24


In [14]:
# However, we need the RefSeq ID beginning in NM_ to use PaintSHOP, and gget doesn't let us do that. 
# So, the following method uses pybiomart which uses the ensembl API and MANE (Matched Annotation from the NCBI and EMBL-EBI) which correlates RefSeq IDs to Ensembl IDs.

In [62]:
from pybiomart import Dataset

dataset = Dataset(name='hsapiens_gene_ensembl',
                  host='http://www.ensembl.org')

In [63]:
refseq_results = []

# Gets the gene name and the RefSeq ID associated with the canonical (most common) transcript
for i in chosen_rows_gene_ids:
    gene_id = i.split(".")[0] # remove the version number
    result = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name', 'external_transcript_name', 'transcript_mane_select', 'transcript_is_canonical', 'ensembl_transcript_id'],
                  filters={'link_ensembl_gene_id': [gene_id], 'transcript_is_canonical' : True})
    refseq_results.append(result)
    
to_extract = chosen_rows_df.reset_index()

In [65]:
refseq_df = pd.concat(refseq_results, ignore_index = True)
refseq_df.insert(2, "TPM", to_extract["TPM"])
refseq_df.insert(3, "FPKM", to_extract["FPKM"])
refseq_df.insert(3, "transcript_id(s)", to_extract["transcript_id(s)"])
refseq_df['Ensembl Canonical'] = refseq_df['Ensembl Canonical'].astype(bool)  
print(refseq_df.to_string())
refseq_df.head(n=len(refseq_df.index))

     Gene stable ID Gene name        TPM                                                                                                                                                                                                                                                                                                                                                                            transcript_id(s)       FPKM Transcript name RefSeq match transcript (MANE Select)  Ensembl Canonical Transcript stable ID
0   ENSG00000011304     PTBP1     342.64  ENST00000349038.8,ENST00000350092.8,ENST00000356948.10,ENST00000394601.8,ENST00000585535.2,ENST00000585856.2,ENST00000585932.5,ENST00000585956.5,ENST00000586481.5,ENST00000586944.5,ENST00000587094.2,ENST00000587136.1,ENST00000587191.3,ENST00000589575.5,ENST00000589883.1,ENST00000590887.5,ENST00000592113.5,ENST00000592804.1,ENST00000621737.4,ENST00000627714.2,ENST00000635647.1     413.86       PTBP1-203                           

Unnamed: 0,Gene stable ID,Gene name,TPM,transcript_id(s),FPKM,Transcript name,RefSeq match transcript (MANE Select),Ensembl Canonical,Transcript stable ID
0,ENSG00000011304,PTBP1,342.64,"ENST00000349038.8,ENST00000350092.8,ENST000003...",413.86,PTBP1-203,NM_002819.5,True,ENST00000356948
1,ENSG00000075624,ACTB,1567.7,"ENST00000331789.10,ENST00000414620.1,ENST00000...",1893.59,ACTB-217,NM_001101.5,True,ENST00000646664
2,ENSG00000108107,RPL28,1401.16,"ENST00000344063.6,ENST00000426763.3,ENST000004...",1692.44,RPL28-201,NM_000991.5,True,ENST00000344063
3,ENSG00000111640,GAPDH,1270.9,"ENST00000229239.9,ENST00000396856.5,ENST000003...",1535.09,GAPDH-201,NM_002046.7,True,ENST00000229239
4,ENSG00000156508,EEF1A1,5000.06,"ENST00000309268.10,ENST00000316292.13,ENST0000...",6039.46,EEF1A1-201,NM_001402.6,True,ENST00000309268
5,ENSG00000196565,HBG2,4169.72,"ENST00000336906.5,ENST00000380252.6,ENST000004...",5036.51,HBG2-201,NM_000184.3,True,ENST00000336906
6,ENSG00000222328,RNU2-2P,7046.27,ENST00000410396.1,8511.03,RNU2-2P-201,,True,ENST00000410396
7,ENSG00000263934,SNORD3A,13928.59,"ENST00000584923.1,ENST00000620232.1,ENST000006...",16824.04,SNORD3A-201,,True,ENST00000584923
8,ENSG00000274012,RN7SL2,94300.79,ENST00000490232.3,113903.82,RN7SL2-201,,True,ENST00000490232
9,ENSG00000276168,RN7SL1,149831.97,ENST00000618786.1,180978.68,RN7SL1-201,,True,ENST00000618786


In [80]:
# confirm that canonical transcript is among those expressed
for i, row in refseq_df.iterrows():
    found = False
    stable_t_id = row["Transcript stable ID"]
    transcript_ids = row["transcript_id(s)"].split(",")
    for t_id in transcript_ids:
        if t_id.startswith(stable_t_id):
            print(row["Gene name"], stable_t_id, t_id)
            found = True
    if (not found):
        print(stable_t_id, ": canonical transcript not present")

PTBP1 ENST00000356948 ENST00000356948.10
ACTB ENST00000646664 ENST00000646664.1
RPL28 ENST00000344063 ENST00000344063.6
GAPDH ENST00000229239 ENST00000229239.9
EEF1A1 ENST00000309268 ENST00000309268.10
HBG2 ENST00000336906 ENST00000336906.5
RNU2-2P ENST00000410396 ENST00000410396.1
SNORD3A ENST00000584923 ENST00000584923.1
RN7SL2 ENST00000490232 ENST00000490232.3
RN7SL1 ENST00000618786 ENST00000618786.1
RMRP ENST00000363046 ENST00000363046.1
RPPH1 ENST00000516869 ENST00000516869.1
RN7SK ENST00000636484 ENST00000636484.1
GCLC ENST00000650454 ENST00000650454.1
CLCN6 ENST00000346436 ENST00000346436.10
TSPAN6 ENST00000373020 ENST00000373020.8
SCYL3 ENST00000367771 ENST00000367771.10
SERPINB3 ENST00000283752 ENST00000283752.9
SERPINB4 ENST00000341074 ENST00000341074.9


In [18]:
# mergedRes = pd.merge(expressed_df, refseq_df, on ='gene_id')
# mergedRes.head(len(mergedRes.index))

In [19]:
# Note that several of the genes have no matching RefSeq ID. This is because they are not mRNA, but other kinds of RNA (i.e. snRNA or rRNA). 
# Ensembl doesn't already have the RefSeq IDs available for this. --> see PartialAnalysis notebook to filter these out

In [66]:
valid_seqs = []
# Export to file to use in PaintSHOP
f = open("refseqs_sample.txt", "w")
for index, row in refseq_df.iterrows():
    if str(row['RefSeq match transcript (MANE Select)']) != 'nan':
        valid_seqs.append(str(row['RefSeq match transcript (MANE Select)']))
        f.write(str(row['RefSeq match transcript (MANE Select)']) + '\n')
f.close()

In [82]:
# original 8 + 4
# 19 - 7 with no NM_
f = open("refseqs_sample.txt", "r")
for line in f:
    print(line)

NM_002819.5

NM_001101.5

NM_000991.5

NM_002046.7

NM_001402.6

NM_000184.3

NM_001498.4

NM_001286.5

NM_003270.4

NM_020423.7

NM_006919.3

NM_002974.4



In [67]:
# Then we use PaintSHOP to make probes: https://doi.org/10.1038/s41592-021-01187-3

In [117]:
data = pd.read_table('230629-PaintSHOP-probes.txt')

In [114]:
data.head(n=20)

Unnamed: 0,refseq,chrom,start,stop,sequence,Tm,on_target,off_target,repeat_seq,prob,max_kmer,probe_strand,transcript_id,gene_id
0,NM_000558,chr16,177346,177375,AGCCAGGAACTTGTCCAGGGAGGCGTGCAC,46.93,99.744,99.744,0,0.259,2,-,NM_000558,HBA1
1,NM_000558,chr16,177407,177436,GAAGCATGGCCACCGAGGCTCCAGCTTAAC,44.4,98.814,0.0,0,0.212,2,-,NM_000558,HBA1
2,NM_000558,chr16,176735,176765,TACCCCAGGCGGCCTTGACGTTGGTCTTGTC,46.44,99.573,99.573,0,0.195,2,-,NM_000558,HBA1
3,NM_000558,chr16,176988,177018,TTCTTGCCGTGGCCCTTAACCTGGGCAGAGC,46.96,99.631,99.631,0,0.214,2,-,NM_000558,HBA1
4,NM_000558,chr16,176703,176732,CAGGAGACAGCACCATGGTGGGTTCTCTCT,42.76,98.707,98.707,0,0.215,2,-,NM_000558,HBA1
5,NM_000558,chr16,177376,177406,GGTATTTGGAGGTCAGCACGGTGCTCACAGA,43.24,100.0,100.0,0,0.171,2,-,NM_000558,HBA1
6,NM_000558,chr16,177100,177129,AAGTTGACCGGGTCCACCCGAAGCTTGTGC,45.73,100.0,100.0,0,0.125,2,-,NM_000558,HBA1
7,NM_000558,chr16,176949,176978,GGTCGAAGTGCGGGAAGTAGGTCTTGGTGG,43.51,100.0,293.621,0,0.288,4,-,NM_000558,HBA1
8,NM_000558,chr16,177487,177518,GCCCACTCAGACTTTATTCAAAGACCACGGGG,42.17,100.0,54.236,0,0.358,2,-,NM_000558,HBA1
9,NM_000968,chr15,66504789,66504818,ATGGCGGAGAGAGGAGACAGCCACGCTCCT,46.98,98.944,98.944,0,0.146,4,+,NM_000968,RPL4


In [115]:
mask = (data['sequence'].str.len() <= 31)
data = data.loc[mask]
data = data.sort_values(['gene_id', 'off_target'], ascending=[True, True])
data.head(n=300)

#data.query("'gene_id'=='EEF1A1'").head(n=20)

Unnamed: 0,refseq,chrom,start,stop,sequence,Tm,on_target,off_target,repeat_seq,prob,max_kmer,probe_strand,transcript_id,gene_id
1,NM_000558,chr16,177407,177436,GAAGCATGGCCACCGAGGCTCCAGCTTAAC,44.40,98.814,0.000,0,0.212,2,-,NM_000558,HBA1
4,NM_000558,chr16,176703,176732,CAGGAGACAGCACCATGGTGGGTTCTCTCT,42.76,98.707,98.707,0,0.215,2,-,NM_000558,HBA1
2,NM_000558,chr16,176735,176765,TACCCCAGGCGGCCTTGACGTTGGTCTTGTC,46.44,99.573,99.573,0,0.195,2,-,NM_000558,HBA1
3,NM_000558,chr16,176988,177018,TTCTTGCCGTGGCCCTTAACCTGGGCAGAGC,46.96,99.631,99.631,0,0.214,2,-,NM_000558,HBA1
0,NM_000558,chr16,177346,177375,AGCCAGGAACTTGTCCAGGGAGGCGTGCAC,46.93,99.744,99.744,0,0.259,2,-,NM_000558,HBA1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,NM_000968,chr15,66503107,66503136,CTGGGAATTCGAGCCACAGCTCTGCCAGTA,43.27,100.000,196.996,0,0.184,2,+,NM_000968,RPL4
20,NM_000968,chr15,66501441,66501471,GCTGGATACGGCGACGGTTTCTCATTTTGCC,43.23,99.074,198.148,0,0.233,4,+,NM_000968,RPL4
33,NM_000968,chr15,66503077,66503106,CCAGAGCGGTGAGTCCCACCACCTCGAACT,45.90,98.931,282.175,0,0.236,3,+,NM_000968,RPL4
15,NM_000968,chr15,66502640,66502669,TGAGGCAGCCAGGGCAGAACAGATGGCGTA,46.29,98.291,289.607,0,0.267,5,+,NM_000968,RPL4


In [116]:
gene_probes_dict = {}

for i in valid_seqs:
    gene = data.query("refseq=='" + i.split(".")[0] + "'")
    gene_probes_dict[i] = gene
    print(i, len(gene.index))

NM_002819.5 0
NM_001101.5 0
NM_000991.5 48
NM_002046.7 0
NM_001402.6 0
NM_000184.3 0
NM_001498.4 0
NM_001286.5 0
NM_003270.4 0
NM_020423.7 0
NM_006919.3 0
NM_002974.4 0


In [111]:
to_print = gene_probes_dict["NM_001286.5"]
for i, row in to_print.iterrows():
    print("> ID" + str(i))
    print(row["sequence"])

> ID19
AGGAAGCATCTTCCCCACGCACCCTCCCTG
> ID23
AAGGAAGCTGCGACTGCGGCAATGGCCAAG
> ID24
CCCTGATCACCTCGCCTAGGGGCCATGGAA
> ID26
TTTCCACGTGAGCCCTTGGTTCCAGAAGGA
> ID30
ATGCTCACTTTGGGTGCCTAGTCAGAGGGG
> ID35
GTCCTCGGCCATCTCGGCATAGGAGAGGCG
> ID37
TCTGTGACCACCGGGAAGGCATGGTGGACC
> ID38
TCACTGAACGACAGGGTGCGCTTCCCATGC
> ID39
TCCGCAGAGCTGTGGTGCAATACTGCACAT
> ID43
AGGGGAGAGAGAGGTGTTTGTGAAGGGCAG
> ID44
GATTGGCGCCCCGAAAGCTGCAGCAACTCC
> ID46
GCAGCTCCACACAGCAGAGAAGGCACAAAA
> ID47
ACTGCCACTCTGGACCCCTCTACCAACCCC
> ID48
ATCTGCTGCAGGAGGTCCTCCTTCTCGGCT
> ID50
ATGGTCATGCGGACCACCCCGCCCAAGAAA
> ID52
GGTGACAAGCTGCGACCGAAGGATCAGGCC
> ID54
GCAAGACCAGAGCCTCTGCTCACACCCAGG
> ID56
GGGAGCTGGAAGGAACCCCAGCTTCCAAAC
> ID57
CCGTAGGCTCCCCATCCACAGATCTGCTTTA
> ID63
TTCATGTGAAACCTTCAGGCACGTGCTGGC
> ID65
GTGGAAGGTCAGAGGGCGGAACCGCTCCTC
> ID66
TTCATGGACTGGCTCCGTTTGCGCTGCTCG
> ID67
GTGTGCGGGTAGACGTAGGTCAGGTTGGGC
> ID69
CGCAGGATGCTCACCAGAGACTGGATGCGG
> ID70
TGGTGGGTGATGCCAAGATCCTCTGAGCTC
> ID73
AAGGCAGACTTGGCCAAGGAGGTCGCCCAG
> ID74
GACA

In [102]:
# HGB2: 
# 2: one for HGB1 and another for HGB2 (one off target on ch2 --> not a gene)
# 6: one for HGB1 and another for HGB2
# 12: one for HGB1 and another for HGB2
# 0: one for HGB1 and another for HGB2 (off target gene on ch6)
# 12, 6, 2, 7, 8 (7, and 8 made smaller to fit)

# ACTB:
# 14: ch7 -> match to ACTB, ch6 no gene, ch2 POTEI off target
# 18: of 7 results, 2 no gene, 1 is ACTB, 4 off target genes
# 17: has ACTB, 3 no gene
# 13: has ACTB, ACTG1 and OMIM genes on ch17, 2 no gene
# 16: has ACTB, lots of overlapping genes

# TSPAN6:
# 257 has off target gene with correct TSPAN6
# 249, 242, and 255 are good

# None for EEF1A1

# PTBP1:
# 183, 186, 187, 188, 190
# 185 is bad
# 233, 210, 234, 219, 201, 216, 214, 236, 224 are bad

# GCLC:
# 156, 167, 169, 175, 177
# 145, 176, 165 are bad

# SCYL3:
# 289 and 294 are bad
# 268, 275, 276, 280, 284

# CLCN6:
# 106, 121, 109, 40 are bad
# 46, 54, 102 have no gene


In [112]:
to_print.head(n=70)

Unnamed: 0,refseq,chrom,start,stop,sequence,Tm,on_target,off_target,repeat_seq,prob,max_kmer,probe_strand,transcript_id,gene_id
19,NM_001286,chr1,11841340,11841369,AGGAAGCATCTTCCCCACGCACCCTCCCTG,46.41,100.000,0.000,0,0.251,2,-,NM_001286,CLCN6
23,NM_001286,chr1,11842940,11842969,AAGGAAGCTGCGACTGCGGCAATGGCCAAG,46.52,100.000,0.000,0,0.182,0,-,NM_001286,CLCN6
24,NM_001286,chr1,11842657,11842686,CCCTGATCACCTCGCCTAGGGGCCATGGAA,45.98,98.453,0.000,0,0.207,0,-,NM_001286,CLCN6
26,NM_001286,chr1,11827189,11827218,TTTCCACGTGAGCCCTTGGTTCCAGAAGGA,43.08,100.000,0.000,0,0.191,0,-,NM_001286,CLCN6
30,NM_001286,chr1,11841456,11841485,ATGCTCACTTTGGGTGCCTAGTCAGAGGGG,43.05,100.000,0.000,0,0.220,0,-,NM_001286,CLCN6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,NM_001286,chr1,11842708,11842737,TCCCTCAGCACAAGGTCCCAAAGCCAGGGC,46.97,100.000,0.000,0,0.269,2,-,NM_001286,CLCN6
125,NM_001286,chr1,11840198,11840227,GCTGTCAGATGGTCTGGTAGTGCTGCCTCA,43.06,100.000,0.000,0,0.219,0,-,NM_001286,CLCN6
126,NM_001286,chr1,11840448,11840477,CACTGGAGGGTGGGAACACAGGGAGCAGGA,45.98,100.000,0.000,0,0.458,2,-,NM_001286,CLCN6
128,NM_001286,chr1,11842208,11842237,TAACATTCACCGGGGCAGCCTCAGGGCAGG,46.77,100.000,0.000,0,0.215,3,-,NM_001286,CLCN6
