In [1]:
# Partial analysis

In [164]:
import pandas as pd
pd.set_option('display.max_columns', None)
import gget
import json
import math

In [151]:
# data file downloaded from https://www.encodeproject.org/experiments/ENCSR792OIJ/
# Accession ENCFF928NYA under processed data (tsv file) 
# Data added 2020-11-04
df = pd.read_table("ENCFF928NYA.tsv")
columns = ['gene_id', 'TPM', 'FPKM']
df = pd.DataFrame(df, columns=columns)

In [152]:
df.head()

Unnamed: 0,gene_id,TPM,FPKM
0,10904,0.0,0.0
1,12954,0.0,0.0
2,12956,0.0,0.0
3,12958,0.0,0.0
4,12960,0.0,0.0


In [153]:
expressed_df = df.query("TPM!=0.0")

high_thresh = 200
high_thresh_df = expressed_df.query("TPM>@high_thresh")
mid_thresh_df = high_thresh_df.query("TPM<20000")
mid_thresh_df = mid_thresh_df.drop(index=[59429, 59431, 59456, 59471, 59484, 59493, 59500, 59524])

In [163]:
expressed_df.head()

Unnamed: 0,gene_id,TPM,FPKM
46,13023,41.07,49.61
47,13024,16.31,19.7
465,30958,43.49,52.53
471,30964,52.81,63.79
649,ENSG00000000003.14,0.05,0.06


In [154]:
# print(mid_thresh_df.to_string())
# print(len(mid_thresh_df.index))

In [155]:
from pybiomart import Dataset

dataset = Dataset(name='hsapiens_gene_ensembl',
                  host='http://www.ensembl.org')

In [156]:
refseq_results = []

# Gets the gene name and the RefSeq ID associated with the canonical (most common) transcript
for i, row in mid_thresh_df.iterrows():
    gene_id = row["gene_id"].split(".")[0] # remove the version number
    # print(gene_id)
    result = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name', 'external_transcript_name', 'transcript_mane_select', 'transcript_is_canonical', 'ensembl_transcript_id'],
                  filters={'link_ensembl_gene_id': [gene_id], 'transcript_is_canonical' : True})
    result["gene_id"] = row["gene_id"]
    refseq_results.append(result)


In [157]:
refseq_df = pd.concat(refseq_results, ignore_index = True)
refseq_df['Ensembl Canonical'] = refseq_df['Ensembl Canonical'].astype(bool)
#print(refseq_df.to_string())
# refseq_df.head(n=len(refseq_df.index))

# for i, row in 

In [158]:
mergedRes = pd.merge(expressed_df, refseq_df, on ='gene_id')
mergedRes.head(len(mergedRes.index))

Unnamed: 0,gene_id,TPM,FPKM,Gene stable ID,Gene name,Transcript name,RefSeq match transcript (MANE Select),Ensembl Canonical,Transcript stable ID
0,ENSG00000004455.16,229.47,277.18,ENSG00000004455,AK2,AK2-214,NM_001625.4,True,ENST00000672715
1,ENSG00000005022.5,359.12,433.77,ENSG00000005022,SLC25A5,SLC25A5-201,NM_001152.5,True,ENST00000317881
2,ENSG00000008018.8,201.88,243.85,ENSG00000008018,PSMB1,PSMB1-201,NM_002793.4,True,ENST00000262193
3,ENSG00000008988.9,1142.57,1380.08,ENSG00000008988,RPS20,RPS20-201,NM_001023.4,True,ENST00000009589
4,ENSG00000009307.15,242.56,292.98,ENSG00000009307,CSDE1,CSDE1-203,NM_001007553.3,True,ENST00000358528
...,...,...,...,...,...,...,...,...,...
367,ENSG00000278591.1,4298.21,5191.71,ENSG00000278591,U2,U2.19-201,,True,ENST00000618602
368,ENSG00000278677.1,657.47,794.14,ENSG00000278677,H2AC17,H2AC17-201,NM_003514.2,True,ENST00000359611
369,ENSG00000278705.1,668.82,807.86,ENSG00000278705,H4C2,H4C2-201,NM_003544.3,True,ENST00000377745
370,ENSG00000278828.1,224.92,271.67,ENSG00000278828,H3C10,H3C10-202,NM_003536.3,True,ENST00000685041


In [166]:
# print(mergedRes.to_string())

In [159]:
mergedRes = mergedRes.sort_values(['TPM'], ascending=[False])
mergedRes = mergedRes.reset_index()

to_drop = []    
for i, row in mergedRes.iterrows():
    if type(mergedRes.loc[mergedRes.index[i], "RefSeq match transcript (MANE Select)"]) == float:
        to_drop.append(i)

In [160]:
mergedRes = mergedRes.drop(index=to_drop)  

In [161]:
cropped = mergedRes.query("TPM>1000")
for i, row in cropped.iterrows():
    print(row["RefSeq match transcript (MANE Select)"] + ", ", end="")

NM_001402.6, NM_000184.3, NM_021968.4, NM_022551.3, NM_002952.4, NM_001034996.3, NM_005319.4, NM_006013.5, NM_001002.4, NM_001004.4, NM_000968.4, NM_001005.5, NM_001317782.2, NM_012423.4, NM_001101.5, NM_001312653.2, NM_001035267.2, NM_003548.2, NM_000146.4, NM_005617.4, NM_001025.5, NM_000559.3, NM_001012.2, NM_000991.5, NM_001024.4, NM_001020.6, NM_001014.5, NM_001003.3, NM_001022.4, NM_001961.4, NM_002520.7, NM_000972.3, NM_000980.4, NM_000977.4, NM_002046.7, NM_001031.5, NM_000994.4, NM_002295.6, NM_001428.5, NM_021130.5, NM_002568.4, NM_002948.5, NM_001015.5, NM_001023.4, NM_000984.6, NM_001006.5, NM_000558.5, NM_001021.6, NM_000981.4, NM_006082.3, NM_000997.5, NM_006098.5, NM_001009.4, NM_001243177.4, 

In [162]:
cropped.head(len(cropped.index))

Unnamed: 0,index,gene_id,TPM,FPKM,Gene stable ID,Gene name,Transcript name,RefSeq match transcript (MANE Select),Ensembl Canonical,Transcript stable ID
3,154,ENSG00000156508.17,5000.06,6039.46,ENSG00000156508,EEF1A1,EEF1A1-201,NM_001402.6,True,ENST00000309268
17,239,ENSG00000196565.14,4169.72,5036.51,ENSG00000196565,HBG2,HBG2-201,NM_000184.3,True,ENST00000336906
19,247,ENSG00000197238.4,2867.51,3463.6,ENSG00000197238,H4C11,H4C11-201,NM_021968.4,True,ENST00000355057
22,314,ENSG00000231500.6,2266.3,2737.41,ENSG00000231500,RPS18,RPS18-230,NM_022551.3,True,ENST00000439602
24,122,ENSG00000140988.15,2163.15,2612.82,ENSG00000140988,RPS2,RPS2-201,NM_002952.4,True,ENST00000343262
26,233,ENSG00000188846.13,2097.45,2533.46,ENSG00000188846,RPL14,RPL14-202,NM_001034996.3,True,ENST00000396203
27,230,ENSG00000187837.3,2095.53,2531.15,ENSG00000187837,H1-2,H1-2-201,NM_005319.4,True,ENST00000343677
28,141,ENSG00000147403.16,1939.74,2342.97,ENSG00000147403,RPL10,RPL10-202,NM_006013.5,True,ENST00000369817
30,24,ENSG00000089157.15,1734.98,2095.65,ENSG00000089157,RPLP0,RPLP0-203,NM_001002.4,True,ENST00000392514
31,207,ENSG00000177600.8,1717.62,2074.67,ENSG00000177600,RPLP2,RPLP2-201,NM_001004.4,True,ENST00000321153


In [117]:
data = pd.read_table('230629-addl-probes.txt')

In [118]:
data.head(len(data.index))

Unnamed: 0,refseq,chrom,start,stop,sequence,Tm,on_target,off_target,repeat_seq,prob,max_kmer,probe_strand,transcript_id,gene_id
0,NM_000558,chr16,177346,177375,AGCCAGGAACTTGTCCAGGGAGGCGTGCAC,46.93,99.744,99.744,0,0.259,2,-,NM_000558,HBA1
1,NM_000558,chr16,177407,177436,GAAGCATGGCCACCGAGGCTCCAGCTTAAC,44.40,98.814,0.000,0,0.212,2,-,NM_000558,HBA1
2,NM_000558,chr16,176735,176765,TACCCCAGGCGGCCTTGACGTTGGTCTTGTC,46.44,99.573,99.573,0,0.195,2,-,NM_000558,HBA1
3,NM_000558,chr16,176988,177018,TTCTTGCCGTGGCCCTTAACCTGGGCAGAGC,46.96,99.631,99.631,0,0.214,2,-,NM_000558,HBA1
4,NM_000558,chr16,176703,176732,CAGGAGACAGCACCATGGTGGGTTCTCTCT,42.76,98.707,98.707,0,0.215,2,-,NM_000558,HBA1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,NM_000997,chr5,40834565,40834597,CGTCTTATTGCGACGCTTTCCAAACGATGACGT,42.60,100.000,188.529,0,0.208,3,+,NM_000997,RPL37
190,NM_000997,chr5,40827322,40827357,ATAAGGAGAAAGCTGAAATCAGAGGAATCGTGCCCT,42.11,100.000,0.000,0,0.210,2,+,NM_000997,RPL37
191,NM_000997,chr5,40827358,40827387,CCACTGCCAGCTTGCAGTTTAAAGCACACC,42.31,100.000,0.000,0,0.170,0,+,NM_000997,RPL37
192,NM_000997,chr5,40827510,40827545,AAAATACATGGGTCAGGTCAAGTCTTCATCCTGGGG,42.47,100.000,0.000,0,0.204,0,+,NM_000997,RPL37


In [119]:
mask = (data['sequence'].str.len() <= 30)
data = data.loc[mask]
data = data.sort_values(['gene_id', 'off_target'], ascending=[True, True])
data.head(n=300)

#data.query("'gene_id'=='EEF1A1'").head(n=20)

Unnamed: 0,refseq,chrom,start,stop,sequence,Tm,on_target,off_target,repeat_seq,prob,max_kmer,probe_strand,transcript_id,gene_id
1,NM_000558,chr16,177407,177436,GAAGCATGGCCACCGAGGCTCCAGCTTAAC,44.40,98.814,0.000,0,0.212,2,-,NM_000558,HBA1
4,NM_000558,chr16,176703,176732,CAGGAGACAGCACCATGGTGGGTTCTCTCT,42.76,98.707,98.707,0,0.215,2,-,NM_000558,HBA1
0,NM_000558,chr16,177346,177375,AGCCAGGAACTTGTCCAGGGAGGCGTGCAC,46.93,99.744,99.744,0,0.259,2,-,NM_000558,HBA1
6,NM_000558,chr16,177100,177129,AAGTTGACCGGGTCCACCCGAAGCTTGTGC,45.73,100.000,100.000,0,0.125,2,-,NM_000558,HBA1
7,NM_000558,chr16,176949,176978,GGTCGAAGTGCGGGAAGTAGGTCTTGGTGG,43.51,100.000,293.621,0,0.288,4,-,NM_000558,HBA1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,NM_000968,chr15,66500072,66500101,AGAATGGTGTTCCGGCGCATGGTCTTTGCA,44.38,98.380,196.760,0,0.323,3,+,NM_000968,RPL4
27,NM_000968,chr15,66503107,66503136,CTGGGAATTCGAGCCACAGCTCTGCCAGTA,43.27,100.000,196.996,0,0.184,2,+,NM_000968,RPL4
33,NM_000968,chr15,66503077,66503106,CCAGAGCGGTGAGTCCCACCACCTCGAACT,45.90,98.931,282.175,0,0.236,3,+,NM_000968,RPL4
15,NM_000968,chr15,66502640,66502669,TGAGGCAGCCAGGGCAGAACAGATGGCGTA,46.29,98.291,289.607,0,0.267,5,+,NM_000968,RPL4


In [123]:
print(valid_seqs)

['NM_001402.6', 'NM_000184.3', 'NM_021968.4', 'NM_022551.3', 'NM_002952.4', 'NM_001034996.3', 'NM_005319.4', 'NM_006013.5', 'NM_001002.4', 'NM_001004.4', 'NM_000968.4', 'NM_001005.5', 'NM_001317782.2', 'NM_012423.4', 'NM_001101.5', 'NM_001312653.2', 'NM_001035267.2', 'NM_003548.2', 'NM_005617.4', 'NM_000559.3', 'NM_001012.2', 'NM_000991.5', 'NM_001024.4', 'NM_001020.6', 'NM_001014.5', 'NM_001003.3', 'NM_001022.4', 'NM_001961.4', 'NM_002520.7', 'NM_000972.3', 'NM_000980.4', 'NM_002046.7', 'NM_001031.5', 'NM_000994.4', 'NM_002295.6', 'NM_021130.5', 'NM_002948.5', 'NM_001015.5', 'NM_001023.4', 'NM_000984.6', 'NM_001006.5', 'NM_000558.5', 'NM_001021.6', 'NM_000981.4', 'NM_006082.3', 'NM_000997.5', 'NM_006098.5', 'NM_001009.4']


In [125]:
gene_probes_dict = {}
valid_seqs = ['NM_000558', 'NM_000968', 'NM_000984', 'NM_000991', 'NM_000994', 'NM_000997']

for i in valid_seqs:
    gene = data.query("refseq=='" + i.split(".")[0] + "'")
    gene_probes_dict[i] = gene
    print(type("refseq=='" + i.split(".")[0] + "'"))
    print(i, len(gene.index))

<class 'str'>
NM_000558 5
<class 'str'>
NM_000968 11
<class 'str'>
NM_000984 1
<class 'str'>
NM_000991 44
<class 'str'>
NM_000994 6
<class 'str'>
NM_000997 5


In [167]:
# to_print = gene_probes_dict["NM_000991"]
# for i, row in to_print.iterrows():
#     print("> ID" + str(i))
#     print(row["sequence"])

In [146]:
# to_print.head(n=50)

In [None]:
# HBA1:
# could potentially all work
# 6, 0, 1, 4 are preferred

# RPL28
# 101, 90, 71 are bad
# 62, 94 no gene
# lots of good probes

# RPL4 
# 16, 9, 40, 18, 27, 33, 15, 22 bad
# 14, 32, 43 good

# RPL23A
# not enough probes

# RPL32
# 124, 129, 132 are good
# 130, 137, 140 have off target genes

# RPL37
# none are good