In [1]:
# Entire data set processing

In [49]:
import pandas as pd
pd.set_option('display.max_columns', None)
import gget
import json

In [34]:
# data file downloaded from https://www.encodeproject.org/experiments/ENCSR792OIJ/
# Accession ENCFF928NYA under processed data (tsv file) 
# Data added 2020-11-04
df = pd.read_table("ENCFF928NYA.tsv")
columns = ['gene_id', 'TPM', 'FPKM']
df = pd.DataFrame(df, columns=columns)

In [35]:
df.head()

Unnamed: 0,gene_id,TPM,FPKM
0,10904,0.0,0.0
1,12954,0.0,0.0
2,12956,0.0,0.0
3,12958,0.0,0.0
4,12960,0.0,0.0


In [36]:
expressed_df = df.query("TPM>1")

In [37]:
expressed_df = expressed_df.sort_values(['TPM'], ascending=[False])

In [38]:
expressed_df.head()

Unnamed: 0,gene_id,TPM,FPKM
54599,ENSG00000276168.1,149831.97,180978.68
53485,ENSG00000274012.1,94300.79,113903.82
57853,ENSG00000283293.1,66929.44,80842.58
55115,ENSG00000277209.1,43982.92,53125.98
55026,ENSG00000277027.1,23024.05,27810.24


In [39]:
len(expressed_df.index)

12614

In [40]:
from pybiomart import Dataset

dataset = Dataset(name='hsapiens_gene_ensembl',
                  host='http://www.ensembl.org')

In [41]:
refseq_results = []

# Gets the gene name and the RefSeq ID associated with the canonical (most common) transcript
for i, row in expressed_df.iterrows():
    gene_id = row["gene_id"].split(".")[0] # remove the version number
    # print(gene_id)
    result = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name', 'external_transcript_name', 'transcript_mane_select', 'transcript_is_canonical', 'ensembl_transcript_id'],
                  filters={'link_ensembl_gene_id': [gene_id], 'transcript_is_canonical' : True})
    result["gene_id"] = row["gene_id"]
    refseq_results.append(result)


In [50]:
refseq_df = pd.concat(refseq_results, ignore_index = True)
refseq_df['Ensembl Canonical'] = refseq_df['Ensembl Canonical'].astype(bool)  
# print(refseq_df.to_string())
refseq_df.head(n=len(refseq_df.index))

Unnamed: 0,Gene stable ID,Gene name,Transcript name,RefSeq match transcript (MANE Select),Ensembl Canonical,Transcript stable ID,gene_id
0,ENSG00000276168,RN7SL1,RN7SL1-201,,True,ENST00000618786,ENSG00000276168.1
1,ENSG00000274012,RN7SL2,RN7SL2-201,,True,ENST00000490232,ENSG00000274012.1
2,ENSG00000283293,RN7SK,RN7SK-201,,True,ENST00000636484,ENSG00000283293.1
3,ENSG00000277209,RPPH1,RPPH1-201,,True,ENST00000516869,ENSG00000277209.1
4,ENSG00000277027,RMRP,RMRP-201,,True,ENST00000363046,ENSG00000277027.1
...,...,...,...,...,...,...,...
12498,ENSG00000135338,LCA5,LCA5-201,NM_001122769.3,True,ENST00000369846,ENSG00000135338.13
12499,ENSG00000268996,MAN1B1-DT,MAN1B1-DT-201,,True,ENST00000596585,ENSG00000268996.3
12500,ENSG00000158301,GPRASP2,GPRASP2-202,NM_001004051.4,True,ENST00000483720,ENSG00000158301.18
12501,ENSG00000232586,KIAA1614-AS1,KIAA1614-AS1-201,,True,ENST00000415647,ENSG00000232586.1


In [47]:
refseq_df.to_csv("full_k532_gene_analysis.csv")

In [43]:
mergedRes = pd.merge(expressed_df, refseq_df, on ='gene_id')

In [52]:
mergedRes.head(100)

Unnamed: 0,gene_id,TPM,FPKM,Gene stable ID,Gene name,Transcript name,RefSeq match transcript (MANE Select),Ensembl Canonical,Transcript stable ID
0,ENSG00000276168.1,149831.97,180978.68,ENSG00000276168,RN7SL1,RN7SL1-201,,True,ENST00000618786
1,ENSG00000274012.1,94300.79,113903.82,ENSG00000274012,RN7SL2,RN7SL2-201,,True,ENST00000490232
2,ENSG00000283293.1,66929.44,80842.58,ENSG00000283293,RN7SK,RN7SK-201,,True,ENST00000636484
3,ENSG00000277209.1,43982.92,53125.98,ENSG00000277209,RPPH1,RPPH1-201,,True,ENST00000516869
4,ENSG00000277027.1,23024.05,27810.24,ENSG00000277027,RMRP,RMRP-201,,True,ENST00000363046
...,...,...,...,...,...,...,...,...,...
95,ENSG00000145592.13,1056.55,1276.18,ENSG00000145592,RPL37,RPL37-201,NM_000997.5,True,ENST00000274242
96,ENSG00000204628.11,1038.83,1254.78,ENSG00000204628,RACK1,RACK1-227,NM_006098.5,True,ENST00000512805
97,ENSG00000083845.8,1014.18,1225.00,ENSG00000083845,RPS5,RPS5-201,NM_001009.4,True,ENST00000196551
98,ENSG00000149925.18,1003.49,1212.09,ENSG00000149925,ALDOA,ALDOA-218,NM_001243177.4,True,ENST00000642816


In [None]:
# remove refseqs that are nan (are there any non NM_?)

In [None]:
to_drop = []
for i, row in mergedRes.iterrows():
    if type(mergedRes.loc[mergedRes.index[i], "RefSeq match transcript (MANE Select)"]) == float:
        to_drop.append(i)

In [76]:
valid_transcripts = mergedRes.drop(index=to_drop).reset_index()  

In [77]:
valid_transcripts.head()

Unnamed: 0,index,gene_id,TPM,FPKM,Gene stable ID,Gene name,Transcript name,RefSeq match transcript (MANE Select),Ensembl Canonical,Transcript stable ID
0,8,ENSG00000156508.17,5000.06,6039.46,ENSG00000156508,EEF1A1,EEF1A1-201,NM_001402.6,True,ENST00000309268
1,22,ENSG00000196565.14,4169.72,5036.51,ENSG00000196565,HBG2,HBG2-201,NM_000184.3,True,ENST00000336906
2,24,ENSG00000197238.4,2867.51,3463.6,ENSG00000197238,H4C11,H4C11-201,NM_021968.4,True,ENST00000355057
3,27,ENSG00000231500.6,2266.3,2737.41,ENSG00000231500,RPS18,RPS18-230,NM_022551.3,True,ENST00000439602
4,29,ENSG00000140988.15,2163.15,2612.82,ENSG00000140988,RPS2,RPS2-201,NM_002952.4,True,ENST00000343262


In [81]:
## Check if any of the refseqs recovered are not for mRNA
count = 0
for i, row in valid_transcripts.iterrows():
    if not mergedRes.loc[mergedRes.index[i], "RefSeq match transcript (MANE Select)"].startswith("NM_"):
        count+=1
        print(i, mergedRes.loc[mergedRes.index[i], "RefSeq match transcript (MANE Select)"])
    

In [68]:
len(mergedRes.index)

10405

In [69]:
i

10405

In [80]:
count

10405

In [82]:
valid_transcripts.to_csv("valid_k532_genes.csv")