In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import gget
import json
import math

In [2]:
# Source: ENCODE experiment https://www.encodeproject.org/experiments/ENCSR792OIJ/
# File: accession ENCFF928NYA, listed under processed data (tsv file)
# Data added 2020-11-04
# Only the gene identifier and the two expression columns are kept.
columns = ['gene_id', 'TPM', 'FPKM']
df = pd.DataFrame(pd.read_table("ENCFF928NYA.tsv"), columns=columns)

In [30]:
# Keep only the rows for the two versioned Ensembl gene IDs of interest.
neg_ctrl = df[df["gene_id"].isin(['ENSG00000057149.15', 'ENSG00000206073.10'])]

In [31]:
# Render the complete selection as plain text (no column/row truncation).
neg_ctrl_table = neg_ctrl.to_string()
print(neg_ctrl_table)

                  gene_id  TPM  FPKM
1418   ENSG00000057149.15  0.0   0.0
21010  ENSG00000206073.10  0.0   0.0


In [32]:
from pybiomart import Dataset

# Handle to the 'hsapiens_gene_ensembl' BioMart dataset served by ensembl.org;
# the annotation lookups further down are issued through this object.
dataset = Dataset(host='http://www.ensembl.org', name='hsapiens_gene_ensembl')

In [33]:
# One BioMart lookup per selected gene. Each lookup is filtered to the
# canonical transcript and asks for the gene name plus the MANE Select RefSeq
# match of that transcript.
biomart_attributes = [
    'ensembl_gene_id',
    'external_gene_name',
    'external_transcript_name',
    'transcript_mane_select',
    'transcript_is_canonical',
    'ensembl_transcript_id',
]

refseq_results = []
for versioned_id in neg_ctrl["gene_id"]:
    # Drop the ".<version>" suffix before using the ID as a filter value.
    stable_id = versioned_id.split(".")[0]
    hits = dataset.query(
        attributes=biomart_attributes,
        filters={'link_ensembl_gene_id': [stable_id], 'transcript_is_canonical' : True},
    )
    # Tag the hits with the original versioned ID so they share a key column
    # with the expression table.
    hits["gene_id"] = versioned_id
    refseq_results.append(hits)


In [34]:
# Stack the per-gene BioMart results into a single table with a fresh index.
refseq_df = pd.concat(refseq_results, ignore_index=True)

# Convert the canonical flag to a real boolean. NaN is truthy, so a plain
# astype(bool) would report a missing flag as True; mask missing entries out
# explicitly. Non-missing values convert exactly as before.
canonical_flag = refseq_df['Ensembl Canonical']
refseq_df['Ensembl Canonical'] = canonical_flag.notna() & canonical_flag.astype(bool)

# Show the full annotation table as plain text.
print(refseq_df.to_string())

    Gene stable ID Gene name Transcript name RefSeq match transcript (MANE Select)  Ensembl Canonical Transcript stable ID             gene_id
0  ENSG00000057149  SERPINB3    SERPINB3-201                           NM_006919.3               True      ENST00000283752  ENSG00000057149.15
1  ENSG00000206073  SERPINB4    SERPINB4-201                           NM_002974.4               True      ENST00000341074  ENSG00000206073.10


In [35]:
# Join the expression values with the BioMart annotation on the versioned
# gene ID, then display every row of the result.
mergedRes = pd.merge(left=neg_ctrl, right=refseq_df, on='gene_id')
mergedRes

Unnamed: 0,gene_id,TPM,FPKM,Gene stable ID,Gene name,Transcript name,RefSeq match transcript (MANE Select),Ensembl Canonical,Transcript stable ID
0,ENSG00000057149.15,0.0,0.0,ENSG00000057149,SERPINB3,SERPINB3-201,NM_006919.3,True,ENST00000283752
1,ENSG00000206073.10,0.0,0.0,ENSG00000206073,SERPINB4,SERPINB4-201,NM_002974.4,True,ENST00000341074


In [55]:
# Load the tab-separated probe table; filtering and sorting are done in a
# separate step.
data = pd.read_csv('230630-SERPINB3-probes.txt', sep='\t')

In [58]:
# Keep probes whose sequence is at most 37 characters long, then rank them:
# lowest off_target first, and within equal off_target the highest prob first.
seq_lengths = data['sequence'].str.len()
mask = seq_lengths <= 37
data = data[mask].sort_values(by=['off_target', 'prob'], ascending=[True, False])
data.head(300)

Unnamed: 0,refseq,chrom,start,stop,sequence,Tm,on_target,off_target,repeat_seq,prob,max_kmer,probe_strand,transcript_id,gene_id
4,NM_006919,chr18,63655754,63655790,GTTGAAGTAGGTGATGATCCGAATCCTACTACAGCGG,42.02,100.0,0.0,0,0.264,0,+,NM_006919,SERPINB3
16,NM_006919,chr18,63656915,63656951,CTGTACATCCTCCAGCGAGGCAAAATGAAAAGATGTG,42.22,99.986,0.0,0,0.159,2,+,NM_006919,SERPINB3
3,NM_006919,chr18,63655853,63655882,GATAGCACGAGACCGCGGCTCCCGGTCATG,46.69,98.078,0.0,0,0.157,0,+,NM_006919,SERPINB3
5,NM_006919,chr18,63655791,63655820,TGGCAGCTGCAGCTTCTGCTCCCTCCTCTG,46.62,99.251,0.0,0,0.153,3,+,NM_006919,SERPINB3
0,NM_006919,chr18,63655603,63655639,ACCAGAACATCTGCAGGTGAACATTTTCCAAATGGAG,42.14,100.0,0.0,0,0.096,3,+,NM_006919,SERPINB3
14,NM_006919,chr18,63655960,63655996,TTCCACTTTGAACCGAGGTAAGTGTAAATCGACACGT,42.28,96.532,61.311,0,0.203,2,+,NM_006919,SERPINB3
18,NM_006919,chr18,63658560,63658596,TCTGGAGCATTTGCAAAATCAACAGATTCCACACTGG,42.41,100.0,79.944,0,0.089,2,+,NM_006919,SERPINB3
10,NM_006919,chr18,63661156,63661192,TTCTGAACTGTTGGAACAGGTCGAACATGAACTTGGT,42.66,100.0,81.777,0,0.09,2,+,NM_006919,SERPINB3
12,NM_006919,chr18,63655883,63655916,CCTGAGAGGTCTGCATCCCCATTGAAGATATCCA,42.05,94.174,91.286,0,0.142,2,+,NM_006919,SERPINB3
2,NM_006919,chr18,63655821,63655852,TAACCTCCACAAAGGCCTTGTGTAGGACTCCA,42.17,100.0,93.235,0,0.09,2,+,NM_006919,SERPINB3


In [53]:
# Group the probe table by RefSeq accession: one sub-table per entry of
# valid_seqs, keyed by the accession exactly as listed.
valid_seqs = ['NM_006919']
gene_probes_dict = {}

for accession in valid_seqs:
    # Match on the accession without any ".<version>" suffix.
    base_accession = accession.split(".")[0]
    refseq_filter = f"refseq=='{base_accession}'"
    probes_for_accession = data.query(refseq_filter)
    gene_probes_dict[accession] = probes_for_accession
    # NOTE(review): the type() print looks like leftover debugging; kept so the
    # cell output is unchanged.
    print(type(refseq_filter))
    print(accession, len(probes_for_accession.index))

<class 'str'>
NM_006919 19


In [54]:
# Print every probe as a two-line record: a "> ID<row label>" header followed
# by the probe sequence.
to_print = gene_probes_dict["NM_006919"]
for probe_label, probe_seq in zip(to_print.index, to_print["sequence"]):
    print(f"> ID{probe_label}")
    print(probe_seq)

> ID0
ACCAGAACATCTGCAGGTGAACATTTTCCAAATGGAG
> ID3
GATAGCACGAGACCGCGGCTCCCGGTCATG
> ID4
GTTGAAGTAGGTGATGATCCGAATCCTACTACAGCGG
> ID5
TGGCAGCTGCAGCTTCTGCTCCCTCCTCTG
> ID16
CTGTACATCCTCCAGCGAGGCAAAATGAAAAGATGTG
> ID14
TTCCACTTTGAACCGAGGTAAGTGTAAATCGACACGT
> ID18
TCTGGAGCATTTGCAAAATCAACAGATTCCACACTGG
> ID10
TTCTGAACTGTTGGAACAGGTCGAACATGAACTTGGT
> ID12
CCTGAGAGGTCTGCATCCCCATTGAAGATATCCA
> ID2
TAACCTCCACAAAGGCCTTGTGTAGGACTCCA
> ID8
TGATATGTTGCAGCTTTTCCTGTGGTGTTCTCTGTGA
> ID9
AAGAGGACCATCCCTAATGCTGATGTGATGCTGA
> ID1
TGACAGACTAATTGCATCTACGGGGATGAGAATCTGC
> ID17
TTCTTCTCCCACTGCCCTTTGAAATAGATTGCGTTCA
> ID11
GTTGGCTTCACTGAGTGAATTCATGGTGAACTCGATG
> ID6
TATGAGCCAAGAGAATCTGTTGTTGCCAGCAATCAGT
> ID7
TTTTCTCCGAAGAGCTTGTTGGCGATCTTCAGC
> ID13
CCATTCCCATGGTTCTCAACGTGTCCTTGAGG
> ID15
GATCTTTGCCTTTGTATGGTATTTCCAGGACCTTGGC


In [59]:
# SERPINB3:
# Probes 13 (32 nt), 2 (32 nt), 3 (30 nt) and 7 (33 nt) are good, but they all have the
# SERPINB3 vs SERPINB4 problem (SERPINB4 is slightly expressed) --> show that the two genes
# essentially share probes and that both have no expression. (ENCODE shows expression for
# one exon in one track for SERPINB4, which might be an error.)
# Probes 12, 5 and 9 are bad.
# At 37 nt there are a lot that work --> after trying to remove some nt:
# 4, 16, 0, 14

# All cut to 30 nt, removing from the front:
# 13, 2, 3
# 4, 16