In [1]:
import pandas as pd
import pybedtools as pyb
from pylab import *

Loading frames

In [2]:
vSTRs = pd.read_table('../data/STRs_depth5_chr_start_maf09.tsv')
nvSTRs = pd.read_table('../data/STRs_depth5_chr_start_maf09_nv.tsv')

Function to make BED file from DataFrame

In [3]:
def make_bed_from_frame(filename, frame):
    tab = '\t'
    newline = '\n'

    fh = open(filename, 'w')
    for i in frame.itertuples():
        fh.write(i.CHR + tab + str(i.START) + tab + str(i.END) + tab + str(i.CHR_START) + newline)
    fh.close()

Subsetting for making BED

In [4]:
v_sub_for_bed = vSTRs[['CHR', 'START', 'END', 'CHR_START']].drop_duplicates()
nv_sub_for_bed = nvSTRs[['CHR', 'START', 'END', 'CHR_START']].drop_duplicates()

Making BED

In [5]:
make_bed_from_frame('../data/STRs_depth5_chr_start_maf09.bed', v_sub_for_bed)
make_bed_from_frame('../data/STRs_depth5_chr_start_maf09_nv.bed', nv_sub_for_bed)

Loading BED

In [6]:
vSTR_bed = pyb.BedTool('../data/STRs_depth5_chr_start_maf09.bed')
nvSTR_bed = pyb.BedTool('../data/STRs_depth5_chr_start_maf09_nv.bed')

Loading GFF as BED, used awk to produce the file

In [7]:
GFF = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed')

Loading GFF as DataFrame

In [8]:
GFF_frame = pd.read_csv('../data/TAIR10_GFF3_genes.gff', sep = '\t', header = None,
                        names = ['CHR', 'TAIR', 'feature', 'feat_start', 'feat_end', '.', 'strand', 'frame', 'ID'])

Writing a BED file for each feature

In [9]:
features = []
for i in GFF:
    features.append(i[3])
    
def create_sub(filtered):
    sub = GFF.filter(lambda x: x.name == filtered)
    sub.saveas('../data/TAIR10_GFF3_genes_with_strand.bed.%s' % filtered)
    
for i in set(features):
    create_sub(i)

Loading BED files

In [10]:
CDS = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.CDS')
chromosome = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.chromosome')
Exon = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.exon')
UTR5 = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.five_prime_UTR')
Genes = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.gene')
mRNA = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.mRNA')
mRNA_TE_gene = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.mRNA_TE_gene')
miRNA = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.miRNA')
ncRNA = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.ncRNA')
protein = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.protein')
pseudogene = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.pseudogene')
pseudogenic_exon = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.pseudogenic_exon')
pseudogenic_transcript = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.pseudogenic_transcript')
rRNA = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.rRNA')
snRNA = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.snRNA')
snoRNA = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.snoRNA')
tRNA = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.tRNA')
UTR3 = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.three_prime_UTR')
transposable_element_gene = pyb.BedTool('../data/TAIR10_GFF3_genes_with_strand.bed.transposable_element_gene')

Testing for intersect with bedtools, requiring the complete STR to be withing the region

In [11]:
# Variable
UTR5_STRs = vSTR_bed.intersect(UTR5, wo = True, f = 1.0)
UTR3_STRs = vSTR_bed.intersect(UTR3, wo = True, f = 1.0)
Genic_STRs = vSTR_bed.intersect(Genes, wo = True, f = 1.0)
CDS_STRs = vSTR_bed.intersect(CDS, wo = True, f = 1.0)
Intronic_STRs = Genic_STRs.intersect(Exon, v = True, wo = True, f = 1.0)
mRNA_STRs = vSTR_bed.intersect(mRNA, wo = True, f = 1.0)
miRNA_STRs = vSTR_bed.intersect(miRNA, wo = True, f = 1.0)
ncRNA_STRs = vSTR_bed.intersect(ncRNA, wo = True, f = 1.0)
protein_STRs = vSTR_bed.intersect(protein, wo = True, f = 1.0)
pseudogene_STRs = vSTR_bed.intersect(pseudogene, wo = True, f = 1.0)
pseudogenic_exon_STRs = vSTR_bed.intersect(pseudogenic_exon, wo = True, f = 1.0)
pseudogenic_transcript_STRs = vSTR_bed.intersect(pseudogenic_transcript, wo = True, f = 1.0)
snRNA_STRs = vSTR_bed.intersect(snRNA, wo = True, f = 1.0)
snoRNA_STRs = vSTR_bed.intersect(snoRNA, wo = True, f = 1.0)
tRNA_STRs = vSTR_bed.intersect(tRNA, wo = True, f = 1.0)
transposable_element_gene_STRs = vSTR_bed.intersect(transposable_element_gene, wo = True, f = 1.0)
Non_genic_STRs = vSTR_bed.intersect(Genes, v = True, wo = True)

# Non-variable

nvUTR5_STRs = nvSTR_bed.intersect(UTR5, wo = True, f = 1.0)
nvUTR3_STRs = nvSTR_bed.intersect(UTR3, wo = True, f = 1.0)
nvGenic_STRs = nvSTR_bed.intersect(Genes, wo = True, f = 1.0)
nvCDS_STRs = nvSTR_bed.intersect(CDS, wo = True, f = 1.0)
nvIntronic_STRs = nvGenic_STRs.intersect(Exon, v = True, wo = True, f = 1.0)
nvmRNA_STRs = nvSTR_bed.intersect(mRNA, wo = True, f = 1.0)
nvmiRNA_STRs = nvSTR_bed.intersect(miRNA, wo = True, f = 1.0)
nvncRNA_STRs = nvSTR_bed.intersect(ncRNA, wo = True, f = 1.0)
nvprotein_STRs = nvSTR_bed.intersect(protein, wo = True, f = 1.0)
nvpseudogene_STRs = nvSTR_bed.intersect(pseudogene, wo = True, f = 1.0)
nvpseudogenic_exon_STRs = nvSTR_bed.intersect(pseudogenic_exon, wo = True, f = 1.0)
nvpseudogenic_transcript_STRs = nvSTR_bed.intersect(pseudogenic_transcript, wo = True, f = 1.0)
nvsnRNA_STRs = nvSTR_bed.intersect(snRNA, wo = True, f = 1.0)
nvsnoRNA_STRs = nvSTR_bed.intersect(snoRNA, wo = True, f = 1.0)
nvtRNA_STRs = nvSTR_bed.intersect(tRNA, wo = True, f = 1.0)
nvtransposable_element_gene_STRs = nvSTR_bed.intersect(transposable_element_gene, wo = True, f = 1.0)
nvNon_genic_STRs = nvSTR_bed.intersect(Genes, v = True, wo = True)

Retrieving the information to annotate the DataFrame

In [12]:
def get_starts_from_BED(BED):
    return [i[3] for i in BED]

Getting the start positions

In [13]:
# Variable

UTR5_starts = get_starts_from_BED(UTR5_STRs)
UTR3_starts = get_starts_from_BED(UTR3_STRs)
CDS_starts = get_starts_from_BED(CDS_STRs)
Intron_starts = get_starts_from_BED(Intronic_STRs)
miRNA_starts = get_starts_from_BED(miRNA_STRs)
ncRNA_starts = get_starts_from_BED(ncRNA_STRs)
pseudo_starts = get_starts_from_BED(pseudogene_STRs)
miRNA_starts = get_starts_from_BED(miRNA_STRs)
te_starts = get_starts_from_BED(transposable_element_gene_STRs)
non_genic_starts = get_starts_from_BED(Non_genic_STRs)

# Non-variable

nvUTR5_starts = get_starts_from_BED(nvUTR5_STRs)
nvUTR3_starts = get_starts_from_BED(nvUTR3_STRs)
nvCDS_starts = get_starts_from_BED(nvCDS_STRs)
nvIntron_starts = get_starts_from_BED(nvIntronic_STRs)
nvmiRNA_starts = get_starts_from_BED(nvmiRNA_STRs)
nvncRNA_starts = get_starts_from_BED(nvncRNA_STRs)
nvpseudo_starts = get_starts_from_BED(nvpseudogene_STRs)
nvmiRNA_starts = get_starts_from_BED(nvmiRNA_STRs)
nvte_starts = get_starts_from_BED(nvtransposable_element_gene_STRs)
nvnon_genic_starts = get_starts_from_BED(nvNon_genic_STRs)

Annotating variable DataFrame

In [14]:
frame = v_sub_for_bed.copy()

n = 0
total = len(frame)
whereabouts = []
for i in frame.itertuples():

    chrom = i[1]
    start = i[2]
    
    chrom_start = chrom + '_' + str(start) 
    
    if chrom_start in UTR5_starts:
        whereabouts.append('five_prime_UTR')
    elif chrom_start in UTR3_starts:
        whereabouts.append('three_prime_UTR')
    elif chrom_start in CDS_starts:
        whereabouts.append('CDS')
    elif chrom_start in Intron_starts:
        whereabouts.append('Intron')
    elif chrom_start in miRNA_starts:
        whereabouts.append('miRNA')
    elif chrom_start in ncRNA_starts:
        whereabouts.append('ncRNA')
    elif chrom_start in pseudo_starts:
        whereabouts.append('pseudogene')
    elif chrom_start in te_starts:
        whereabouts.append('transposable_element_gene')
    elif chrom_start in non_genic_starts:
        whereabouts.append('Non_genic')
    else:
        whereabouts.append('Spanning')
        
v_sub_for_bed['WHERE'] = whereabouts

Annotating non-variable DataFrame

In [15]:
frame = nv_sub_for_bed.copy()

n = 0
total = len(frame)
whereabouts = []
for i in frame.itertuples():

    chrom = i[1]
    start = i[2]
    
    chrom_start = chrom + '_' + str(start) 
    
    if chrom_start in nvUTR5_starts:
        whereabouts.append('five_prime_UTR')
    elif chrom_start in nvUTR3_starts:
        whereabouts.append('three_prime_UTR')
    elif chrom_start in nvCDS_starts:
        whereabouts.append('CDS')
    elif chrom_start in nvIntron_starts:
        whereabouts.append('Intron')
    elif chrom_start in nvmiRNA_starts:
        whereabouts.append('miRNA')
    elif chrom_start in nvncRNA_starts:
        whereabouts.append('ncRNA')
    elif chrom_start in nvpseudo_starts:
        whereabouts.append('pseudogene')
    elif chrom_start in nvte_starts:
        whereabouts.append('transposable_element_gene')
    elif chrom_start in nvnon_genic_starts:
        whereabouts.append('Non_genic')
    else:
        whereabouts.append('Spanning')
nv_sub_for_bed['WHERE'] = whereabouts

Adding info to frame

In [16]:
vSTRs = pd.merge(v_sub_for_bed[['CHR_START', 'WHERE']], vSTRs)
nvSTRs = pd.merge(nv_sub_for_bed[['CHR_START', 'WHERE']], nvSTRs)

Need to add gene info to the loci spanning multiple features

Function to make BED file

In [17]:
def make_bed_from_gene_frame(filename, frame):
    tab = '\t'
    newline = '\n'

    fh = open(filename, 'w')
    for i in frame.itertuples():
        fh.write(i.CHR.lower() + tab + str(i.feat_start) + tab + str(i.feat_end) + tab + str(i.GENE) + newline)
    fh.close()

Adding info of genes of spanning loci

In [18]:
Gene_frame = GFF_frame[GFF_frame['feature']=='gene']
Gene_frame['GENE'] = [i.split(';')[0].split('=')[1] for i in Gene_frame.ID]
make_bed_from_gene_frame('../data/Gene_frame.bed', Gene_frame)
Gene_frame_bed = pyb.BedTool('../data/Gene_frame.bed')

make_bed_from_frame('../data/spanning.bed', v_sub_for_bed[v_sub_for_bed['WHERE']=='Spanning'])
make_bed_from_frame('../data/nv_spanning.bed', nv_sub_for_bed[nv_sub_for_bed['WHERE']=='Spanning'])

spanning = pyb.BedTool('../data/spanning.bed')
nv_spanning = pyb.BedTool('../data/nv_spanning.bed')

def return_spanning_genes_and_loci(spanning_bed):
    spanning_chr_starts = []
    spanning_genes = []
    for i in Gene_frame_bed.intersect(spanning_bed):    
        chr_start = i[0] + '_' + str(i[1])
        gene = i[3]
        spanning_chr_starts.append(chr_start)
        spanning_genes.append(gene)
    return (spanning_chr_starts, spanning_genes)

spanning_chr_starts = return_spanning_genes_and_loci(spanning)[0]
nv_spanning_chr_starts = return_spanning_genes_and_loci(nv_spanning)[0]
spanning_genes = return_spanning_genes_and_loci(spanning)[1]
nv_spanning_genes = return_spanning_genes_and_loci(nv_spanning)[1]

vSpanFrame = pd.DataFrame([spanning_chr_starts, spanning_genes]).T.rename(columns = {0 : 'CHR_START', 1 : 'GENE2'})
nvSpanFrame = pd.DataFrame([nv_spanning_chr_starts, nv_spanning_genes]).T.rename(columns = {0 : 'CHR_START', 1 : 'GENE2'})


def get_gene_column(frame):
    gene2_col = []
    for i in frame.itertuples():
        if 'AT' in str(i.GENE):
            gene2_col.append(i.GENE.split('.')[0])
        else:
            gene2_col.append(i.GENE)   
    return gene2_col

vSTRs['GENE2'] = get_gene_column(vSTRs)
nvSTRs['GENE2'] = get_gene_column(nvSTRs)

vSTRs_span = pd.merge(vSTRs, vSpanFrame, on = ['CHR_START', 'GENE2'])
nvSTRs_span = pd.merge(nvSTRs, nvSpanFrame, on = ['CHR_START', 'GENE2'])

vSTRs_no_span = vSTRs[~vSTRs['CHR_START'].isin(vSpanFrame.CHR_START)]
nvSTRs_no_span = nvSTRs[~nvSTRs['CHR_START'].isin(nvSTRs_span.CHR_START)]

vSTRs = pd.concat([vSTRs_no_span, vSTRs_span])
nvSTRs = pd.concat([nvSTRs_no_span, nvSTRs_span])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Adding promoter info, need frame with only genes

In [19]:
def add_promoter_info_to_frame(dataframe):
    frame = dataframe[dataframe['WHERE']=='Non_genic']
    sub_for_prom = frame[['CHR', 'START', 'END', 'CHR_START']].drop_duplicates()
    make_bed_from_frame('../data/sub_for_prom.bed', sub_for_prom)
    Non_genic_bed = pyb.BedTool('../data/sub_for_prom.bed')
    Genes_sorted = Genes.sort()

    sub = Genes_sorted.filter(lambda x: x.strand == '+')
    sub.saveas('../data/Genes_sorted_plus.bed')
    Genes_sorted_plus = pyb.BedTool('../data/Genes_sorted_plus.bed')

    sub = Genes_sorted.filter(lambda x: x.strand == '-')
    sub.saveas('../data/Genes_sorted_minus.bed')
    Genes_sorted_minus = pyb.BedTool('../data/Genes_sorted_minus.bed')

    tab = '\t'
    newline = '\n'

    fh = open('../data/STR_promoters.txt', 'w')
    fh.write('CHR_START' + tab + 'CHR' + tab + 'START' + tab + 'feat_start'+  tab + 'DISTANCE_TO_PROMOTER' + newline)
    rep_lst = []
    for i in Genes_sorted.closest(Non_genic_bed, io = True, d = True, D = 'a'):
        chrom = i[0]
        gene_start = i[1]
        gene_end = i[2]
        rep_start = i[7]
        dist = i[-1]
        rep_lst.append(rep_start)
        chr_start = chrom + '_' + str(rep_start)
        if (abs(int(i[-1])) < 500):
            if int(i[-1]) < 0 and int(i[-1]) != -1:
                fh.write(chr_start + tab + chrom + tab + str(rep_start) + tab + str(gene_start) + tab + str(dist) + newline)
    fh.close()

    STR_promoters = pd.read_table('../data/STR_promoters.txt')
    Lowest = STR_promoters.groupby(['CHR_START'], sort=True)[['DISTANCE_TO_PROMOTER', 'feat_start']].max().reset_index()
    Final_promoter_frame = pd.merge(Gene_frame[['GENE', 'feat_start']], Lowest)
    map_this_to_main_frame = Final_promoter_frame[['GENE', 'CHR_START']]
    sub = dataframe[['CHR_START']].drop_duplicates()

    new_column = []
    for i in sub.itertuples():
        if i.CHR_START in list(map_this_to_main_frame.CHR_START):
            name = map_this_to_main_frame[map_this_to_main_frame['CHR_START']==i.CHR_START].GENE.values[0]
            new_column.append(name)
        else:
            new_column.append('None')

    sub['Closest'] = new_column
    Final_annotated_frame_closest = pd.merge(sub, dataframe)

    Where_updated = []
    for i in Final_annotated_frame_closest.itertuples():
        if i.Closest == 'None':
            Where_updated.append(i.WHERE.upper())
        else:
            Where_updated.append('PROMOTER')

    Final_annotated_frame_closest['WHERE'] = Where_updated
    return Final_annotated_frame_closest

In [20]:
vSTRs_w_prom = add_promoter_info_to_frame(vSTRs)
nvSTRs_w_prom = add_promoter_info_to_frame(nvSTRs)

Writing frames

In [21]:
vSTRs_w_prom.to_csv('../data/STRs_depth5_chr_start_maf09.annotated.tsv', sep = '\t', index=None)
nvSTRs_w_prom.to_csv('../data/STRs_depth5_chr_start_maf09_nv.annotated.tsv', sep = '\t', index=None)

In [22]:
vSTRs_w_prom = pd.read_table('../data/STRs_depth5_chr_start_maf09.annotated.tsv')

In [23]:
nvSTRs_w_prom = pd.read_table('../data/STRs_depth5_chr_start_maf09_nv.annotated.tsv')