In [1]:
%%javascript
var rate = 0;
// apply setting to  all current CodeMirror instances
IPython.notebook.get_cells().map(
    function(c) {  return c.code_mirror.options.cursorBlinkRate=rate;  }
);

// make sure new CodeMirror instance also use this setting
CodeMirror.defaults.cursorBlinkRate=rate;

<IPython.core.display.Javascript object>

In [2]:
with open("TumourSupressors.bed", 'w') as ts_file:
    ts_file.write('''1\t500\t1000\tTS1
1\t1500\t2000\tTS1
2\t500\t1000\tTS2
2\t1500\t2000\tTS2
''')

with open("AR.bed", 'w') as ar_file:
    ar_file.write('''X\t500\t1000\tTS1
X\t1500\t2000\tTS1
X\t2500\t3000\tTS2
X\t3500\t4000\tTS2
''')

with open("FusionCandidates.bed", 'w') as fusion_file:
    fusion_file.write('''21\t1000000\t1100000\tERG
21\t2000000\t2100000\tTMPRSS2
''')

with open("SVs.bed", 'w') as svs_file:
    svs_file.write('''21\t1500000\t1500100\tDEL\t0\t+
21\t2500000\t2500100\tDEL\t0\t+
1\t1600\t1700\tDEL\t0\t+
1\t1800\t1900\tTRA\t0\t+
1\t1400\t1600\tDEL\t0\t+
1\t3100\t3200\tDEL\t0\t+
''')

In [37]:
from enum import Enum
from pybedtools import BedTool
from collections import defaultdict, OrderedDict
import pandas as pd
from svcaller.calling.events import SvType


class SvEffect(Enum):
    NO_OVERLAP = "NO_OVERLAP"
    OVERLAP_WITH_EFFECT = "OVERLAP_WITH_EFFECT"
    OVERLAP_UNKNOWN_EFFECT = "OVERLAP_UNKNOWN_EFFECT"
    GENE_FUSION = "GENE_FUSION"


class GeneClass(Enum):
    TUMOUR_SUPRESSOR = "TUMOUR_SUPRESSOR"
    AR = "AR"
    FUSION_CANDIDATE = "FUSION_CANDIDATE"

In [4]:
def extract_groups(bed_filename):
    df = pd.read_table(bed_filename, header=None, sep="\t")
    return {name: table for name, table in df.groupby(df[3])}

In [5]:
list(extract_groups('SVs.bed').values())[0]

Unnamed: 0,0,1,2,3,4,5
0,21,1500000,1500100,DEL,0,+
1,21,2500000,2500100,DEL,0,+
2,1,1600,1700,DEL,0,+
4,1,1400,1600,DEL,0,+
5,1,3100,3200,DEL,0,+


In [6]:
def predict_effects(svs_filename, ts_filename, ar_filename, fusion_filename):
    """
    Predict the consequence of the specified structural variants on the specified
    tumour suppressors, androgen receptor, and gene fusion candidate.

    :param svs_filename: Location of bed file specifying the structural variant coordinates.
    :param ts_filename: Location of bed file specifying the tumour supressor gene region coords.
    :param ar_filename: Location of bed file specifying Androgren Receptor gene region coords.
    :param fusion_filename: Location of bed file specifying two broad gene fusion region.

    :return: A dictionary with gene class as key and results dictionary as value
    """

    gene_classes = [enum.value for enum in list(GeneClass)]

    svs_bed = extract_groups(svs_filename)

    gene_to_bed_tables = [extract_groups(filename) for filename in
                         [ts_filename, ar_filename, fusion_filename]]

    gene_class_to_gene_region_bed = dict(zip(gene_classes, gene_to_bed_tables))

    gene_class_to_results = {}
    for gene_class, gene_region_bed in gene_class_to_gene_region_bed.items():
        gene_class_to_results[gene_class] = \
            predict_effects_for_class(svs_bed, gene_class, gene_region_bed)
    
    return gene_class_to_results

In [7]:
def predict_del_effect(sv, functional_regions):
    return None

def predict_inv_effect_tumour_suppressor(sv, functional_regions):
    return None

def predict_dup_effect_tumour_suppressor(sv, functional_regions):
    return None

def predict_tra_effect_tumour_suppressor(sv, functional_regions):
    return None

def predict_del_effect(sv, functional_regions):
    return None

def predict_inv_effect_ar(sv, functional_regions):
    return None

def predict_dup_effect_ar(sv, functional_regions):
    return None

def predict_tra_effect_ar(sv, functional_regions):
    return None

In [8]:
def filter_svs(svs_table, sv_type, gene_coords):
    svs_table[3] == sv_type
    return None

In [32]:
def sv_in_regions(sv, regions):
    return (sv.iloc[0,0] == regions.iloc[0,0]) & \
           (sv.iloc[0,2] > min(regions.iloc[:,1])) & \
           (sv.iloc[0,1] < max(regions.iloc[:,2]))

In [10]:
def predict_effects_for_class(svs_table, gene_class, gene_to_table):
    # Data structure for emulating switch statement:
    sv_and_scenario_to_function = {
        (SvType.DEL, GeneClass.TUMOUR_SUPRESSOR): predict_del_effect,
        (SvType.INV, GeneClass.TUMOUR_SUPRESSOR): predict_inv_effect_tumour_suppressor,
        (SvType.DUP, GeneClass.TUMOUR_SUPRESSOR): predict_dup_effect_tumour_suppressor,
        (SvType.TRA, GeneClass.TUMOUR_SUPRESSOR): predict_tra_effect_tumour_suppressor,
        (SvType.DEL, GeneClass.AR): predict_del_effect,
        (SvType.INV, GeneClass.AR): predict_inv_effect_ar,
        (SvType.DUP, GeneClass.AR): predict_dup_effect_ar,
        (SvType.TRA, GeneClass.AR): predict_tra_effect_ar,
    }

    gene_to_effects = defaultdict(list)
    for gene in gene_to_table:
        gene_regions = gene_to_table[gene]
        for sv_type in [type_ for type_ in SvType]:
            # Filter the SVs to those which overlap the overall gene region:
            svs = svs_table.get(sv_type.value, pd.DataFrame({}))
            
            def sv_in_regions_tmp(sv):
                return sv_in_regions(sv, gene_regions)

            svs_of_interest = svs[svs.apply(sv_in_regions_tmp, axis=1)]
            
            # Retrieve the relevant predictor function:
            predictor_function = sv_and_scenario_to_function[(sv_type, gene_class)]

            # Apply that function to each SV:
            for _, sv_row in svs_of_interest.iterrows():
                gene_to_effects[gene].append(predictor_function(sv_row, gene_regions))

    return gene_to_effects

In [40]:
svs_table = extract_groups('SVs.bed')
ts_table = extract_groups('TumourSupressors.bed')

predict_effects_for_class(svs_table, GeneClass.TUMOUR_SUPRESSOR, ts_table)

IndexingError: ('Too many indexers', 'occurred at index 0')

In [39]:
sv = pd.DataFrame(
    OrderedDict({
    "chrom": [21],
    "start": [1500000],
    "end": [1500100],
    "type": ["DEL"],
    "score": [0],
    "strand": ["+"]
}))

regions = pd.DataFrame(
    OrderedDict({
    "chrom": [2, 2],
    "start": [500, 1500],
    "end": [1000, 2000],
    "gene": ["TS2", "TS2"]
}))

#(sv[0] == regions[0][0])# & \
#(sv[2] > min(regions[1])) & \
#(sv[1] < max(regions[2]))

sv_in_regions(sv, regions)

False

In [34]:
%debug

> [0;32m<ipython-input-32-393526bccf30>[0m(2)[0;36msv_in_regions[0;34m()[0m
[0;32m      1 [0;31m[0;32mdef[0m [0msv_in_regions[0m[0;34m([0m[0msv[0m[0;34m,[0m [0mregions[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m----> 2 [0;31m    [0;32mreturn[0m [0;34m([0m[0msv[0m[0;34m.[0m[0miloc[0m[0;34m[[0m[0;36m0[0m[0;34m,[0m[0;36m0[0m[0;34m][0m [0;34m==[0m [0mregions[0m[0;34m.[0m[0miloc[0m[0;34m[[0m[0;36m0[0m[0;34m,[0m[0;36m0[0m[0;34m][0m[0;34m)[0m [0;34m&[0m            [0;34m([0m[0msv[0m[0;34m.[0m[0miloc[0m[0;34m[[0m[0;36m0[0m[0;34m,[0m[0;36m2[0m[0;34m][0m [0;34m>[0m [0mmin[0m[0;34m([0m[0mregions[0m[0;34m.[0m[0miloc[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m[0;36m1[0m[0;34m][0m[0;34m)[0m[0;34m)[0m [0;34m&[0m            [0;34m([0m[0msv[0m[0;34m.[0m[0miloc[0m[0;34m[[0m[0;36m0[0m[0;34m,[0m[0;36m1[0m[0;34m][0m [0;34m<[0m [0mmax[0m[0;34m([0m[0mregions[0m[0;34m.[0m[0mi