In [1]:
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio.Graphics import GenomeDiagram
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, SimpleLocation, CompoundLocation

In [2]:
import pandas as pd
from pathlib import Path

In [3]:
# Load the two per-gene CDS tables that the feature-building cells below read.
# NOTE(review): data_path is an absolute, machine-specific location; it has to
# be adjusted before this notebook can run on another machine.
data_path = Path("/mnt/d/data/popnet_paper/variant_data/")
plasmo_gene_data, neis_gene_data = (
    pd.read_csv(data_path.joinpath(csv_name))
    for csv_name in ("plasmo_genes.csv", "neis_genes.csv")
)

In [4]:
# plasmo genes: build one 'gene' SeqFeature per gene name.
gene_features = []
for gene, rows in plasmo_gene_data.groupby('name'):
    # The first unique direction / region value is taken as representative of
    # the whole gene (assumes all rows of a gene agree -- TODO confirm).
    is_forward = rows['direction'].unique()[0] == 'forward'
    strand = 1 if is_forward else -1
    qualifiers = {
        # Integer found after the first ':' of the region string; used as the
        # diagram start coordinate when the gene is drawn.
        'segment': int(rows.region.unique()[0].split(':')[1]),
        # Added to 'segment' to obtain the diagram end coordinate.
        'segment_length': 5000,
    }
    # One SimpleLocation per CDS row. Coordinates are cast to int in every
    # case so single-row and multi-row genes are treated identically.
    parts = [
        SimpleLocation(int(r['cds_start']), int(r['cds_end']), strand=strand)
        for r in rows.to_dict('records')
    ]
    # CompoundLocation needs at least two parts, so a single-row gene keeps
    # its plain SimpleLocation.
    location = CompoundLocation(parts) if len(parts) > 1 else parts[0]
    gene_features.append(
        SeqFeature(location, type='gene', id=gene, qualifiers=qualifiers)
    )


In [5]:
# neis genes: build one 'gene' SeqFeature per gene name.
# NOTE: this appends to the gene_features list created in the previous cell;
# re-running this cell on its own appends the same features again.
for gene, rows in neis_gene_data.groupby('name'):
    # The first unique direction / region value is taken as representative of
    # the whole gene (assumes all rows of a gene agree -- TODO confirm).
    is_forward = rows['direction'].unique()[0] == 'forward'
    strand = 1 if is_forward else -1
    qualifiers = {
        # Region value converted to int; used as the diagram start coordinate
        # when the gene is drawn.
        'segment': int(rows.region.unique()[0]),
        # Added to 'segment' to obtain the diagram end coordinate.
        'segment_length': 1000,
    }
    # One SimpleLocation per CDS row. Coordinates are cast to int in every
    # case so single-row and multi-row genes are treated identically.
    parts = [
        SimpleLocation(int(r['cds_start']), int(r['cds_end']), strand=strand)
        for r in rows.to_dict('records')
    ]
    # CompoundLocation needs at least two parts, so a single-row gene keeps
    # its plain SimpleLocation.
    location = CompoundLocation(parts) if len(parts) > 1 else parts[0]
    gene_features.append(
        SeqFeature(location, type='gene', id=gene, qualifiers=qualifiers)
    )

In [6]:
# variant and other gene features
def feature_key(feature):
    """Return the sort rank used to order the extra features of a gene.

    Ranks:
        0 -- any feature whose ``type`` is ``'gene'``
        1 -- any other feature without a truthy ``'is_sig'`` qualifier
        3 -- any other feature whose ``'is_sig'`` qualifier is truthy

    Sorting ascending by this key therefore puts genes first and significant
    variants last (presumably so they are added to the track last -- confirm
    against the drawing code).
    """
    if feature.type == 'gene':
        rank = 0
    elif feature.qualifiers.get('is_sig'):
        rank = 3
    else:
        rank = 1
    return rank

def _variant_feature(record):
    """Build a 'variant' SeqFeature covering [pos, pos + 3) for one variant row."""
    start = int(record['pos'])
    return SeqFeature(
        SimpleLocation(start, start + 3),
        type='variant',
        qualifiers={'is_sig': record['is_sig']},
    )


def _other_gene_feature(record):
    """Build a stranded 'gene' SeqFeature for one row of the other-genes table."""
    strand = 1 if record['direction'] == 'forward' else -1
    return SeqFeature(
        SimpleLocation(int(record['cds_start']), int(record['cds_end']), strand=strand),
        type='gene',
        id=record['name'],
    )


# The variant table sits one directory above data_path; the other-genes table
# sits next to the per-gene tables.
var_data = pd.read_csv(data_path.parent / 'variant_analysis.csv')
other_genes = pd.read_csv(data_path / 'other_genes.csv')

# Maps each target gene name to its extra features (other genes whose 'parent'
# column equals the gene, plus the gene's variants), ordered by feature_key.
var_features_dict = {}
for gene in [*plasmo_gene_data.name.unique(), *neis_gene_data.name.unique()]:
    gene_vars = var_data.loc[var_data.gene == gene]
    var_features = [_variant_feature(r) for r in gene_vars.to_dict('records')]

    og_slice = other_genes.loc[other_genes.parent == gene]
    og_features = [_other_gene_feature(r) for r in og_slice.to_dict('records')]

    # sorted() is stable, so features with the same rank keep this
    # concatenation order (other genes before variants).
    var_features_dict[gene] = sorted(og_features + var_features, key=feature_key)
    


In [7]:
def get_color(feature):
    """Return the colour name used to draw ``feature``.

    Rules, checked in this order:
        * id ends with ``'utr'``            -> ``'white'``
        * type is ``'variant'``             -> ``'red'`` when the ``'is_sig'``
          qualifier is truthy, otherwise ``'yellow'``
        * location strand is ``-1``         -> ``'orange'``
        * anything else                     -> ``'blue'``

    A variant that carries no ``'is_sig'`` qualifier is treated as not
    significant (``'yellow'``) rather than raising ``KeyError``; this matches
    the guarded lookup done by ``feature_key``.
    """
    if feature.id.endswith('utr'):
        return 'white'
    if feature.type == 'variant':
        return 'red' if feature.qualifiers.get('is_sig') else 'yellow'
    # Read the strand from the location itself; SeqFeature.strand is only an
    # alias for it and is flagged as deprecated in recent Biopython releases.
    if feature.location.strand == -1:
        return 'orange'
    return 'blue'

# Build one single-track GenomeDiagram per entry of gene_features. The diagram
# window starts at the gene's 'segment' qualifier and is 'segment_length' wide.
diagrams = []
for g in gene_features:
    segment_start = g.qualifiers['segment']
    gd_diagram = GenomeDiagram.Diagram(
        g.id,
        start=segment_start,
        end=segment_start + g.qualifiers['segment_length'],
        track_size=0.6,
    )
    track = gd_diagram.new_track(
        1, name=g.id,
        scale=1, scale_format='SInt', scale_color=colors.black, greytrack=True,
    )
    feature_set = track.new_set()

    # Drawing options shared by the gene and all of its extra features.
    # Every feature is drawn as a BOX with its label switched off; the label
    # settings are kept so labels can be re-enabled by flipping `label`.
    # The label angle follows the strand of the main gene `g` for every
    # feature on the track. The strand is read from the location because
    # SeqFeature.strand is only a (deprecated) alias for it.
    draw_options = dict(
        sigil="BOX",
        label=False,
        label_size=28,
        label_angle=0 if g.location.strand == 1 else 180,
        label_position='middle',
    )

    # The main gene goes in first, then its extra features in the order
    # stored in var_features_dict (genes without an entry get none).
    feature_set.add_feature(g, color=get_color(g), name=g.id, **draw_options)
    for v in var_features_dict.get(g.id, []):
        feature_set.add_feature(v, color=get_color(v), name=v.id, **draw_options)

    diagrams.append(gd_diagram)

In [8]:
# Labels were turned off when building the diagrams because they were hard to
# position; they are meant to be added to the SVGs manually afterwards.
# Render every diagram as a single-fragment linear drawing and save it as
# '<gene id>.svg' in the 'var_svgs' directory next to data_path.
svg_dir = data_path.parent / 'var_svgs'
# Create the output directory up front so writing does not fail on a fresh
# checkout where it does not exist yet.
svg_dir.mkdir(parents=True, exist_ok=True)
for d, g in zip(diagrams, gene_features):
    d.draw(format='linear', fragments=1)
    # write() is documented as taking a file name string or a handle, so the
    # Path is converted explicitly.
    d.write(str(svg_dir / f"{g.id}.svg"), "SVG")