In [None]:
import numpy as np
import pandas as pd
from __future__ import division
%pylab inline
import glob
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import re
sns.set_style('ticks')
sns.set_context('paper')
from scipy.stats import ks_2samp
import json
import glob
import os
from scipy.stats import ks_2samp

In [None]:

def parse_sist(fn):
    """Parse raw SIST output; return an array of positions and probabilities.

    Parameters
    ----------
    fn : str
        Path to a comma-separated SIST result file.

    Returns
    -------
    numpy.ndarray
        float64 array with one row per data line and one column per
        comma-separated field.

    Raises
    ------
    ValueError
        If a retained line contains a field that cannot be converted to float.

    Notes
    -----
    Lines containing 'Position' (the header) or 'WARNING' are skipped, and so
    are blank lines, which would otherwise produce an unparsable [''] row.
    """
    rows = []
    with open(fn, 'r') as f:
        for line in f:
            line = line.strip()
            # Header, warning and empty lines carry no numeric data.
            if not line or 'Position' in line or 'WARNING' in line:
                continue
            rows.append(line.split(","))
    return np.array(rows, dtype="float64")

def parse_sist_melt(fn):
    """Return column index 1 of the parsed SIST file `fn` as a 1-D array.

    When the path contains 'C2', only the first 155000 rows are returned.
    NOTE(review): the reason for the 155000 cut-off is not visible here;
    presumably it bounds the region of interest for centromere 2 -- confirm.
    """
    melt_column = parse_sist(fn)[:, 1]
    if 'C2' not in fn:
        return melt_column
    return melt_column[:155000]

def parse_sist_Z(fn):
    """Return column index 2 of the parsed SIST file `fn` as a 1-D array.

    When the path contains 'C2', only the first 155000 rows are returned.
    NOTE(review): the reason for the 155000 cut-off is not visible here;
    presumably it bounds the region of interest for centromere 2 -- confirm.
    """
    z_column = parse_sist(fn)[:, 2]
    if 'C2' not in fn:
        return z_column
    return z_column[:155000]

def parse_sist_cruciform(fn):
    """Return column index 3 of the parsed SIST file `fn` as a 1-D array.

    When the path contains 'C2', only the first 155000 rows are returned.
    NOTE(review): the reason for the 155000 cut-off is not visible here;
    presumably it bounds the region of interest for centromere 2 -- confirm.
    """
    cruciform_column = parse_sist(fn)[:, 3]
    if 'C2' not in fn:
        return cruciform_column
    return cruciform_column[:155000]

## Average non-B formation probability

In [None]:
# For one centromere, print per temperature the summed mean probabilities
# (Z + cruciform + melt) of the centromere files and of the control files.
hue = []
data = []
names = []
cens = ["2", "3", "4", "X", "Y"]
temps = ["18", "22", "25", "30", "35"]
# Change this value (one of `cens`) to analyse a different centromere.
centromere = "Y"

# Column parsers, in the order Z, cruciform, melt; `hue` is filled in this order.
structure_parsers = [parse_sist_Z, parse_sist_cruciform, parse_sist_melt]

for temperature in temps:
    all_temps_cen = []
    all_temps_con = []

    # The file lists do not depend on the structure type, so glob once per temperature.
    cen_fns = glob.glob('cen/sist-results/C'+centromere+'/T'+temperature+'/*.sist.csv')
    con_fns = glob.glob('control/sist-results/C'+centromere+'/T'+temperature+'/*.sist.csv')

    for parse_column in structure_parsers:
        cen_means = [np.mean(parse_column(fn)) for fn in cen_fns]
        con_means = [np.mean(parse_column(fn)) for fn in con_fns]

        # One entry per centromere file and structure type.
        all_temps_cen.extend(cen_means)
        # Average over the control files only. This used to be np.mean(data[1:]),
        # which is correct only when exactly one centromere file was found.
        # With no control files this is nan (numpy emits a RuntimeWarning).
        all_temps_con.append(np.mean(con_means))

        # Keep the module-level bookkeeping lists populated as before.
        data = cen_means + con_means
        names = ['Cen'] * len(cen_means) + ['Control'] * len(con_means)
        hue.extend([1] * len(cen_means) + [0] * len(con_means))

    print('\nFor temperature:', temperature)
    print("Cen:", np.sum(all_temps_cen))
    print("Control:", np.sum(all_temps_con))


## Enrichment analysis

In [None]:
# For every structure type, centromere and temperature, write the per-file mean
# probability of the centromere files and of the control files to CSV.
hue = []
data = []
names = []
cens = ["2", "3", "4", "X", "Y"]
temps = ["18", "22", "25", "30", "35"]

# NOTE(review): centromere results are also written below "control/csv-results/",
# exactly as before -- confirm that a single shared output directory is intended.
csv_dir = "control/csv-results/"
# np.savetxt does not create directories; make sure the target exists before
# spending time on parsing.
if not os.path.isdir(csv_dir):
    os.makedirs(csv_dir)

# (file-name prefix, column parser), in the order the CSV files are written.
structure_parsers = [
    ('melt', parse_sist_melt),
    ('Z', parse_sist_Z),
    ('cruc', parse_sist_cruciform),
]
# (input root directory, label used in the file name, `names` entry, `hue` flag)
sample_groups = [
    ('cen', 'centromere', 'Cen', 1),
    ('control', 'control', 'Control', 0),
]

for prefix, parse_column in structure_parsers:
    # `centromere` and `temperature` keep their names because they remain
    # defined after the loops and the KS cell further down reads them.
    for centromere in cens:
        for temperature in temps:
            for root, label, group_name, flag in sample_groups:
                fns = glob.glob(root+'/sist-results/C'+centromere+'/T'+temperature+'/*.sist.csv')
                # One mean probability per input file.
                data = [np.mean(parse_column(fn)) for fn in fns]
                names = [group_name] * len(data)
                hue.extend([flag] * len(data))
                np.savetxt("".join([csv_dir, prefix, '-', label, '-', centromere, '-temperature-', temperature, '-.csv']), data, delimiter=",")

In [None]:
# This is to get the KS values: the mean two-sample KS p-value between the
# centromere cruciform column and each control cruciform column.
# NOTE(review): `centromere` and `temperature` are not assigned in this cell;
# they are whatever earlier cells left behind (after the enrichment loops, the
# last elements of `cens` and `temps`). Assign them here explicitly, e.g.
# centromere = '4'; temperature = '25', to analyse a specific combination.
p_values = []
cen_data = None

cen_fns = glob.glob('cen/sist-results/C'+centromere+'/T'+temperature+'/*.sist.csv')
if not cen_fns:
    # Fail with a clear message instead of passing None to ks_2samp.
    raise ValueError(
        'no centromere SIST file found for C' + centromere + ' / T' + temperature
    )
# As before, when several files match only the last one is used; it is now the
# only one parsed.
cen_data = parse_sist_cruciform(cen_fns[-1])

fns = glob.glob('control/sist-results/C'+centromere+'/T'+temperature+'/*.sist.csv')
for fn in fns:
    # Element [1] of the ks_2samp result is the p-value.
    pval = ks_2samp(cen_data, parse_sist_cruciform(fn))[1]
    p_values.append(pval)

# Cell output: mean p-value over all control files (nan if none were found).
np.mean(np.array(p_values))