In [None]:
import numpy as np
import pandas as pd
from __future__ import division
%pylab inline
import glob
import seaborn as sns
import matplotlib.ticker as ticker
import re
sns.set_style('ticks')
sns.set_context('paper')
from scipy.stats import ks_2samp
import json
import glob
import os

In [None]:
def iter_palindrome(fn):
    """Lazily parse an EMBOSS palindrome report, one record per palindrome arm.

    Each palindrome in the report is read as a first arm line
    (``start sequence end``), a row of ``|`` match markers, and a second arm
    line. As soon as the second arm has been read, one tuple is yielded for
    every arm collected for that palindrome.

    Parameters
    ----------
    fn : str
        Path to the EMBOSS palindrome output file.

    Yields
    ------
    tuple
        ``(name, start, end, sequence, identity, gap)``:
        ``name`` is the fixed label ``"read"`` (``None`` when no
        "Palindromes of" header was seen); ``start``/``end`` are ordered so
        that ``start < end`` with ``start`` shifted down by one for Python
        slicing; ``sequence`` is the upper-cased arm; ``identity`` is the
        number of ``|`` markers divided by the arm length; ``gap`` is the
        start of the second arm minus the end of the first.
    """
    with open(fn, 'r') as handle:
        arms = []          # (start, end) pairs of the palindrome being assembled
        arm_seqs = []      # upper-cased sequences, parallel to ``arms``
        name = None
        bars_seen = False  # set once the '|' row of the current palindrome is read

        for raw in handle:
            text = raw.rstrip()

            if 'Palindromes of' in text:
                name = "read"
                continue
            # "key: value" header fields and blank lines hold no records.
            if ':' in text or not text:
                continue
            if '|' in text:
                n_match = text.count('|')
                bars_seen = True
                continue

            fields = text.split()
            first, last = int(fields[0]), int(fields[-1])
            if first > last:
                first, last = last, first
            # Python indexing: only the start moves, the end stays as reported.
            arms.append((first - 1, last))
            arm_seqs.append(fields[1].upper())

            if not bars_seen:
                # Only the first arm so far; wait for the marker row and second arm.
                continue

            gap = arms[1][0] - arms[0][1]
            for (lo, hi), stem in zip(arms, arm_seqs):
                yield name, lo, hi, stem, n_match / len(stem), gap

            # Start collecting the next palindrome from scratch.
            bars_seen = False
            arms = []
            arm_seqs = []
                    

def pal2mat(fn,minstem=5,maxstem=20,mingap=0,maxgap=20,pct_id=0.8,pidscale=False,norm=True):
    """Create a matrix of stem length x gap size from an EMBOSS palindrome file.

    Parameters
    ----------
    fn : str
        Path to the EMBOSS palindrome output file.
    minstem, maxstem : int
        Inclusive range of stem (arm) lengths that are tabulated.
    mingap, maxgap : int
        Inclusive range of gap sizes that are tabulated.
    pct_id : float
        Minimum identity (matched positions / arm length); records below
        this threshold are skipped.
    pidscale : bool
        If False, every accepted record adds 1 to its cell. If True, it adds
        the record's own identity fraction instead.
    norm : bool
        Kept so existing callers do not break; it is not used.

    Returns
    -------
    (numpy.ndarray, int)
        The matrix, with shape ``(maxstem-minstem+1, maxgap-mingap+1)``, rows
        indexed by ``stem length - minstem`` and columns by ``gap - mingap``,
        and the number of records read from the file (counted before any
        filtering).
    """
    mat = np.zeros((maxstem-minstem+1,maxgap-mingap+1))

    nreads = 0
    for _name, _start, _end, seq, identity, gap in iter_palindrome(fn):
        nreads += 1
        stem_len = len(seq)

        if stem_len < minstem or stem_len > maxstem:
            continue
        if gap < mingap or gap > maxgap:
            continue
        if identity < pct_id:
            continue

        # Weight by this record's identity, not by the constant threshold
        # `pct_id` (which would only rescale the plain counts).
        mat[stem_len-minstem, gap-mingap] += identity if pidscale else 1

    return mat, nreads

def parse_data(fns):
    """Compute the palindromic base-pair density of each EMBOSS palindrome file.

    The length of the DNA region is taken from the filename when it contains
    "emboss" (second-to-last "_"-separated field, for example
    C2_control_0_24561_.fasta.emboss.txt). Otherwise it is looked up from the
    centromere tag (C2, C3, C4, CX, CY) found in the filename.

    Parameters
    ----------
    fns : iterable of str
        Paths to EMBOSS palindrome output files.

    Returns
    -------
    list of float
        For each file, the summed length of all reported palindrome arms
        divided by the region length.

    Raises
    ------
    ValueError
        If a non-"emboss" filename contains none of the known centromere tags.
    """
    # Region lengths used when the length is not encoded in the filename;
    # checked in this order, first match wins.
    known_lengths = (
        ("C2", 24561),
        ("C3", 103827),
        ("C4", 93914),
        ("CX", 70180),
        ("CY", 139956),
    )

    data = []
    for fn in fns:
        if "emboss" in fn:
            L = int(fn.split("_")[-2])
        else:
            for tag, length in known_lengths:
                if tag in fn:
                    L = length
                    break
            else:
                # Without this, L would be undefined for the first file or
                # silently reused from the previous file.
                raise ValueError(
                    "Cannot determine region length for %r: no 'emboss' length "
                    "field and no known centromere tag in the filename" % (fn,))

        x = sum(len(seq) for name, s, e, seq, cvg, gap in iter_palindrome(fn))
        data.append(x/L)
    return data


def parse_data_number(fns):
    """Compute the number density of palindrome records for each EMBOSS file.

    The length of the DNA region must be encoded in the filename as the
    second-to-last "_"-separated field, for example:
    C2_control_0_24561_.fasta.emboss.txt

    Unlike the base-pair density, this counts records rather than summing
    their lengths.

    Parameters
    ----------
    fns : iterable of str
        Paths to EMBOSS palindrome output files.

    Returns
    -------
    list of float
        For each file, (number of records / region length) * 100.
    """
    densities = []
    for path in fns:
        region_len = int(path.split("_")[-2])
        n_records = sum(1 for _ in iter_palindrome(path))
        densities.append((n_records / region_len) * 100)
    return densities

def parse_data_ks(fns):
    """Build a per-position palindrome coverage array for each EMBOSS file.

    The length of the DNA region is taken from the filename when it contains
    "emboss" (second-to-last "_"-separated field, for example
    C2_control_0_24561_.fasta.emboss.txt). Otherwise it is looked up from the
    centromere tag (C2, C3, C4, CX, CY) found in the filename.

    Parameters
    ----------
    fns : iterable of str
        Paths to EMBOSS palindrome output files.

    Returns
    -------
    list of numpy.ndarray
        One 1-D array per file, as long as the region; each element counts
        how many palindrome arm records cover that position.

    Raises
    ------
    ValueError
        If a non-"emboss" filename contains none of the known centromere tags.
    """
    # Region lengths used when the length is not encoded in the filename;
    # checked in this order, first match wins.
    known_lengths = (
        ("C2", 24561),
        ("C3", 103827),
        ("C4", 93914),
        ("CX", 70180),
        ("CY", 139956),
    )

    data = []
    for fn in fns:
        if "emboss" in fn:
            L = int(fn.split("_")[-2])
        else:
            for tag, length in known_lengths:
                if tag in fn:
                    L = length
                    break
            else:
                # Without this, L would be undefined for the first file or
                # silently reused from the previous file.
                raise ValueError(
                    "Cannot determine region length for %r: no 'emboss' length "
                    "field and no known centromere tag in the filename" % (fn,))

        x = np.zeros(L)
        for name, s, e, seq, cvg, gap in iter_palindrome(fn):
            x[s:e] += 1
        data.append(x)
    return data


In [None]:
# Write the first control sequence of a single centromere to its own FASTA
# file, creating the output folders on demand.
cen_of_focus = "C2"

control_dna = "control_dna"
control_dna_as_files = 'control_dna_as_files'

if not os.path.isdir(control_dna_as_files):
    os.mkdir(control_dna_as_files)

dump_folder = os.path.join(control_dna_as_files, cen_of_focus)

if not os.path.isdir(dump_folder):
    os.mkdir(dump_folder)

# <control_dna>/<centromere>.txt holds a JSON list of control sequences.
file_of_list = os.path.join(control_dna, "".join([cen_of_focus, '.txt']))
# `with` closes the file even if json.load raises.
with open(file_of_list, "r") as list_control_dna:
    controls = json.load(list_control_dna)

for index, value in enumerate(controls):
    # The sequence length goes into the filename so later parsing can recover it.
    L = str(len(value))
    dump_filename = os.path.join(dump_folder, ''.join([cen_of_focus, '_control_', str(index), '_', L, '_', '.fasta']))
    with open(dump_filename, 'w') as fb:
        fb.write(value)
    # Only the first control is written in this cell.
    break

In [None]:
def create_files_of_control_dna_for_emboss(control_dna = "control_dna", centromeres = None, output_dir = 'control_dna_as_files'):
    """Split the per-centromere control lists into one file per control sequence.

    With the default arguments this function has to be run in the
    inverted_repeats folder. For every centromere it reads
    ``<control_dna>/<centromere>.txt`` (a JSON list of DNA strings) and writes
    each sequence to
    ``<output_dir>/<centromere>/<centromere>_control_<index>_<length>_.fasta``
    so the files can be run with EMBOSS palindrome.

    Parameters
    ----------
    control_dna : str
        Folder holding the JSON control lists, one ``.txt`` file per centromere.
    centromeres : iterable of str, optional
        Centromere names to process. Defaults to C2, C3, C4, CX and CY.
    output_dir : str
        Folder that receives one sub-folder per centromere; created if missing.
    """
    if centromeres is None:
        centromeres = ['C2', 'C3', 'C4', 'CX', 'CY']

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for cen_of_focus in centromeres:
        dump_folder = os.path.join(output_dir, cen_of_focus)

        if not os.path.isdir(dump_folder):
            os.mkdir(dump_folder)

        file_of_list = os.path.join(control_dna, "".join([cen_of_focus, '.txt']))
        # `with` closes the file even if json.load raises.
        with open(file_of_list, "r") as list_control_dna:
            controls = json.load(list_control_dna)

        for index, value in enumerate(controls):
            # The sequence length goes into the filename so later parsing can recover it.
            L = str(len(value))
            dump_filename = os.path.join(dump_folder, ''.join([cen_of_focus, '_control_', str(index), '_', L, '_', '.fasta']))
            with open(dump_filename, 'w') as fb:
                fb.write(value) # json.dump puts in the quotes of a string

In [None]:
# Write the per-control FASTA files for every centromere, using the default folders.
create_files_of_control_dna_for_emboss()

# Palindrome analysis Data

In [None]:
import numpy as np
import pandas as pd
from __future__ import division
%pylab inline
import glob
import seaborn as sns
import matplotlib.ticker as ticker
import re
sns.set_style('ticks')
sns.set_context('paper')
from scipy.stats import ks_2samp
import json
import glob
import os

In [None]:
# Per-position palindrome coverage (parse_data_ks) for one centromere and for
# its control regions; each set is also exported as a CSV file.
hue = []
data = []
cen_data = []
con_data = []
names = []
minl = 0
cens = ['2', '3', '4', 'X', 'Y']

# for centromere in cens:
centromere = '3'
fns = glob.glob('cen/dyad/C'+centromere+'/*.dyad.txt')
sdata = parse_data_ks(fns)
cen_data.extend(sdata)
hue.extend([1]*len(sdata))
names.extend(['Cen']*len(sdata))
# Export the coverage that was just parsed (sdata); `data` is never filled in
# this cell, so saving it would write an empty file.
np.savetxt("".join(["control/csv-results/", 'dyad-centromere-', centromere, '-temperature-', '.csv']), sdata, delimiter=",")

# NOTE(review): `names` is reset here but `hue` is not, so after this cell the
# two lists differ in length -- confirm that this is intended.
data = []
names = []

fns = glob.glob('control/dyad/C'+centromere+'/*.dyad.txt')    
sdata = parse_data_ks(fns)
con_data.extend(sdata)
hue.extend([0]*len(sdata))
names.extend(['Control']*len(sdata))
# Same export for the control regions.
np.savetxt("".join(["control/csv-results/", 'dyad-control-', centromere, '-temperature-', '.csv']), sdata, delimiter=",")

In [None]:
# Statistical test: compare the palindrome base-pair density of the control
# regions with that of the centromere.
from scipy.stats import ttest_1samp

minl = 0
cens = ['2', '3', '4', 'X', 'Y']
data = []

# for centromere in cens:
centromere = 'Y'

# Centromere densities (labelled 1 / 'Cen').
fns = glob.glob('cen/dyad/C' + centromere + '/*.dyad.txt')
sdata = parse_data(fns)
cen_data = list(sdata)
hue = [1] * len(sdata)
names = ['Cen'] * len(sdata)
# np.savetxt("".join(["control/csv-results/", 'dyad-centromere-', centromere, '-temperature-', '.csv']), data, delimiter=",")

# data = []
# names = []

# Control densities (labelled 0 / 'Control').
fns = glob.glob('control/dyad/C' + centromere + '/*.dyad.txt')
sdata = parse_data(fns)
con_data = list(sdata)
hue += [0] * len(sdata)
names += ['Control'] * len(sdata)
# np.savetxt("".join(["control/csv-results/", 'dyad-control-', centromere, '-temperature-', '.csv']), data, delimiter=",")

# NOTE(review): the second argument of ttest_1samp is the null-hypothesis mean;
# passing the list cen_data presumably relies on it holding a single value --
# confirm against the installed SciPy version.
ttest_1samp(con_data, cen_data)