In [23]:

import glob
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib as mpl
import scipy as sp
import itertools
from collections import defaultdict
import seaborn as sns
import glob
import json
from bs4 import BeautifulSoup
import codecs
import sys
from Bio import SeqIO, pairwise2, Seq


In [38]:
# Relative paths to the input data directories used by the cells below.
# NOTE(review): ice_dpth starts with '../' while the other two paths do not —
# confirm which working directory this notebook is meant to be run from.
ice_dpth = '../data/ICE/v1_2/'  # not referenced in the visible cells
ngs_dpth = 'data/NGS/'  # globbed for '*.seq' files by the NGS import cell
tide_pth = 'data/TIDE/tide_output/'  # globbed for '*.htm' files by the TIDE import cell

# Helper Functions

In [6]:
### 

def get_efficiency(text):
    """Extract the overall efficiency value from the text of a TIDE report page.

    The first occurrence of ``'overall efficiency = '`` is located and the
    number that immediately follows it is parsed in full, instead of a fixed
    four-character window.

    Parameters
    ----------
    text : str
        Plain text of the report page.

    Returns
    -------
    float
        The number following the marker.

    Raises
    ------
    ValueError
        If the marker is absent or is not followed by a number.
    """
    parse_str = 'overall efficiency = '
    marker_pos = text.find(parse_str)
    if marker_pos < 0:
        # find() returns -1 when the marker is missing; slicing from there
        # would silently parse an unrelated part of the page.
        raise ValueError("marker %r not found in text" % parse_str)

    number_re = re.compile(r'\s*([-+]?(?:\d+\.?\d*|\.\d+))')
    found = number_re.match(text, marker_pos + len(parse_str))
    if found is None:
        raise ValueError("no number follows marker %r" % parse_str)

    return float(found.group(1))

def get_r2(text):
    """Extract the R² value from the text of a TIDE report page.

    The first occurrence of ``'R² = '`` is located and the number that
    immediately follows it is parsed in full, instead of a fixed
    four-character window.

    Parameters
    ----------
    text : str
        Plain text of the report page.

    Returns
    -------
    float
        The number following the marker.

    Raises
    ------
    ValueError
        If the marker is absent or is not followed by a number.
    """
    parse_str = 'R² = '
    marker_pos = text.find(parse_str)
    if marker_pos < 0:
        # find() returns -1 when the marker is missing; slicing from there
        # would silently parse an unrelated part of the page.
        raise ValueError("marker %r not found in text" % parse_str)

    number_re = re.compile(r'\s*([-+]?(?:\d+\.?\d*|\.\d+))')
    found = number_re.match(text, marker_pos + len(parse_str))
    if found is None:
        raise ValueError("no number follows marker %r" % parse_str)

    return float(found.group(1))

def get_indels(text):
    """Extract the indel table from the text of a TIDE report page.

    The table is the text between the header ``'\\npercentage\\npvalue'`` and
    the trailer ``'\\n×\\nSave Experiment'``. Its non-empty lines are read as
    repeating triples of (indel size, percentage, p-value).

    Parameters
    ----------
    text : str
        Plain text of the report page.

    Returns
    -------
    tuple of (list of str, list of float)
        Indel size labels exactly as they appear in the text, and the
        percentage of each. P-values are checked to be numeric but are not
        returned.

    Raises
    ------
    ValueError
        If either marker is missing, or a percentage or p-value line is not
        a number.
    """
    start_str = '\npercentage\npvalue'
    end_str = '\n×\nSave Experiment'

    header_pos = text.find(start_str)
    if header_pos < 0:
        # Without this check a missing header makes the slice start at an
        # arbitrary offset and unrelated text is parsed as the table.
        raise ValueError("indel table header not found in text")
    start = header_pos + len(start_str)

    # Look for the trailer only after the header so an earlier occurrence
    # cannot produce an empty or inverted slice.
    end = text.find(end_str, start)
    if end < 0:
        raise ValueError("indel table end marker not found in text")

    cells = [line for line in text[start:end].split('\n') if len(line)]

    size = cells[0::3]
    freq = [float(value) for value in cells[1::3]]
    for value in cells[2::3]:
        float(value)  # p-values are validated like before, but not kept

    return size, freq

def simple_length_counting(fname):
    """Build a histogram of length differences from a FASTA file.

    Records whose description contains no comma are taken as the wild-type
    reference and set the reference length. Every other record adds the
    number in its second comma-separated field (last character dropped,
    presumably a trailing '%' — confirm against the input files) to the bin
    ``len(record.seq) - reference length``.

    The reference length starts at 0, so records that appear before any
    reference record are measured against 0.

    Parameters
    ----------
    fname : str
        Path of the FASTA file to read.

    Returns
    -------
    tuple of (defaultdict, int)
        Mapping of length difference to summed percentage, and the last
        reference length seen (0 if none was found, in which case "ERROR"
        and the file name are printed).
    """
    wt_length = 0
    length_hist = defaultdict(int)

    for rec in SeqIO.parse(fname, "fasta"):
        header_fields = rec.description.split(",")

        if len(header_fields) <= 1:
            # No comma in the header: this is the wt reference.
            wt_length = len(rec.seq)
            continue

        share = float(header_fields[1].strip()[:-1])
        length_diff = len(rec.seq) - wt_length
        length_hist[length_diff] += share

    if wt_length == 0:
        print("ERROR", fname)

    return length_hist, wt_length

# Import All Data

In [40]:
# Parse every saved TIDE report page under tide_pth into two tables, both
# indexed by report file name:
#   tide_edits  - 'efficiency' and 'r2' per report
#   tide_indels - percentage per indel size, columns '-10' .. '10' as strings
# Reports whose text contains '\nError\nError' are skipped and stay NaN.
htmls = glob.glob(tide_pth + '*.htm')
labels = [s.split('/')[-1] for s in htmls]
indel_cols = np.arange(-10,11).astype('str')
tide_edits = pd.DataFrame(index = labels, columns = ['efficiency','r2'])
tide_indels = pd.DataFrame(index = labels, columns = indel_cols)
for hpth in htmls:
    hname = hpth.split('/')[-1]

    # Open and parse inside a context manager so the handle is closed as soon
    # as BeautifulSoup has read the page (it was left open before).
    with codecs.open(hpth, 'r') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Drop script and style elements so only visible text remains.
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()

    if '\nError\nError' in text:
        continue

    tide_edits.loc[hname,'efficiency'] = get_efficiency(text)
    tide_edits.loc[hname,'r2'] = get_r2(text)
    sizes,freqs = get_indels(text)
    tide_indels.loc[hname,indel_cols] = freqs
tide_indels = tide_indels.astype("float")

# Append columns '-30' .. '-11' filled with 0.
extra_cols = list(np.arange(-30,-10).astype("str"))
tide_indels = tide_indels.reindex(tide_indels.columns.tolist() + extra_cols, axis=1)
tide_indels.loc[:,extra_cols] = 0

In [24]:
#Get NGS data
files = glob.glob(ngs_dpth+"*.seq")
ngs = pd.DataFrame()
for f in files:
    indel = pd.Series(0,index=np.arange(-30,15))
    indel.update(pd.Series(simple_length_counting(f)[0]))
    indel['position'] = f.split('/')[-1].split('_')[0]
    #indel.name = f.split('/')[-1].split('_')[0]
    ngs = ngs.append(indel,ignore_index=True)

In [39]:
# Display the paths of the '*.htm' files found under tide_pth.
glob.glob(tide_pth + '*.htm')

['data/TIDE/tide_output/posF1_PANK2_1_B;PANK2_R_0.htm',
 'data/TIDE/tide_output/posD6_STK4_2_A;STK4_R_0.htm',
 'data/TIDE/tide_output/posH1_RET_1_B;RET_R_0.htm',
 'data/TIDE/tide_output/posG6_CHUK_2_A;CHUK_R_0.htm',
 'data/TIDE/tide_output/posA1_RYK_1_B;RYK_R_0.htm',
 'data/TIDE/tide_output/posD5_STK3_2_B;STK3_R_0.htm',
 'data/TIDE/tide_output/posE8_MYO3A_3_B;MYO3A_R_0.htm',
 'data/TIDE/tide_output/posF4_PANK2_2_A;PANK2_R_0.htm',
 'data/TIDE/tide_output/posG9_CHUK_3_B;CHUK_R_0.htm',
 'data/TIDE/tide_output/posG8_UCK2_3_A;UCK2_R_0.htm',
 'data/TIDE/tide_output/posD9_STK4_3_B;STK4_R_0.htm',
 'data/TIDE/tide_output/posA4_RYK_2_B;RYK_R_0.htm',
 'data/TIDE/tide_output/posH4_RET_2_B;RET_R_0.htm',
 'data/TIDE/tide_output/posC5_PRKAG3_2_A;PRKAG3_R_0.htm',
 'data/TIDE/tide_output/posB2_PIK3CA_1_B;PIK3CA_R_0.htm',
 'data/TIDE/tide_output/posB9_PIK3CB_3_A;PIK3CB_R_0.htm',
 'data/TIDE/tide_output/posC9_IRAK4_3_B;IRAK4_R_0.htm',
 'data/TIDE/tide_output/posF7_PANK2_3_A;PANK2_R_0.htm',
 'data/TIDE/ti

In [41]:
# Display the paths of the '*.htm' files found under the configured tide_pth
# directory (previously a hard-coded copy of the same path).
glob.glob(tide_pth + '*.htm')

['data/TIDE/tide_output/posF1_PANK2_1_B;PANK2_R_0.htm',
 'data/TIDE/tide_output/posD6_STK4_2_A;STK4_R_0.htm',
 'data/TIDE/tide_output/posH1_RET_1_B;RET_R_0.htm',
 'data/TIDE/tide_output/posG6_CHUK_2_A;CHUK_R_0.htm',
 'data/TIDE/tide_output/posA1_RYK_1_B;RYK_R_0.htm',
 'data/TIDE/tide_output/posD5_STK3_2_B;STK3_R_0.htm',
 'data/TIDE/tide_output/posE8_MYO3A_3_B;MYO3A_R_0.htm',
 'data/TIDE/tide_output/posF4_PANK2_2_A;PANK2_R_0.htm',
 'data/TIDE/tide_output/posG9_CHUK_3_B;CHUK_R_0.htm',
 'data/TIDE/tide_output/posG8_UCK2_3_A;UCK2_R_0.htm',
 'data/TIDE/tide_output/posD9_STK4_3_B;STK4_R_0.htm',
 'data/TIDE/tide_output/posA4_RYK_2_B;RYK_R_0.htm',
 'data/TIDE/tide_output/posH4_RET_2_B;RET_R_0.htm',
 'data/TIDE/tide_output/posC5_PRKAG3_2_A;PRKAG3_R_0.htm',
 'data/TIDE/tide_output/posB2_PIK3CA_1_B;PIK3CA_R_0.htm',
 'data/TIDE/tide_output/posB9_PIK3CB_3_A;PIK3CB_R_0.htm',
 'data/TIDE/tide_output/posC9_IRAK4_3_B;IRAK4_R_0.htm',
 'data/TIDE/tide_output/posF7_PANK2_3_A;PANK2_R_0.htm',
 'data/TIDE/ti

In [None]:
# Copy every '*.htm' file matched by the glob pattern to `dst`.
# NOTE(review): `dst` is not assigned in any visible cell, so this raises
# NameError on a fresh kernel unless it is defined elsewhere — confirm the
# intended destination. shutil.copyfile expects `dst` to be a complete file
# path (not a directory), so a single fixed `dst` would be overwritten on
# every iteration.
# NOTE(review): the glob pattern repeats the literal value of tide_pth.
from shutil import copyfile

for file in glob.glob('data/TIDE/tide_output/*.htm'):
    copyfile(file, dst)