In [1]:
# Scripts that produce the figure panels originally generated in Python.

In [2]:
### plotting Fig 1C
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
from PIL import Image
import math
import numpy as np
import scipy
import seaborn as sns
from scipy import stats
import statsmodels.stats.multitest as multi

table = 'processed_data/benchmark_GM12878_results.txt'
df = pd.read_table(table)

#### 'random' negative controls are kept only when padj > 0.001 in both runs;
#### every non-control row is kept
for padj in ('padj_S_run1', 'padj_S_run2'):
    is_random = df['S'].str.contains('random')
    df = df[~is_random | (is_random & (df[padj] > 0.001))]

## calc Pearson's r between the two values that are plotted below
df = df.drop_duplicates(subset=['log2FC_ES'])
df1 = df.dropna(subset=['log2FC_ES', 'log2FC_SE'])
# Columns are addressed by name so the result does not depend on the column
# order of the input table.
# NOTE(review): the original selected columns by position (iloc[:, [8, 14]]);
# confirm those positions were log2FC_ES and log2FC_SE.
cor_p = df1['log2FC_ES'].corr(df1['log2FC_SE'], method='pearson')

r1 = 'r = ' + '{:.2f}'.format(cor_p)

## plot
fig = plt.figure(figsize=(4,4), dpi=300)
ax = fig.add_subplot(111, xticks=[-2,0,2,4,6], yticks=[-2,0,2,4,6])
plt.xlim((-2.9,7.5))
plt.ylim((-2.9,7.5))
plt.axhline(y=0, color='lightgrey', linestyle='dotted', zorder=0)
plt.axvline(x=0, color='lightgrey', linestyle='dotted', zorder=0)

## scatter plot (SE on x, ES on y)
plt.scatter(y=df['log2FC_ES'], x=df['log2FC_SE'], s=1, alpha=0.1, color='0.3', zorder=1)

## 45 degree line
x2 = np.arange(-10,10,0.1)
y2 = x2
plt.plot(x2, y2, color='coral', alpha=0.8, zorder=4, linestyle='dotted')
plt.text(-2.3, 6.5, r1, color='0.2')

plt.ylabel('Expression log2 (Fold Change)', color='0.2')
plt.xlabel('Expression log2 (Fold Change)', color='0.2')

## grey frame and ticks
k = '0.4'
for side in ('right', 'left', 'top', 'bottom'):
    ax.spines[side].set_color(k)
ax.tick_params(axis='x', colors=k)
ax.tick_params(axis='y', colors=k)

plt.savefig('Fig1C.png')

ModuleNotFoundError: No module named 'pandas'

In [None]:
### plotting Fig 1D
### plotting duo and log additive model

df = pd.read_table(table)

#### 'random' negative controls are kept only when padj > 0.001 in both runs;
#### every non-control row is kept
for padj in ('padj_S_run1', 'padj_S_run2'):
    is_random = df['S'].str.contains('random')
    df = df[~is_random | (is_random & (df[padj] > 0.001))]

# .copy() so the new columns below are written to an independent frame
df = df.dropna(subset=['log2FC_ES', 'log2FC_SE']).copy()

# single x single interaction term for each run
df['ES_ExS'] = df['log2FC_S_run2']*df['log2FC_E_run2']
df['SE_ExS'] = df['log2FC_S_run1']*df['log2FC_E_run1']


def _fit_ols(X, y):
    """Ordinary least squares of y on the columns of X plus an intercept.

    Parameters: X is a 2-D array (rows = observations), y a 1-D array.
    Returns (intercept, coef, r2): the fitted constant, one coefficient per
    column of X, and the coefficient of determination 1 - SS_res / SS_tot.
    """
    design = np.column_stack([np.ones(len(X)), X])
    beta, _, _, _ = np.linalg.lstsq(design, y, rcond=None)
    residual = y - design @ beta
    r2 = 1.0 - (residual ** 2).sum() / ((y - y.mean()) ** 2).sum()
    return beta[0], beta[1:], r2


# linear regression: S, E and their product as predictors of the pair's activity.
# The original called `lm.LinearRegression()`, but `lm` is not imported in any
# cell shown, and used `.as_matrix()`, which pandas no longer provides; the fit
# is done with numpy instead.
run2 = df.loc[:,['log2FC_S_run1', 'log2FC_E_run1','SE_ExS']].to_numpy()
SE = df['log2FC_SE'].to_numpy()
run3 = df.loc[:,['log2FC_S_run2', 'log2FC_E_run2', 'ES_ExS']].to_numpy()
ES = df['log2FC_ES'].to_numpy()

SE_intercept, SE_coef, SE_r2 = _fit_ols(run2, SE)
ES_intercept, ES_coef, ES_r2 = _fit_ols(run3, ES)
print(SE_coef)
print(ES_coef)

## add a column of the fitted model to plot
df['AP_SE_fit'] = SE_intercept + (df['log2FC_S_run1'] * SE_coef[0]) + (df['log2FC_E_run1'] * SE_coef[1]) + (df['SE_ExS'] * SE_coef[2])
# the interaction term uses the ES model's own coefficient (the original used the SE one here)
df['AP_ES_fit'] = ES_intercept + (df['log2FC_S_run2'] * ES_coef[0]) + (df['log2FC_E_run2'] * ES_coef[1]) + (df['ES_ExS'] * ES_coef[2])


SEscore = 'r^2 = '+ '{:.3f}'.format(SE_r2)
ESscore = 'r^2 = '+ '{:.3f}'.format(ES_r2)
SEfunc = 'SE = '+'{:.3f}'.format(SE_intercept)+' + '+'{:.3f}'.format(SE_coef[0])+'S + '+'{:.3f}'.format(SE_coef[1])+'E + '+'{:.3f}'.format(SE_coef[2])+'ExS'
ESfunc = 'ES = '+'{:.3f}'.format(ES_intercept)+' + '+'{:.3f}'.format(ES_coef[0])+'S + '+'{:.3f}'.format(ES_coef[1])+'E + '+'{:.3f}'.format(ES_coef[2])+'ExS'

## one observed-vs-fitted scatter per orientation
for key in ['ES', 'SE']:
    x = 'AP_%s_fit' % key
    y = 'log2FC_%s' % key

    fig = plt.figure(figsize=(4,4), dpi=300)
    ax = fig.add_subplot(111, xticks=[-2,0,2,4,6], yticks=[-2,0,2,4,6])

    plt.xlim((-2.9,7.5))
    plt.ylim((-2.9,7.5))

    plt.scatter(y=df[y], x=df[x], s=1, alpha=0.1, color='0.3', zorder=1)
    plt.axhline(y=0, color='lightgrey', linestyle='dotted', zorder=0)
    plt.axvline(x=0, color='lightgrey', linestyle='dotted', zorder=0)

    ## 45 degree line
    x2 = np.arange(-10,10,0.1)
    y2 = x2
    plt.plot(x2, y2, color='coral', alpha=0.8, zorder=4, linestyle='dotted')

    plt.ylabel('Expression log2(fold change)', color='0.2')
    plt.xlabel('log additive model', color='0.2')

    k = '0.4'
    for side in ('right', 'left', 'top', 'bottom'):
        ax.spines[side].set_color(k)
    ax.tick_params(axis='x', colors=k)
    ax.tick_params(axis='y', colors=k)

    plt.savefig('Fig1D.linmodel_%s.png' % key)


## write the fit quality and the fitted equations next to the figures
with open("Fig1D.linmodel_score.txt", "w") as results:
    results.write('SEscore: %s\n' % SEscore)
    results.write('%s\n' % SEfunc)
    results.write('ESscore: %s\n' % ESscore)
    results.write('%s\n' % ESfunc)

In [None]:
### plotting Fig 1E

df = pd.read_table(table)

#### every non-control row is kept; 'random' controls only when padj_S_run1 > 0.001
# NOTE(review): the Fig 1C / 1D cells also filter on padj_S_run2; only run1 is
# applied here - confirm this is intended.
is_random = df['S'].str.contains('random')
run1_not_significant = df['padj_S_run1'] > 0.001
df = df[~is_random | (is_random & run1_not_significant)]
df = df.dropna(subset=['log2FC_ES', 'log2FC_SE'])

# annotation labels (not referenced again in this cell)
annoA = {'2.REST','3.CTCF_TAD','4.CTCF_27ac', '5.GFI1', '6.YY1','7.active'}

## add annotation of Silencers
def dscri(x):
    """Return the numbered annotation label for silencer name ``x``.

    The name is tested for a fixed list of substrings, in order; the label of
    the first substring found is returned. Names matching none of them are
    labelled '8.Blocker'.
    """
    # (substring, label) pairs in priority order - the first match wins
    rules = (
        ('CTCF_TAD', '3.CTCF_TAD'),
        ('CTCF_chr', '4.CTCF_27ac'),
        ('pos', '7.active'),
        ('random', '1.random'),
        ('YY1', '6.YY1'),
        ('REST', '2.REST'),
        ('GFI1', '5.GFI1'),
    )
    for needle, label in rules:
        if needle in x:
            return label
    return '8.Blocker'

## label every silencer, discard the unclassified ('8.Blocker') rows, and
## order the rows by label so the box plots come out in label order
disc = dscri
df['annotation_S'] = df['S'].apply(disc)
df = df[df['annotation_S'] != '8.Blocker']

df = df.sort_values('annotation_S', ascending=True)

## one subset per enhancer
low = df.loc[df['E'] == 'En02']
high = df.loc[df['E'] == 'En19']


## one two-panel figure (En02 | En19) per orientation
for library in ['ES', 'SE']:
    key = 'log2FC_%s' % library

    ### median of the '1.random' background controls, per enhancer
    rndmL_med = low.loc[low['annotation_S'] == '1.random', key].median()
    rndmH_med = high.loc[high['annotation_S'] == '1.random', key].median()

    fig = plt.figure(figsize=(3,2.5), dpi=300)

    # horizontal extent of the median line (passed to hlines as xmin / xmax)
    ymin, ymax = -4.5, 8.5
    pl = sns.light_palette("seagreen", n_colors=8)

    # (subplot position, data, background median, enhancer label, y-axis label)
    panels = [
        (121, low, rndmL_med, 'En02', 'expression log2 (fold change)'),
        (122, high, rndmH_med, 'En19', ''),
    ]
    for position, subset, background, enhancer_label, ylabel in panels:
        panel = fig.add_subplot(position, yticks=[-2,0,2,4,6])
        panel.set_aspect(1.4)
        fig.subplots_adjust(left=0.2, bottom=0.2)
        plt.ylim([-2,7])
        plt.hlines([background], ymin, ymax, "coral", linewidth=1, zorder=0, alpha=0.5)
        sns.boxplot(x=subset['annotation_S'], y=subset[key], zorder=2, fliersize=0, palette=pl)
        plt.ylabel(ylabel, color='0.2', fontsize='xx-small')
        plt.xlabel('', color='0.2', fontsize='xx-small')
        plt.xticks(rotation=40, ha='right', fontsize="xx-small")
        plt.yticks(fontsize='xx-small')
        plt.text(0.5, 6, enhancer_label, color='0.2', fontsize='xx-small')

        # grey frame and ticks
        k = '0.4'
        for side in ('right', 'left', 'top', 'bottom'):
            panel.spines[side].set_color(k)
        panel.tick_params(axis='x', colors=k)
        panel.tick_params(axis='y', colors=k)

    fig.savefig('Fig1E.%s.box.pdf' % library, transparent=True)

In [None]:
### plotting Fig 1F

df = pd.read_table(table)

#### every non-control row is kept; 'random' controls only when padj_S_run1 > 0.001
is_random = df['S'].str.contains('random')
df = df[~is_random | (is_random & (df['padj_S_run1'] > 0.001))]
df = df.dropna(subset=['log2FC_ES', 'log2FC_SE'])

#annoA = {'1.CTCF_nonTAD','2.CTCF_TAD','3.GFI1','4.REST', '5.YY1', '6.active'}

### distinct enhancer labels, sorted by the label itself
lp = df['E'].sort_values(ascending=True).drop_duplicates().tolist()

### z-score every construct against the random controls that share its enhancer
# NOTE(review): df was re-read from `table` in this cell, so 'annotation_S' and
# the '7.random_genomic' label must already be present in that file - confirm
# (the labels built in the Fig 1E cell use '1.random' instead).
zcols = ['S', 'E', 'log2FC_E_run1', 'log2FC_E_run2', 'zES', 'zSE']
frames = []

for p in lp:
    df1 = df[df['E'] == p].reset_index(drop=True)
    rndm = df1[df1['annotation_S'] == '7.random_genomic']
    rndm = rndm.dropna(subset=['log2FC_ES'])

    if len(rndm) == 0:
        # no background controls for this enhancer: nothing to normalize against
        continue

    # population standard deviation (ddof=0) of the controls
    df1['zES'] = (df1['log2FC_ES'] - rndm['log2FC_ES'].mean()) / rndm['log2FC_ES'].std(ddof=0)
    df1['zSE'] = (df1['log2FC_SE'] - rndm['log2FC_SE'].mean()) / rndm['log2FC_SE'].std(ddof=0)

    frames.append(df1.loc[:, zcols])

# concatenate once; fall back to an empty table with the same columns
output = pd.concat(frames, axis=0) if frames else pd.DataFrame(columns=zcols)


## box plots of the z-scores of the silencers whose name contains `key`
key = 'REST'

output = output[output['S'].str.contains(key)].dropna(subset=['zES'])

# horizontal extent of the dashed zero line
xmin, xmax = -20,20
fig = plt.figure(figsize=(16,16))

# (z-score column, y-limits, y-axis label), top panel first
panels = [
    ('zES', (-4.5, 4.5), 'zscore ES'),
    ('zSE', (-5, 5), 'zscore SE'),
]
for row, (column, (y_low, y_high), label) in enumerate(panels, start=1):
    plt.subplot(2,1,row)
    sns.boxplot(data=output, x='E', y=column, color="darkgrey", fliersize=0)
    plt.xticks(rotation=30)
    plt.ylim(y_low, y_high)
    plt.hlines([0], xmin, xmax, "0.7", linestyles='dashed', zorder=0)
    plt.ylabel(label)

plt.savefig('Fig1F_box.REST.png')


In [None]:
### plotting Fig 3B

file = 'processed_data/RESTscreen_derived_K562_results.run1.txt'
l2fc = pd.read_table(file)

## keep the rows whose ID contains 'En19'
l2fc = l2fc[l2fc['ID'].str.contains('En19')]

## split into canonical ('_Ref' IDs), negative-control and no-motif sets
ref = l2fc[l2fc['ID'].str.contains('_Ref')]
Neg = l2fc[l2fc['project'] == 'NegCtrl']
l2fc = l2fc[l2fc['project'] == 'noMotif']

### canonical motifs
#### FIMO score per sequence name, looked up through a dictionary
refsc = pd.read_table('processed_data/ref_all_fimo.txt', sep='\t').set_index('sequence name')
refsc_dic = refsc['score'].to_dict()
def Refscore(x):
    """Return the FIMO score stored for sequence ``x``, or 0 when there is none."""
    return refsc_dic.get(x,0)

#### keep canonical elements scoring above 20.86 that have a fold change
idref = ref['ID'].str.split('^',expand=True)
idref.columns = ['enhancer','silencer']   # the ID must split into exactly two parts
ref = pd.concat([ref,idref],axis=1)[['enhancer','silencer','log2FoldChange']].reset_index(drop=True)
ref['score'] = ref['silencer'].apply(Refscore)
ref = ref[ref['score'] > 20.86].dropna(subset= ['log2FoldChange'])

#### no-motif set: same enhancer / silencer split of the ID
ids = l2fc['ID'].str.split('^',expand=True)
ids.columns = ['enhancer','silencer']
l2fc = pd.concat([l2fc,ids],axis=1)[['enhancer','silencer','log2FoldChange']].reset_index(drop=True)


## FIMO result for left and right half motif
left = pd.read_table('processed_data/halfmotif_left_fimo.txt')
right = pd.read_table('processed_data/halfmotif_right_fimo.txt')

### per-sequence lookup of the highest half-motif score
best_hit = {}
for side, hits in (('L', left), ('R', right)):
    # after the descending sort, the first row kept per sequence is its top score
    ranked = hits.loc[:, ['sequence_name','score']].sort_values('score', ascending=False)
    best_hit[side] = ranked.drop_duplicates(subset=['sequence_name']).set_index('sequence_name')
lsc, rsc = best_hit['L'], best_hit['R']

dicL = lsc['score'].to_dict()
def Lscore(x):
    """Return the best left-half score of sequence ``x``, or 0 when it has no hit."""
    return dicL.get(x,0)

dicR = rsc['score'].to_dict()
def Rscore(x):
    """Return the best right-half score of sequence ``x``, or 0 when it has no hit."""
    return dicR.get(x,0)

l2fc['score_L'] = l2fc['silencer'].apply(Lscore)
l2fc['score_R'] = l2fc['silencer'].apply(Rscore)

### make a table for ones with both L and R halfs
both = list(set(left['sequence_name']).intersection(right['sequence_name']))

per_side = {}
for side, hits in (('L', left), ('R', right)):
    shared = hits[hits['sequence_name'].isin(both)]
    # first listed hit per sequence; rows ordered and indexed by sequence name
    shared = shared.drop_duplicates(subset=['sequence_name']).sort_values(by='sequence_name')
    shared = shared.set_index('sequence_name').drop(['motif_id','motif_alt_id','q-value'], axis=1)
    # the six remaining columns get a side suffix: startL, stopL, ... / startR, stopR, ...
    shared.columns = [stem + side for stem in ('start','stop','strand','score','p-value','mat_seq_')]
    per_side[side] = shared
cmn_l, cmn_r = per_side['L'], per_side['R']

df = pd.concat([cmn_l,cmn_r],axis=1)

### filter weak motifs: the two half scores must sum to at least 20.86
df['sum_score'] = df['scoreL'] + df['scoreR']
df = df[df['sum_score']>=20.86]


df = df.reset_index()
# placeholders, filled in by the categorization step
df['gap'] = 'NA'
df['alignment'] = 'NA'
# two-character strand code: left-half strand followed by right-half strand
df['strands'] = df['strandL'] + df['strandR']


## categorize non-canonical motifs
# Label for each strand code (strandL + strandR), depending on which half comes
# first along the sequence. When the right half precedes the left one, every
# label swaps with its mirror image: '++'/'--' swap atypically_spaced/flipped
# and, likewise, '+-'/'-+' swap convergent/divergent. (The original kept
# convergent/divergent unswapped in that case, unlike the Fig 3D cell.)
label_L_first = {'++': 'atypically_spaced', '--': 'flipped', '+-': 'convergent'}
label_R_first = {'++': 'flipped', '--': 'atypically_spaced', '+-': 'divergent'}

gaps, alignments = [], []
for startL, stopL, startR, stopR, strands in zip(
        df['startL'], df['stopL'], df['startR'], df['stopR'], df['strands']):
    if stopL < startR:
        # left half lies entirely before the right half
        gaps.append(startR - stopL - 1)
        alignments.append(label_L_first.get(strands, 'divergent'))
    elif stopR < startL:
        # right half lies entirely before the left half
        gaps.append(startL - stopR - 1)
        alignments.append(label_R_first.get(strands, 'convergent'))
    else:
        # the halves overlap: no gap. NaN (not the string 'NA') keeps the
        # column numeric so the range filter below can compare it.
        gaps.append(np.nan)
        alignments.append('overlap')

df['gap'] = gaps
df['alignment'] = alignments

# NaN gaps fail both comparisons, so overlapping pairs are dropped here
df = df[(df['gap']>=0)&(df['gap']<200)]
df = df.astype({'gap': int})

df = df.set_index('sequence_name')


dic_gap = df['gap'].to_dict()
def gap(x):
    """Return the half-motif gap recorded for sequence ``x``, or 'NA' if none."""
    return dic_gap.get(x,'NA')

dic_algn = df['alignment'].to_dict()
def algn(x):
    """Return the alignment class recorded for sequence ``x``, or 'others' if none."""
    return dic_algn.get(x,'others')

l2fc['gap'] = l2fc['silencer'].apply(gap)
l2fc['alignment'] = l2fc['silencer'].apply(algn)


## sequences without a recorded motif pair: classify by their single half scores.
## Done column-wise so that no loop variable overwrites the `algn` function.
unpaired = (l2fc['alignment'] == 'others').to_numpy()
half_label = np.select(
    [l2fc['score_L'] > 10, l2fc['score_R'] > 10],   # left half is checked first
    ['left_only', 'right_only'],
    default='No_half')
l2fc.loc[unpaired, 'alignment'] = half_label[unpaired]
l2fc.loc[unpaired, 'gap'] = 0

## order the rows by category (-1 for labels missing from the list)
sortl = ['canonical','No_half','atypically_spaced','flipped','convergent','divergent','left_only','right_only']
l2fc['sort'] = l2fc['alignment'].apply(lambda x: sortl.index(x) if x in sortl else -1)
l2fc = l2fc.sort_values(by='sort')

l2fc = l2fc[l2fc['gap'] <= 100]


## add negative control
# assign() builds a new frame, so the columns are not written into a filtered
# slice of the results table (which triggers SettingWithCopyWarning)
Neg = Neg.assign(score_L=0, score_R=0, gap='NA', alignment='NegCtrl')
negmed = Neg['log2FoldChange'].median()

## add canonical
ref = ref.assign(score_L=0, score_R=0, gap='NA', alignment='canonical')

# negative controls first, then canonical, then the no-motif categories
l2fc = pd.concat([Neg, ref, l2fc],axis=0)


## plot
# box plot of fold change per alignment class; the grey line marks the
# negative-control median
fig1 = plt.figure(figsize=(2,2), dpi=300)
plt.xticks(fontsize='xx-small', rotation=20)
plt.yticks(fontsize='xx-small')
plt.ylim((-3,8.5))
plt.ylabel('log2FoldChange', fontsize='xx-small')
sns.boxplot(y=l2fc['log2FoldChange'], x=l2fc['alignment'], linewidth=1, zorder=1, showfliers=False)
plt.axhline(y=negmed, color='grey', alpha=0.5, linewidth=1, zorder=0)
plt.title('nomotifs expression (K562,En19)', fontsize='xx-small')
plt.savefig('Fig3B_box.pdf')


## calc p and adjp values
# two-sided Mann-Whitney U of every class in `sortl` against the negative controls
neg_values = l2fc.loc[l2fc['alignment'] == 'NegCtrl', 'log2FoldChange'].dropna()
p = [
    stats.mannwhitneyu(
        neg_values,
        l2fc.loc[l2fc['alignment'] == category, 'log2FoldChange'].dropna(),
        alternative='two-sided', use_continuity=True).pvalue
    for category in sortl
]

# Benjamini-Hochberg adjustment; element [1] of the result holds the adjusted p-values
adjp = multi.multipletests(np.array(p), alpha=0.05, method='fdr_bh')[1].tolist()
res = pd.Series(adjp, index=sortl)

res.to_csv('Fig3B_stat.txt', index=True, header=False, sep='\t')

In [None]:
### plotting Fig 3C

### import files
left = pd.read_table('processed_data/halfmotif_left_fimo.txt')
right = pd.read_table('processed_data/halfmotif_right_fimo.txt')


### table of the sequences that have a hit for both half motifs
both = list(set(left['sequence_name']).intersection(right['sequence_name']))

per_side = {}
for side, hits in (('L', left), ('R', right)):
    shared = hits[hits['sequence_name'].isin(both)]
    # first listed hit per sequence; rows ordered and indexed by sequence name
    shared = shared.drop_duplicates(subset=['sequence_name']).sort_values(by='sequence_name')
    shared = shared.set_index('sequence_name').drop(['motif_id','motif_alt_id','q-value'], axis=1)
    # the six remaining columns get a side suffix: startL, stopL, ... / startR, stopR, ...
    shared.columns = [stem + side for stem in ('start','stop','strand','score','p-value','mat_seq_')]
    per_side[side] = shared
cmn_l, cmn_r = per_side['L'], per_side['R']

df = pd.concat([cmn_l,cmn_r],axis=1)


### cut off weak motifs: the two half scores must sum to at least 20.86
df['sum_score'] = df['scoreL'] + df['scoreR']
df = df[df['sum_score']>=20.86]

df = df.reset_index()
# placeholders, filled in by the categorization step
df['gap'] = 'NA'
df['alignment'] = 'NA'
# two-character strand code: left-half strand followed by right-half strand
df['strands'] = df['strandL'] + df['strandR']

## categorize non-canonical motifs
# Label for each strand code (strandL + strandR), depending on which half comes
# first along the sequence. When the right half precedes the left one, every
# label swaps with its mirror image: '++'/'--' swap atypically_spaced/flipped
# and, likewise, '+-'/'-+' swap convergent/divergent. (The original kept
# convergent/divergent unswapped in that case, unlike the Fig 3D cell.)
label_L_first = {'++': 'atypically_spaced', '--': 'flipped', '+-': 'convergent'}
label_R_first = {'++': 'flipped', '--': 'atypically_spaced', '+-': 'divergent'}

gaps, alignments = [], []
for startL, stopL, startR, stopR, strands in zip(
        df['startL'], df['stopL'], df['startR'], df['stopR'], df['strands']):
    if stopL < startR:
        # left half lies entirely before the right half
        gaps.append(startR - stopL - 1)
        alignments.append(label_L_first.get(strands, 'divergent'))
    elif stopR < startL:
        # right half lies entirely before the left half
        gaps.append(startL - stopR - 1)
        alignments.append(label_R_first.get(strands, 'convergent'))
    else:
        # the halves overlap: no gap. NaN (not the string 'NA') keeps the
        # column numeric so the range filter below can compare it.
        gaps.append(np.nan)
        alignments.append('overlap')

df['gap'] = gaps
df['alignment'] = alignments

# NaN gaps fail both comparisons, so overlapping pairs are dropped here
df = df[(df['gap']>=0)&(df['gap']<=25)]
df = df.astype({'gap': int})


## plot
# One bin per integer gap 0..25. numpy closes the last histogram bin on both
# sides, so the edges run one past the largest gap; with edges ending at 24 the
# gaps 23 and 24 shared a bar and gap 25 was not counted at all.
bins = np.arange(0, 27, 1)
t = 60   # common upper y-limit of the four panels
df = df[df['alignment'] != 'overlap']
LR = df.loc[df['alignment'] == 'atypically_spaced', 'gap'].values
RL = df.loc[df['alignment'] == 'flipped', 'gap'].values
inw = df.loc[df['alignment'] == 'convergent', 'gap'].values
outw = df.loc[df['alignment'] == 'divergent', 'gap'].values

fig1 = plt.figure(figsize=(4,4), dpi=300)
# figure-level title: calling plt.title before any subplot exists would attach
# it to an implicitly created extra axes instead
fig1.suptitle('gap (bp) between half motifs', color='0.2')

# (subplot position, panel title, gaps of that class)
panels = [
    (221, 'atypically_spaced', LR),
    (222, 'flipped', RL),
    (223, 'convergent', inw),
    (224, 'divergent', outw),
]
for position, panel_title, panel_gaps in panels:
    plt.subplot(position)
    plt.title(panel_title, fontsize='xx-small')
    plt.ylim((0,t))
    plt.xlim((0,25))
    plt.hist(panel_gaps, bins)
    plt.xticks([0,25,50], fontsize='xx-small')
    plt.yticks(fontsize='xx-small')

plt.savefig('Fig3C_hist.pdf' )

In [None]:
### plotting Fig 3D

l2fc = pd.read_table('processed_data/RESTscreen_derived_K562_results.run1.txt')

### select negative controls and no-canonicals among the rows whose ID contains 'En19'
l2fc = l2fc[l2fc['ID'].str.contains('En19')]
Neg = l2fc[l2fc['project'] == 'NegCtrl']
l2fc = l2fc[l2fc['project'] == 'noMotif']


### split the ID on '^' into its enhancer and silencer parts
ids = l2fc['ID'].str.split('^',expand=True)
ids.columns = ['enhancer','silencer']   # the ID must split into exactly two parts
l2fc = pd.concat([l2fc,ids],axis=1)
# 'ID' is retained: the gap table assembled later in this cell selects it by
# name, which raises KeyError if the column has been dropped here
l2fc = l2fc.loc[:,['ID','enhancer','silencer','log2FoldChange']].reset_index(drop=True)

### inport half-scores
left = pd.read_table('processed_data/halfmotif_left_fimo.txt')
right = pd.read_table('processed_data/halfmotif_right_fimo.txt')

#### per-sequence lookup of the highest half-motif score
best_hit = {}
for side, hits in (('L', left), ('R', right)):
    # after the descending sort, the first row kept per sequence is its top score
    ranked = hits.loc[:, ['sequence_name','score']].sort_values('score', ascending=False)
    best_hit[side] = ranked.drop_duplicates(subset=['sequence_name']).set_index('sequence_name')
lsc, rsc = best_hit['L'], best_hit['R']

dicL = lsc['score'].to_dict()
def Lscore(x):
    """Return the best left-half score of sequence ``x``, or 0 when it has no hit."""
    return dicL.get(x,0)

dicR = rsc['score'].to_dict()
def Rscore(x):
    """Return the best right-half score of sequence ``x``, or 0 when it has no hit."""
    return dicR.get(x,0)

l2fc['score_L'] = l2fc['silencer'].apply(Lscore)
l2fc['score_R'] = l2fc['silencer'].apply(Rscore)

### make a dataframe for ones with both L and R halfs
both = list(set(left['sequence_name']).intersection(right['sequence_name']))

per_side = {}
for side, hits in (('L', left), ('R', right)):
    shared = hits[hits['sequence_name'].isin(both)]
    # first listed hit per sequence; rows ordered and indexed by sequence name
    shared = shared.drop_duplicates(subset=['sequence_name']).sort_values(by='sequence_name')
    shared = shared.set_index('sequence_name').drop(['motif_id','motif_alt_id','q-value'], axis=1)
    # the six remaining columns get a side suffix: startL, stopL, ... / startR, stopR, ...
    shared.columns = [stem + side for stem in ('start','stop','strand','score','p-value','mat_seq_')]
    per_side[side] = shared
cmn_l, cmn_r = per_side['L'], per_side['R']

df = pd.concat([cmn_l,cmn_r],axis=1)

### cut off weak motifs: the two half scores must sum to at least 20.86
df['sum_score'] = df['scoreL'] + df['scoreR']
df = df[df['sum_score']>=20.86]

df = df.reset_index()
# placeholders, filled in by the categorization step
df['gap'] = 'NA'
df['alignment'] = 'NA'
# two-character strand code: left-half strand followed by right-half strand
df['strands'] = df['strandL'] + df['strandR']

## categorize non-canonical motifs
# Label for each strand code (strandL + strandR), depending on which half comes
# first along the sequence; any code other than '++', '--' or '+-' takes the
# fallback label of its branch.
label_L_first = {'++': 'atypically_spaced', '--': 'flipped', '+-': 'convergent'}
label_R_first = {'++': 'flipped', '--': 'atypically_spaced', '+-': 'divergent'}

gaps, alignments = [], []
for startL, stopL, startR, stopR, strands in zip(
        df['startL'], df['stopL'], df['startR'], df['stopR'], df['strands']):
    if stopL < startR:
        # left half lies entirely before the right half
        gaps.append(startR - stopL - 1)
        alignments.append(label_L_first.get(strands, 'divergent'))
    elif stopR < startL:
        # right half lies entirely before the left half
        gaps.append(startL - stopR - 1)
        alignments.append(label_R_first.get(strands, 'convergent'))
    else:
        # the halves overlap: no gap. NaN (not the string 'NA') keeps the
        # column numeric so the range filter below can compare it.
        gaps.append(np.nan)
        alignments.append('overlap')

df['gap'] = gaps
df['alignment'] = alignments

# NaN gaps fail both comparisons, so overlapping pairs are dropped here
df = df[(df['gap']>=0)&(df['gap']<200)]
df = df.astype({'gap': int})

df = df.set_index('sequence_name')


dic_gap = df['gap'].to_dict()
def gap(x):
    """Return the half-motif gap recorded for sequence ``x``, or 'NA' if none."""
    return dic_gap.get(x,'NA')

dic_algn = df['alignment'].to_dict()
def algn(x):
    """Return the alignment class recorded for sequence ``x``, or 'others' if none."""
    return dic_algn.get(x,'others')

l2fc['gap'] = l2fc['silencer'].map(gap)
l2fc['alignment'] = l2fc['silencer'].map(algn)

## keep the atypically spaced pairs with a gap of at most 25 bp whose half
## scores sum to more than 20.86. The alignment filter runs first: the rows it
## removes include those whose gap is the string 'NA'.
l2fc = l2fc[l2fc['alignment'] == 'atypically_spaced']
l2fc = l2fc[l2fc['gap'] <= 25]
l2fc = l2fc.sort_values(by='gap').reset_index(drop=True)
strong_pair = (l2fc['score_L'] + l2fc['score_R']) > 20.86
l2fc = l2fc[strong_pair]

## add dummy data to plot gap=5 and gap=6
# reindex() selects the columns and tolerates an absent 'ID' (filled with NaN),
# where .loc would raise KeyError
gap_cols = ['ID','alignment','gap','score_L','score_R','log2FoldChange']
l2fc = l2fc.reindex(columns=gap_cols)
# NOTE(review): the placeholder rows carry log2FoldChange = 7 and are plotted
# like real data by the box plot in this cell - confirm that is intended.
dummies = pd.DataFrame([['dummy','LR',5,0,0,7],
                        ['dummy','LR',6,0,0,7]], columns=gap_cols)
# pd.concat replaces DataFrame.append, which current pandas no longer provides
l2fc = pd.concat([l2fc, dummies], ignore_index=True)

l2fc = l2fc.sort_values(by='gap')

## concatenate long gap motif: every gap of 12 or more becomes the label '>=12'
long_gap = l2fc['gap'] >= 12
l2fc['gap'] = l2fc['gap'].astype(object)   # the column will hold ints and a string
l2fc.loc[long_gap, 'gap'] = '>=12'

## add negative control
# assign() builds a new frame, so the columns are not written into a filtered
# slice of the results table (which triggers SettingWithCopyWarning)
Neg = Neg.assign(score_L=0, score_R=0, gap='Neg', alignment='NegCtrl')
negmed = Neg['log2FoldChange'].median()
Neg = Neg.loc[:,['ID','alignment','gap','score_L','score_R','log2FoldChange']]

l2fc = pd.concat([Neg,l2fc],axis=0)

# categories to plot
l = ['Neg',0,1,2,3,4,5,6,7,8,9,10,11,'>=12']
l2fc = l2fc[l2fc['gap'].isin(l)]

## plot
# box plot of fold change per gap category; the grey line marks the
# negative-control median
fig1 = plt.figure(figsize=(2.25,2), dpi=300)
plt.xlabel('')
plt.xticks(fontsize='xx-small', rotation=30)
plt.yticks(fontsize='xx-small')
plt.ylim((-3,8.5))
sns.boxplot(y=l2fc['log2FoldChange'], x=l2fc['gap'], linewidth=1, zorder=1, showfliers=False)
plt.axhline(y=negmed, color='grey', alpha=0.5, linewidth=1, zorder=0)
plt.title('nomotifs expression (K562,En19)')
plt.savefig('Fig3D_gapbox.pdf')


## calc p and adj values
# Separate names are used for the control values and the tested gaps so that
# neither the `Neg` table nor the `gap` lookup function is overwritten.
neg_values = Neg['log2FoldChange'].dropna().values
p = []
tested_gaps = []
# NOTE(review): gaps of 12 or more were relabelled '>=12' earlier in this cell,
# so of range(26) only 0..11 can match and the pooled category gets no test -
# confirm this is intended.
for i in range(26):
    test = l2fc[l2fc['gap'] == i]
    if len(test) > 3:   # groups of three rows or fewer are not tested
        test = test['log2FoldChange'].dropna().values
        pval = stats.mannwhitneyu(neg_values, test, alternative='two-sided', use_continuity=True).pvalue
        p.append(pval)
        tested_gaps.append(i)

# Benjamini-Hochberg adjustment; element [1] of the result holds the adjusted p-values
adjp = multi.multipletests(np.array(p), alpha=0.05, method='fdr_bh')
adjp = adjp[1].tolist()
res = pd.Series(adjp, index=tested_gaps)

res.to_csv('Fig3D_stat.txt', index=True, header=False, sep='\t')

In [3]:
### plotting Fig 5B

seg = 20.86          # score cutoff used to split the SNPs into "above" / "below"
enhancer = 'En19'


df = pd.read_table('processed_data/RESTscreen_AF_K562_emVAR.out', sep='\t').dropna(subset=['LogSkew'])
# keep the rows whose ID contains the enhancer name, then those containing ':wP'
for token in (enhancer, ':wP'):
    df = df[df['ID'].str.contains(token)]


### making a column for SNP position: the integer that follows ':wP' in the ID
spl = df['ID'].str.split(':wP',expand=True)
spl.columns = ['sil','SNP_pos']   # the ID must split into exactly two parts
df = pd.concat([df, spl[['SNP_pos']]], axis=1)
df['SNP_pos'] = df['SNP_pos'].astype(int)

df = df.reset_index(drop=True)

### calc delta binding score
# FIMO hits of the reference ('R') and alternative ('A') allele sequences,
# reduced to one hit per sequence and indexed by SNP name
allele_hits = {}
for allele, path in (('R', 'processed_data/variants_ref_fimo.txt'),
                     ('A', 'processed_data/variants_alt_fimo.txt')):
    hits = pd.read_table(path, sep='\t')
    hits = hits[hits['start'].isin([91, 92])]
    # descending pattern name decides which hit survives the de-duplication
    # (original comment: prioritize MA0138.2)
    hits = hits.sort_values(by='#pattern name', ascending=False)
    hits = hits.drop_duplicates(subset=['sequence name'])
    # the part of the sequence name before ':<allele>:wP' is the SNP name
    name_parts = hits['sequence name'].str.split(':%s:wP' % allele, expand=True)
    name_parts.columns = ['SNP', 'position']
    allele_hits[allele] = pd.concat([hits, name_parts['SNP']], axis=1).set_index('SNP')

ref, alt = allele_hits['R'], allele_hits['A']
ref_score = ref['score'].to_dict()
ref_strand = ref['strand'].to_dict()
alt_score = alt['score'].to_dict()
alt_strand = alt['strand'].to_dict()


df = df.reset_index(drop=True)

# Which allele tag the construct ID carries decides the assignment: 'R:wP' rows
# take the ref table as score_maj / strand and the alt table as score_min;
# 'A:wP' rows (checked second) take them the other way round. Rows with neither
# tag keep 0 / 0 / 'NA'. Done column-wise, without the names `id` or `s`.
has_ref_tag = df['ID'].str.contains('R:wP', regex=False)
has_alt_tag = ~has_ref_tag & df['ID'].str.contains('A:wP', regex=False)

# per-row dictionary lookups with 0 as the fallback for unknown SNPs
ref_sc = df['SNP'].map(lambda snp: ref_score.get(snp, 0))
alt_sc = df['SNP'].map(lambda snp: alt_score.get(snp, 0))
ref_st = df['SNP'].map(lambda snp: ref_strand.get(snp, 0))
alt_st = df['SNP'].map(lambda snp: alt_strand.get(snp, 0))

df['score_maj'] = ref_sc.where(has_ref_tag, alt_sc.where(has_alt_tag, 0))
df['score_min'] = alt_sc.where(has_ref_tag, ref_sc.where(has_alt_tag, 0))
df['strand'] = ref_st.where(has_ref_tag, alt_st.where(has_alt_tag, 'NA'))


# keep the SNPs scored on both alleles; .copy() so dBS is written to an
# independent frame
df = df[(df['score_maj'] > 0) & (df['score_min'] > 0)].copy()
df['dBS'] = df['score_min'] - df['score_maj']

### orient motifs in a same direction
df = df.reset_index(drop=True)
# '+' rows keep the SNP position, '-' rows are mirrored (202 - position); rows
# with any other strand value get 0. Computed column-wise, which also avoids
# binding the builtin name `str` to a strand value.
df['rel_pos'] = np.select(
    [df['strand'] == '+', df['strand'] == '-'],
    [df['SNP_pos'], 202 - df['SNP_pos']],
    default=0)


### taking absolute value of logSkew
df['abs_skew'] = df['LogSkew'].abs()

### one row per SNP: the one whose oriented position is closest to 101
df['dist'] = abs(101 - df['rel_pos'])
df = df.sort_values(by='dist')
df = df.drop_duplicates(subset=['SNP'])

### keep oriented positions 76..126 (inclusive)
df = df[(df['rel_pos'] <= 126) & (df['rel_pos'] >= 76)]


### separate above and below for line plots
# "above": either allele scores above the cutoff; "below": both score below it
# (a row whose larger score equals the cutoff exactly falls in neither set)
either_above = (df['score_min'] > seg) | (df['score_maj'] > seg)
both_below = (df['score_min'] < seg) & (df['score_maj'] < seg)
above = df[either_above]
below = df[both_below]



### box plots: |LogSkew| per oriented SNP position, one panel per set
fig2 = plt.figure(figsize=(3,3), dpi=300)

panels = [
    dict(position=211, data=above,
         box=dict(color='coral', boxprops=dict(linewidth=1)),
         ylabel='abs_LogSkew_above',
         xlabel=dict(xlabel='')),
    dict(position=212, data=below,
         box=dict(color="darkgrey"),
         ylabel='abs_LogSkew_below',
         xlabel=dict(xlabel='SNP position (relative)', color='0.2', fontsize=4)),
]
for panel in panels:
    axes = plt.subplot(panel['position'])
    axes.set(ylim=(-0.1, 3.5))
    axes.set(xlim=(76, 126))
    sns.boxplot(y=panel['data']['LogSkew'].abs(), x=panel['data']['rel_pos'],
                fliersize=0, **panel['box'])
    plt.ylabel(panel['ylabel'], color='0.2', fontsize=6)
    plt.xlabel(**panel['xlabel'])
    plt.xticks(rotation=90, fontsize=4)
    plt.yticks(fontsize=5)

plt.savefig('Fig5b_abs_skew_box.pdf')

In [None]:
### plotting Fig 5C

### select one result for one SNP (select the closest one to the center)
# Works on the `df` (and, further down, the `seg` cutoff) left by the Fig 5B cell.
### one row per SNP: the one whose raw position is closest to 101
df = df.assign(distance=(101 - df['SNP_pos']).abs())
df = df.sort_values(by='distance').drop_duplicates(subset=['SNP'])


### select "inside" SNPs: raw positions 91..111, both ends included
df = df[df['SNP_pos'].between(91, 111)]

df = df.dropna(subset=['LogSkew'])


### remove outliers of dBS derived from mis score calc: keep -20 < dBS < 20
df = df[df['dBS'].abs() < 20]

### plots
# one scatter (delta binding score vs. LogSkew) per score segment
for segment in ['above', 'below']:

    if segment == 'above':
        # either allele scores above the cutoff
        df1 = df[(df['score_maj'] > seg)| (df['score_min'] > seg)]
    elif segment == 'below':
        # both alleles score below the cutoff
        df1 = df[(df['score_maj'] < seg) & (df['score_min'] < seg)]

    # pearsonr takes 1-D samples; the original passed (n, 1) column arrays
    dbs = df1['dBS'].to_numpy()
    skew = df1['LogSkew'].to_numpy()
    corr, pval = stats.pearsonr(dbs, skew)

    fig = plt.figure(figsize=(2,2), dpi=300)
    ax = fig.add_subplot(111)
    ax.set(ylim=(-4, 4))
    ax.set(xlim=(-11, 11))
    plt.ylabel('LogSkew (Alt-Ref)', color='0.2', fontsize='xx-small')
    plt.xlabel('delta BS (Alt-Ref)', color='0.2',fontsize='xx-small')
    plt.scatter(y=df1['LogSkew'], x=df1['dBS'], s=2, alpha=0.6,color="0.3", zorder=1)
    # the correlation coefficient is printed as-is in the lower-left corner
    plt.text(-10,-3,corr, color='0.2',fontsize='xx-small')

    plt.axhline(y=0, color='0.8', zorder=0)
    plt.axvline(x=0, color='0.8', zorder=0)

    # grey frame and ticks
    k='0.4'
    for side in ('right', 'left', 'top', 'bottom'):
        ax.spines[side].set_color(k)
    ax.tick_params(axis='x', colors=k)
    ax.tick_params(axis='y', colors=k)

    plt.savefig('Fig5C_score_skew.%s.pdf' % segment)