# Construct position-weight matrix for ARM-like motifs

In [1]:
import matplotlib as mpl
# mpl.use('Agg')
# Global matplotlib defaults for every figure in this notebook, applied in one
# update call. Keys, values and ordering mirror the individual assignments.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'text.usetex': False,
    'font.sans-serif': 'Arial',
    'font.family': 'sans-serif',
    'figure.dpi': 300,
    'image.interpolation': 'none',
})

import os, re
from pathlib import Path
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pprint import pprint
import scipy
import seaborn as sns

from Bio import SeqIO
from Bio import motifs

from scipy.interpolate import UnivariateSpline
import matplotlib.transforms as mtransforms
from matplotlib.patches import FancyBboxPatch

import metapredict as meta

%matplotlib inline

In [2]:
# Load the TF table; the 'Subsequence' column holds the candidate motif instances.
tf_df = pd.read_csv('TF_full_update.csv')

# Keep only 9-residue strings (the motif cells below slice around index 4 as the
# middle residue, which assumes 9-mers).
# Blank CSV cells come back from pandas as float NaN, and len(NaN) raises
# TypeError, so non-string entries are skipped explicitly before the length check.
# TODO: look at the other script to see why non-9-mer subsequences are produced.
seqs = [
    x for x in tf_df['Subsequence'].tolist()
    if isinstance(x, str) and len(x) == 9
]

In [26]:
# Build motifs from the filtered 9-mers and export a sequence logo for each.
# Motif docs:  https://biopython-cn.readthedocs.io/zh_CN/latest/en/chr14.html
# Logo options: https://weblogo.readthedocs.io/en/latest/logo.html
# NOTE(review): weblogo() presumably needs network access to render — confirm.

# Amino-acid alphabet shared by both motifs.
AA_ALPHABET = 'ARNDCEQGHILKMFPSTWYV'

# Decorations switched off identically for both logos.
LOGO_FLAGS = dict(
    show_errorbars=False,
    show_xaxis=False,
    show_ends=False,
    show_fineprint=False,
)

# Logo 1: the full 9-mer, drawn on a y-axis scale of 4.
motif1 = motifs.create(seqs, AA_ALPHABET)
motif1.weblogo("seq_logo1.pdf", format="pdf", yaxis_scale=4, **LOGO_FLAGS)

# Logo 2: the same sequences with the middle R (index 4) cut out, drawn on a
# y-axis scale of 0.5.
rm_r = [seq[:4] + seq[5:10] for seq in seqs]
motif2 = motifs.create(rm_r, AA_ALPHABET)
motif2.weblogo("seq_logo2.pdf", format="pdf", yaxis_scale=0.5, **LOGO_FLAGS)

In [8]:
# Normalize motif1's per-position letter counts and print the resulting matrix
# (rows = residues, columns = positions 0-8). No pseudocounts are passed here;
# in the printed rows, column 4 is 1.00 for R and 0.00 for the other residues.
pwm = motif1.counts.normalize()
print(pwm)

        0      1      2      3      4      5      6      7      8
A:   0.06   0.06   0.05   0.05   0.00   0.05   0.05   0.05   0.06
R:   0.18   0.20   0.24   0.22   1.00   0.14   0.24   0.19   0.18
N:   0.03   0.02   0.03   0.02   0.00   0.03   0.02   0.02   0.03
D:   0.03   0.03   0.02   0.02   0.00   0.04   0.02   0.03   0.04
C:   0.01   0.01   0.01   0.01   0.00   0.01   0.01   0.01   0.02
E:   0.06   0.07   0.05   0.06   0.00   0.07   0.05   0.06   0.07
Q:   0.04   0.04   0.04   0.04   0.00   0.05   0.04   0.04   0.03
G:   0.06   0.06   0.06   0.07   0.00   0.07   0.05   0.07   0.06
H:   0.02   0.02   0.04   0.02   0.00   0.04   0.04   0.02   0.02
I:   0.02   0.02   0.02   0.02   0.00   0.03   0.02   0.02   0.02
L:   0.05   0.04   0.06   0.05   0.00   0.05   0.05   0.05   0.05
K:   0.15   0.17   0.16   0.19   0.00   0.12   0.16   0.15   0.13
M:   0.02   0.01   0.01   0.01   0.00   0.01   0.01   0.01   0.01
F:   0.02   0.02   0.02   0.02   0.00   0.02   0.02   0.03   0.02
P:   0.08 