# A collection of methods for motif selection

In [1]:
import os
import json
import numpy as np
import itertools
from pandas import DataFrame
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
%matplotlib inline
# Notebook-wide matplotlib defaults: figure size and base font size.
plt.rcParams['figure.figsize'] = [16.5, 5]
plt.rcParams['font.size'] = 12

In [2]:
# pip install intervaltree
import intervaltree

In [37]:
def make_period_matrix(bases, max_period):
    """
    Builds a 0/1 table of self-matches of a sequence at every offset from 1 to max_period

    The result has one row per offset and one column per base:
    result[offset - 1][pos] is 1 when bases[pos] equals bases[pos + offset], and 0 otherwise
    (including every position too close to the end to have a partner at that offset).
    """
    hits = np.zeros((max_period, len(bases)), dtype=int)

    for offset in range(1, max_period + 1):
        row = hits[offset - 1]
        # Pairing the sequence with a copy of itself shifted by `offset` stops
        # automatically once the shifted copy runs out of bases.
        for pos, (left, right) in enumerate(zip(bases, bases[offset:])):
            if left == right:
                row[pos] = 1

    return hits


def is_nested(tree, begin, stop):
    """
    Checks whether the span from begin to stop lies entirely inside a single interval of tree

    Only the intervals that tree.overlap(begin, stop) reports are examined; the span counts as
    nested when one of them starts at or before begin and ends at or after stop.
    """
    return any(
        candidate.begin <= begin and stop <= candidate.end
        for candidate in tree.overlap(begin, stop)
    )


def find_motifs(bases, min_span=8):
    """
    Finds the motifs of the tandem repeats present in a sequence

    Every candidate period from 1 up to half the sequence length is tried in increasing order.
    For each period the corresponding row of the period matrix is split into runs of equal
    values; a run of matches is accepted when it is at least one period long and
    run length + period is at least min_span. An accepted run is stored in an interval tree as
    (begin, begin + run_length) unless it lies entirely inside an interval stored earlier, so
    a repeat already explained by a shorter period is not reported again.

    bases: the sequence to scan (indexable and sliceable, e.g. a string)
    min_span: smallest accepted value of run length + period (defaults to 8)

    Returns a list with the motif (the first `period` bases of the run) of every stored run,
    ordered by the position of the run in the sequence.
    """
    # Integer division avoids a round trip through float for the maximal period.
    matrix = make_period_matrix(bases, len(bases) // 2)
    tree = intervaltree.IntervalTree()

    for period, row in enumerate(matrix, start=1):
        base_offset = 0
        for symbol, group in itertools.groupby(row):
            run_length = sum(1 for _ in group)
            begin, end = base_offset, base_offset + run_length
            # Advance past this run up front so the guard clauses below can skip freely.
            base_offset = end

            if symbol != 1 or run_length < period or run_length + period < min_span:
                continue
            if not is_nested(tree, begin, end):
                tree.addi(begin, end, bases[begin:begin + period])

    # Iterating the tree directly yields intervals in arbitrary (set) order;
    # sorting makes the result deterministic and ordered by position.
    return [interval.data for interval in sorted(tree)]


# Example: five copies of CAG immediately followed by six copies of CAT.
find_motifs("CAGCAGCAGCAGCAGCATCATCATCATCATCAT")

['CAG', 'CAT']