# KE5006 Applied Research

### Identifying enhancers and their strength with deep neural networks

# Data Pre-processing

## Load libraries

In [1]:
import pandas as pd
import os
import re

## Load data

In [2]:
raw_data_df = pd.read_csv(os.path.join('data', 'enhancer.txt'), delim_whitespace=True, header=None)

In [3]:
raw_data_df.head()

Unnamed: 0,0
0,>CHRX_48897056_48897256
1,CACAATGTAGAAGCAGAGACACAGGAACCAGGCTTGGTGATGGCTC...
2,>CHR12_6444339_6444539
3,GCCCTCACATTCCCTGGCCCATCCCCTCCACCTCAAAATTTACAAA...
4,>CHR12_6444939_6445139


## Pre-processing

Extract the id attribute.

In [4]:
id = raw_data_df.iloc[[i for i in raw_data_df.index if i % 2 == 0], :]
id.reset_index(drop=True, inplace=True)
id = id.applymap(lambda x: x[1:])
id.head()


Unnamed: 0,0
0,CHRX_48897056_48897256
1,CHR12_6444339_6444539
2,CHR12_6444939_6445139
3,CHR12_6445139_6445339
4,CHR12_6445339_6445539


Extract the sequence attribute.

In [5]:
sequence = raw_data_df.iloc[[i for i in raw_data_df.index if i % 2 > 0], :]
sequence.reset_index(drop=True, inplace=True)
sequence.head()

Unnamed: 0,0
0,CACAATGTAGAAGCAGAGACACAGGAACCAGGCTTGGTGATGGCTC...
1,GCCCTCACATTCCCTGGCCCATCCCCTCCACCTCAAAATTTACAAA...
2,GAGCAGGAGGCCAGTCACCCTGAGTCAGCCACGGGGAGACGCTGCA...
3,CCTCTGCTGAGAACAGGACTGGGGCTTCCAGGGCAACAGGAAGGGT...
4,ACAGCCTTAAAGGGAGCTTTTCAGGGACCTCTGGCCAGTGGGGGAT...


Combine the attributes into a data frame.

In [6]:
enhancer_df = pd.concat([id, sequence], axis=1)
enhancer_df.columns = ['id', 'sequence']
enhancer_df.head()

Unnamed: 0,id,sequence
0,CHRX_48897056_48897256,CACAATGTAGAAGCAGAGACACAGGAACCAGGCTTGGTGATGGCTC...
1,CHR12_6444339_6444539,GCCCTCACATTCCCTGGCCCATCCCCTCCACCTCAAAATTTACAAA...
2,CHR12_6444939_6445139,GAGCAGGAGGCCAGTCACCCTGAGTCAGCCACGGGGAGACGCTGCA...
3,CHR12_6445139_6445339,CCTCTGCTGAGAACAGGACTGGGGCTTCCAGGGCAACAGGAAGGGT...
4,CHR12_6445339_6445539,ACAGCCTTAAAGGGAGCTTTTCAGGGACCTCTGGCCAGTGGGGGAT...


In [7]:
len(enhancer_df.iloc[0, 1])

200

Check if all the sequences are the same length.

In [8]:
enhancer_df['sequence'].map(lambda x: len(x) == 200).all()

True

There are 1484 samples of enhancers which is correct according to the reference paper.

In [9]:
enhancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1484 entries, 0 to 1483
Data columns (total 2 columns):
id          1484 non-null object
sequence    1484 non-null object
dtypes: object(2)
memory usage: 23.3+ KB


Save preprocessed data to file.

In [10]:
enhancer_df.to_csv(os.path.join('data', 'enhancer.csv'), index=False)

## Repeat the pre-processing for the other files (except for independent.txt which has a different format)

In [11]:
def preprocess_data(filenames):
    for a_filename in filenames:
        raw_data_df = pd.read_csv(os.path.join('data', a_filename), delim_whitespace=True, header=None)
        id = raw_data_df.iloc[[i for i in raw_data_df.index if i % 2 == 0], :]
        id.reset_index(drop=True, inplace=True)
        id = id.applymap(lambda x: x[1:])
        sequence = raw_data_df.iloc[[i for i in raw_data_df.index if i % 2 > 0], :]
        sequence.reset_index(drop=True, inplace=True)
        preprocessed_df = pd.concat([id, sequence], axis=1)
        preprocessed_df.columns = ['id', 'sequence']
        newfilename = a_filename.split('.')[0] + '.csv'
        preprocessed_df.to_csv(os.path.join('data', newfilename), index=False)        

In [12]:
preprocess_data(['non_enhancer.txt', 'strong.txt', 'weak.txt'])

## Process the file independent.txt

Load the raw data

In [13]:
raw_data_df = pd.read_csv(os.path.join('data', 'independent.txt'), delim_whitespace=True, header=None, skiprows=6)

In [14]:
raw_data_df.head()

Unnamed: 0,0,1,2,3
0,(1),100.0,strong,enhancers
1,>Chr11_6627824_6628024,,,
2,ATGCTGCCAGAAGGAAAAGGGGTGGAATTAATGAAACTGGAAGGTT...,,,
3,TAAAGTATGGGGGCCAAAGTTGGCTATATGCTGGATATGAAGAGGG...,,,
4,TTCTTGAGATAGAAGTCCAGGCCCTGAGGTGGCAGGCAGCCTGATA...,,,


Locate the section dividers for strong, weak and non-enhancers  

In [15]:
type_string = re.compile('^\([0-9]+\)$')
raw_data_df.loc[raw_data_df.loc[:, 0].map(lambda x: type_string.match(x) != None), :].index

Int64Index([0, 512, 1160], dtype='int64')

In [16]:
raw_data_df.iloc[0, :]

0          (1)
1          100
2       strong
3    enhancers
Name: 0, dtype: object

In [17]:
raw_data_df.iloc[512, :]

0          (2)
1          100
2         weak
3    enhancers
Name: 512, dtype: object

In [18]:
raw_data_df.iloc[1160, :]

0              (3)
1              200
2    non-enhancers
3              NaN
Name: 1160, dtype: object

Extract the strong samples

In [19]:
strong_df = raw_data_df.iloc[1:512, 0]

Remove the page footers e.g. row 39 below

In [20]:
strong_df[:50]

1                                >Chr11_6627824_6628024
2     ATGCTGCCAGAAGGAAAAGGGGTGGAATTAATGAAACTGGAAGGTT...
3     TAAAGTATGGGGGCCAAAGTTGGCTATATGCTGGATATGAAGAGGG...
4     TTCTTGAGATAGAAGTCCAGGCCCTGAGGTGGCAGGCAGCCTGATA...
5                                                 CCATA
6                                >Chr11_9587224_9587424
7     GGCATTTTTTAACCTGTGTTTCATTTTCATCTGTGAAATGTGAATA...
8     GTTCCCGTAAAGATTAAATAAGTATGTAAAGCATCTGGGTCAGTGC...
9     AAAATATTCTCCCCTCTCCCAGCTCCTGCAAAGGCACCCCAGCTCT...
10                                                TCTCA
11                             >Chr11_65187024_65187224
12    GAAACCACAGAGCTGACCTGGCTTCAGAACAAGATGTGGGGCTCCA...
13    ACCCTGATTGGCAATAGCCGCAAGAAAGGGCTTGGAGACAGGGGCC...
14    GTAATCGCAGCACTTTGGGAGGTCAAGGCAGGTGGATCACTTGAGG...
15                                                CTGGC
16                             >Chr10_74014594_74014794
17    TTTGCATAGGGGCATTACCACTGGACTTGGGCTCAGAGCAAGTGTT...
18    CGTCTCAGAATCACAGAGCACTCCTCTACCTTTGACCTTCTG

In [21]:
page_string = re.compile('^\f')
strong_df.drop(strong_df[strong_df.map(lambda x: page_string.match(x) != None)].index, inplace=True)
strong_df[:50]

1                                >Chr11_6627824_6628024
2     ATGCTGCCAGAAGGAAAAGGGGTGGAATTAATGAAACTGGAAGGTT...
3     TAAAGTATGGGGGCCAAAGTTGGCTATATGCTGGATATGAAGAGGG...
4     TTCTTGAGATAGAAGTCCAGGCCCTGAGGTGGCAGGCAGCCTGATA...
5                                                 CCATA
6                                >Chr11_9587224_9587424
7     GGCATTTTTTAACCTGTGTTTCATTTTCATCTGTGAAATGTGAATA...
8     GTTCCCGTAAAGATTAAATAAGTATGTAAAGCATCTGGGTCAGTGC...
9     AAAATATTCTCCCCTCTCCCAGCTCCTGCAAAGGCACCCCAGCTCT...
10                                                TCTCA
11                             >Chr11_65187024_65187224
12    GAAACCACAGAGCTGACCTGGCTTCAGAACAAGATGTGGGGCTCCA...
13    ACCCTGATTGGCAATAGCCGCAAGAAAGGGCTTGGAGACAGGGGCC...
14    GTAATCGCAGCACTTTGGGAGGTCAAGGCAGGTGGATCACTTGAGG...
15                                                CTGGC
16                             >Chr10_74014594_74014794
17    TTTGCATAGGGGCATTACCACTGGACTTGGGCTCAGAGCAAGTGTT...
18    CGTCTCAGAATCACAGAGCACTCCTCTACCTTTGACCTTCTG

In [22]:
len(strong_df)

500

Combine the sub-sequences into 1

In [23]:
strong_df.values.reshape((-1, 5))[:5]

array([['>Chr11_6627824_6628024',
        'ATGCTGCCAGAAGGAAAAGGGGTGGAATTAATGAAACTGGAAGGTTGTGGTGCTGGTTTGAGGAG',
        'TAAAGTATGGGGGCCAAAGTTGGCTATATGCTGGATATGAAGAGGGGGTTAATTCCTTGCAGGTC',
        'TTCTTGAGATAGAAGTCCAGGCCCTGAGGTGGCAGGCAGCCTGATAGTGAACAGAACCCTTGTGC',
        'CCATA'],
       ['>Chr11_9587224_9587424',
        'GGCATTTTTTAACCTGTGTTTCATTTTCATCTGTGAAATGTGAATAAAAATAACTCTCTTACAGA',
        'GTTCCCGTAAAGATTAAATAAGTATGTAAAGCATCTGGGTCAGTGCCTATCATATAGTTGGCACC',
        'AAAATATTCTCCCCTCTCCCAGCTCCTGCAAAGGCACCCCAGCTCTTTGCAGCACTTAGGGCCTT',
        'TCTCA'],
       ['>Chr11_65187024_65187224',
        'GAAACCACAGAGCTGACCTGGCTTCAGAACAAGATGTGGGGCTCCAGGCACCCGGGAGACCAGTG',
        'ACCCTGATTGGCAATAGCCGCAAGAAAGGGCTTGGAGACAGGGGCCAGGCGCGGTGGCTACCCCT',
        'GTAATCGCAGCACTTTGGGAGGTCAAGGCAGGTGGATCACTTGAGGTCAGGAGTTGGAGACCAGC',
        'CTGGC'],
       ['>Chr10_74014594_74014794',
        'TTTGCATAGGGGCATTACCACTGGACTTGGGCTCAGAGCAAGTGTTTATGAGATAAATGAGGTGC',
        'CGTCTCAGAATCACAGAGCACTCCTCT

In [24]:
strong_df = pd.DataFrame(strong_df.values.reshape((-1, 5)))
strong_df.columns.values

array([0, 1, 2, 3, 4])

In [25]:
strong_df['sequence'] = strong_df[1] + strong_df[2] + strong_df[3] + strong_df[4]
strong_df['sequence'].head()

0    ATGCTGCCAGAAGGAAAAGGGGTGGAATTAATGAAACTGGAAGGTT...
1    GGCATTTTTTAACCTGTGTTTCATTTTCATCTGTGAAATGTGAATA...
2    GAAACCACAGAGCTGACCTGGCTTCAGAACAAGATGTGGGGCTCCA...
3    TTTGCATAGGGGCATTACCACTGGACTTGGGCTCAGAGCAAGTGTT...
4    CGGGAGGCGGGGGTTGCAGTGAGCCAAGATCACACCACTGCACTCC...
Name: sequence, dtype: object

In [26]:
strong_df.drop([1,2,3,4], axis=1, inplace=True)
strong_df.columns = ['id', 'sequence']
strong_df['id'] = strong_df['id'].map(lambda x: x[1:])
strong_df.head()

Unnamed: 0,id,sequence
0,Chr11_6627824_6628024,ATGCTGCCAGAAGGAAAAGGGGTGGAATTAATGAAACTGGAAGGTT...
1,Chr11_9587224_9587424,GGCATTTTTTAACCTGTGTTTCATTTTCATCTGTGAAATGTGAATA...
2,Chr11_65187024_65187224,GAAACCACAGAGCTGACCTGGCTTCAGAACAAGATGTGGGGCTCCA...
3,Chr10_74014594_74014794,TTTGCATAGGGGCATTACCACTGGACTTGGGCTCAGAGCAAGTGTT...
4,Chr10_105667810_105668010,CGGGAGGCGGGGGTTGCAGTGAGCCAAGATCACACCACTGCACTCC...


There are some additional rows in the weak, non-enhancer sections after each 'id'.

In [27]:
weak_df = raw_data_df.iloc[513:1160, 0]

In [28]:
weak_df[:10]

513                        >hg19_ct_UserTrack_3545_11005
514                      range=chr11:132363991-132364190
515                                              5'pad=0
516    TTATGGTCACCTTCGACCCCAGAAATAATGGTCTCTGTTGTCAGAT...
517    TTGTTGTCAATAATACTTTTGTAAATGCAAGAAGGACAATGTCAGT...
518    CTTCCCATACTGCTGTGAATCATGCATATGCCTCATCTGCTGCTAC...
519                                                CAATT
520                         >hg19_ct_UserTrack_3545_8529
521                      range=chr11:112506991-112507990
522                                              5'pad=0
Name: 0, dtype: object

Remove these rows.

In [29]:
keep_string = re.compile('^[>ACTG]')
weak_df.drop(weak_df[weak_df.map(lambda x: keep_string.match(x) is None)].index, inplace=True)

In [30]:
weak_df[:50]

513                        >hg19_ct_UserTrack_3545_11005
516    TTATGGTCACCTTCGACCCCAGAAATAATGGTCTCTGTTGTCAGAT...
517    TTGTTGTCAATAATACTTTTGTAAATGCAAGAAGGACAATGTCAGT...
518    CTTCCCATACTGCTGTGAATCATGCATATGCCTCATCTGCTGCTAC...
519                                                CAATT
520                         >hg19_ct_UserTrack_3545_8529
523    CATCCAGGCTTGGTCCTGGTTGTTCCTTGCTGTTATACCAGCCTGG...
524    GCTAAGAGTGGTTCCTTGCCTGACTGTTCACGCCATGGCTTCTTGT...
525    AATGCTGCAGAGCTGCAAATCTGTCACTGCCACCTCTGAGTAGATC...
526                                                CAGCA
527                         >hg19_ct_UserTrack_3545_7245
529    TTGTTTTTTTCTGTTTTGAGACGGAGTTTCGCTCTTGTTGCCCAGG...
530    TCTCGGCTCACTGCAACCTCCACCTCCCGGGTTCAAGCGATTCTCC...
531    AGTTGGGATTACAGGCATGTGCCACCAAGCCCGGCCAATTTTGTAT...
532                                                TCTCC
533                        >hg19_ct_UserTrack_3545_12669
537    ACTGTTAAATAGCAAAAATTATTGAGCTCAAACCATCTAACCAGGT...
538    GAGTGTTTTAGCACAGAAGAATTT

In [31]:
weak_df = pd.DataFrame(weak_df.values.reshape((-1, 5)))
weak_df['sequence'] = weak_df[1] + weak_df[2] + weak_df[3] + weak_df[4]
weak_df.drop([1,2,3,4], axis=1, inplace=True)
weak_df.columns = ['id', 'sequence']
weak_df['id'] = weak_df['id'].map(lambda x: x[1:])
weak_df.head()

Unnamed: 0,id,sequence
0,hg19_ct_UserTrack_3545_11005,TTATGGTCACCTTCGACCCCAGAAATAATGGTCTCTGTTGTCAGAT...
1,hg19_ct_UserTrack_3545_8529,CATCCAGGCTTGGTCCTGGTTGTTCCTTGCTGTTATACCAGCCTGG...
2,hg19_ct_UserTrack_3545_7245,TTGTTTTTTTCTGTTTTGAGACGGAGTTTCGCTCTTGTTGCCCAGG...
3,hg19_ct_UserTrack_3545_12669,ACTGTTAAATAGCAAAAATTATTGAGCTCAAACCATCTAACCAGGT...
4,hg19_ct_UserTrack_3545_5404,GAGAATTAAGTTTGTATTAAGTTGGAGACCAGGGCAGATGGAAAGA...


## Repeat the pre-processing for the 3 sections (strong, weak, non-enhancers)

In [32]:
def preprocess_indep_data():
    raw_data_df = pd.read_csv(os.path.join('data', 'independent.txt'), delim_whitespace=True, header=None, skiprows=6)
    type_string = re.compile('^\([0-9]+\)$')
    type_idx = raw_data_df.loc[raw_data_df.loc[:, 0].map(lambda x: type_string.match(x) != None), :].index.values.tolist()
    type_idx.append(len(raw_data_df))
    type_name = ['strong', 'weak', 'non-enhancer']
    all_df = []
    for i, start_idx in enumerate(type_idx[:-1]):
        # load raw data
        df = raw_data_df.iloc[start_idx+1:type_idx[i+1], 0]	
        # remove page strings
        page_string = re.compile('^\f')
        df.drop(df[df.map(lambda x: page_string.match(x) != None)].index, inplace=True)
        # remove id details strings i.e. all rows not starting with >, A, C, G or T
        keep_string = re.compile('^[>ACGT]')
        df.drop(df[df.map(lambda x: keep_string.match(x) is None)].index, inplace=True)
        # Combine sub-sequences into 1
        df = pd.DataFrame(df.values.reshape((-1, 5)))
        df['sequence'] = df[1] + df[2] + df[3] + df[4]
        df.drop([1,2,3,4], axis=1, inplace=True)
        # remove '>' from id
        df.rename(columns={0: 'id'}, inplace=True)
        df['id'] = df['id'].map(lambda x: x[1:])
        df['type'] = type_name[i]
        # rearrange columns
        df = df[['id', 'type', 'sequence']]
        all_df.append(df)
    # combine strong, weak, non-enhancer into 1 data frame
    indep_df = pd.concat(all_df)
    indep_df.to_csv(os.path.join('data', 'independent.csv'), index=False)
    return indep_df

In [33]:
indep_df = preprocess_indep_data()

In [34]:
indep_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 199
Data columns (total 3 columns):
id          400 non-null object
type        400 non-null object
sequence    400 non-null object
dtypes: object(3)
memory usage: 12.5+ KB


The number of samples are correct.

In [35]:
import collections
collections.Counter(indep_df['type'])

Counter({'strong': 100, 'weak': 100, 'non-enhancer': 200})

In [36]:
indep_df.iloc[:5, :2]

Unnamed: 0,id,type
0,Chr11_6627824_6628024,strong
1,Chr11_9587224_9587424,strong
2,Chr11_65187024_65187224,strong
3,Chr10_74014594_74014794,strong
4,Chr10_105667810_105668010,strong


In [37]:
indep_df.iloc[:5, 2]

0    ATGCTGCCAGAAGGAAAAGGGGTGGAATTAATGAAACTGGAAGGTT...
1    GGCATTTTTTAACCTGTGTTTCATTTTCATCTGTGAAATGTGAATA...
2    GAAACCACAGAGCTGACCTGGCTTCAGAACAAGATGTGGGGCTCCA...
3    TTTGCATAGGGGCATTACCACTGGACTTGGGCTCAGAGCAAGTGTT...
4    CGGGAGGCGGGGGTTGCAGTGAGCCAAGATCACACCACTGCACTCC...
Name: sequence, dtype: object