# KE5006 Applied Research

### Identifying enhancers and their strength with deep neural networks

# Data Pre-processing

## Load libraries

In [2]:
import pandas as pd
import os

## Load data

In [33]:
# Split each line on whitespace; the preview of this frame shows a single
# column (0). sep=r'\s+' is the supported equivalent of the deprecated
# delim_whitespace=True option.
raw_data_df = pd.read_csv(os.path.join('data', 'enhancer.txt'), sep=r'\s+', header=None)

In [34]:
# Preview the first five raw rows.
raw_data_df.head(n=5)

Unnamed: 0,0
0,>CHRX_48897056_48897256
1,CACAATGTAGAAGCAGAGACACAGGAACCAGGCTTGGTGATGGCTC...
2,>CHR12_6444339_6444539
3,GCCCTCACATTCCCTGGCCCATCCCCTCCACCTCAAAATTTACAAA...
4,>CHR12_6444939_6445139


## Pre-processing

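The raw file alternates between header rows (starting with `>`) and sequence rows. Extract the id attribute from the even-numbered rows, dropping the leading `>`.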

In [26]:
# Header rows sit at the even positions (0, 2, 4, ...) of the raw frame.
# Chaining reset_index renumbers them from 0 without mutating an iloc
# selection in place.
# NOTE: the name `id` shadows the Python builtin; it is kept because the
# cell that combines the attributes reads this variable.
id = raw_data_df.iloc[::2, :].reset_index(drop=True)
# Drop the first character (the '>' marker) of every header. The .str accessor
# is used instead of DataFrame.applymap, which is deprecated since pandas 2.1.
id = id.apply(lambda column: column.str[1:])
id.head()


Unnamed: 0,0
0,CHRX_48897056_48897256
1,CHR12_6444339_6444539
2,CHR12_6444939_6445139
3,CHR12_6445139_6445339
4,CHR12_6445339_6445539


Extract the sequence attribute from the odd-numbered rows.

In [22]:
# Sequence rows are the odd positions (1, 3, 5, ...) of the raw frame;
# renumber them from 0 after selecting them.
sequence = raw_data_df.iloc[1::2, :].reset_index(drop=True)
sequence.head(n=5)

Unnamed: 0,0
0,CACAATGTAGAAGCAGAGACACAGGAACCAGGCTTGGTGATGGCTC...
1,GCCCTCACATTCCCTGGCCCATCCCCTCCACCTCAAAATTTACAAA...
2,GAGCAGGAGGCCAGTCACCCTGAGTCAGCCACGGGGAGACGCTGCA...
3,CCTCTGCTGAGAACAGGACTGGGGCTTCCAGGGCAACAGGAAGGGT...
4,ACAGCCTTAAAGGGAGCTTTTCAGGGACCTCTGGCCAGTGGGGGAT...


Combine the attributes into a data frame.

In [27]:
# Put the ids and the sequences side by side, then give the two resulting
# columns their final names.
enhancer_df = pd.concat((id, sequence), axis='columns')
enhancer_df.columns = ['id', 'sequence']
enhancer_df.head(n=5)

Unnamed: 0,id,sequence
0,CHRX_48897056_48897256,CACAATGTAGAAGCAGAGACACAGGAACCAGGCTTGGTGATGGCTC...
1,CHR12_6444339_6444539,GCCCTCACATTCCCTGGCCCATCCCCTCCACCTCAAAATTTACAAA...
2,CHR12_6444939_6445139,GAGCAGGAGGCCAGTCACCCTGAGTCAGCCACGGGGAGACGCTGCA...
3,CHR12_6445139_6445339,CCTCTGCTGAGAACAGGACTGGGGCTTCCAGGGCAACAGGAAGGGT...
4,CHR12_6445339_6445539,ACAGCCTTAAAGGGAGCTTTTCAGGGACCTCTGGCCAGTGGGGGAT...


In [28]:
# Length of the sequence string in the first record.
len(enhancer_df['sequence'].iloc[0])

200

Check that every sequence is exactly 200 characters long, like the first one.

In [32]:
# True only when every sequence string has exactly 200 characters.
(enhancer_df['sequence'].apply(len) == 200).all()

True

Save preprocessed data to file.

In [35]:
# Write the id/sequence table as CSV, leaving out the row index.
enhancer_df.to_csv(path_or_buf=os.path.join('data', 'enhancer.csv'), index=False)

## Repeat the pre-processing for the other files

In [38]:
def preprocess_data(filenames, data_dir='data'):
    """Convert header/sequence text files into ``id,sequence`` CSV files.

    Each input file must alternate between a header line that starts with
    ``>`` and a line holding the sequence for that header. For every input
    file a CSV with the columns ``id`` (header without the leading ``>``) and
    ``sequence`` is written to the same directory, keeping the base name and
    replacing the final extension with ``.csv``.

    Parameters
    ----------
    filenames : iterable of str
        Names of the files to convert, relative to ``data_dir``.
    data_dir : str, default 'data'
        Directory that holds the input files and receives the CSV files.

    Raises
    ------
    ValueError
        If a file does not parse to a single column, has a header without a
        matching sequence line, or has a header that does not start with
        ``>``.
    """
    for a_filename in filenames:
        raw_data_df = pd.read_csv(
            os.path.join(data_dir, a_filename),
            sep=r'\s+',  # supported equivalent of the deprecated delim_whitespace=True
            header=None,
        )
        if raw_data_df.shape[1] != 1:
            raise ValueError(
                '%s: expected one token per line, found %d columns'
                % (a_filename, raw_data_df.shape[1])
            )

        # Even positions are headers, odd positions are sequences; both are
        # renumbered from 0 so that row k of each belongs to the same record.
        headers = raw_data_df.iloc[::2, 0].reset_index(drop=True)
        sequences = raw_data_df.iloc[1::2, 0].reset_index(drop=True)

        # An odd number of lines would otherwise leave the last id paired
        # with a missing sequence in the output.
        if len(headers) != len(sequences):
            raise ValueError(
                '%s: every header line must be followed by a sequence line'
                % a_filename
            )
        if not headers.str.startswith('>').all():
            raise ValueError(
                "%s: every header line must start with '>'" % a_filename
            )

        preprocessed_df = pd.DataFrame({
            'id': headers.str[1:],  # drop the leading '>'
            'sequence': sequences,
        })

        # splitext removes only the final extension, so a name such as
        # 'a.b.txt' becomes 'a.b.csv' rather than 'a.csv'.
        newfilename = os.path.splitext(a_filename)[0] + '.csv'
        preprocessed_df.to_csv(os.path.join(data_dir, newfilename), index=False)

In [40]:
# Convert the remaining raw files to CSV.
preprocess_data(
    filenames=[
        'non_enhancer.txt',
        'strong.txt',
        'weak.txt',
    ]
)