# CleanEx: Data Wrangling
In this notebook, I examine each promoter sequence I downloaded from the Eukaryotic Promoter Database (EPD): I check the length of each sequence and the alphabet used, and then save a FASTA file of 'clean' promoter sequences, i.e. those which only use the alphabet 'ATCG'.

In [1]:
# load required packages
import pandas as pd
import numpy as np
from Bio import SeqIO

# import FASTA file and print header, sequence, and length of sequence
for seq_record in SeqIO.parse("hg19_lnENr.fa", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))


FP011394
Seq('GATATATAAATATCCCTTTCTGAGACTGACGGGAACTTGGACAGGGCCGAAGAG...CAC', SingleLetterAlphabet())
1002
FP012411
Seq('TCTCAGTTTGAGCATGCATCAGAATCGTCATAAAGTGTGTTAAAACAGATTGCT...TAG', SingleLetterAlphabet())
1002
FP012826
Seq('ACCTGACTAATTTTTATTTTATTTTATTTTTGTAGAGACAGAGTCTCACTGTGT...TGA', SingleLetterAlphabet())
1002
FP016490
Seq('GGCCGCCGCCTTGCAGCCCTCTCCTCACCTCATCTCCGGGCGGGTCCGCGCTGC...TGA', SingleLetterAlphabet())
1002
FP014026
Seq('ATGATACACAAAATCTGCCCTCAGACTTACTGAAGCACACAGTCACAGAGCCAG...CAG', SingleLetterAlphabet())
1002
FP010807
Seq('GCCGGGGGCGCCGCCCACGGCCTCCTGGGCTCGGCGCGCGGGGCCGGGCTGCGG...CGG', SingleLetterAlphabet())
1002
FP013092
Seq('AGATGGCACCACTGCACTCCAGCTTGGCGACAGAGTAACAGTTTATCTCAAAAT...GCT', SingleLetterAlphabet())
1002
FP012627
Seq('CTCGGGCGAATGACACGCCGGGTCCCCAGACGGGGGCTCCGGCGCCGGGCGGGC...CAC', SingleLetterAlphabet())
1002
FP016851
Seq('CCTGGGATTCTGGGGCATCCGCGTGCACTTGAGATACAAGCTGTCCCACTGGCC...CAG', SingleLetterAlphabet())
1002
FP009884
Seq('GTTCAAGACCGGCCTGGCCAAGATGGTGAAAC

In [6]:
# compute alphabet used across all sequences
from custom import compute_alphabet

seqrecords = [s for s in SeqIO.parse("hg19_lnENr.fa", "fasta")]

print compute_alphabet(seqrecords)

set(['A', 'C', 'G', 'N', 'T'])


In [8]:
# drop any sequences with the letter 'N'
s = []
for seq_record in SeqIO.parse("hg19_lnENr.fa", "fasta"):
    a = set(seq_record.seq)
    s.append(a)

s2 = pd.DataFrame(s)

# add rownames as seq_record.id
nm = []
for seq_record in SeqIO.parse("hg19_lnENr.fa", "fasta"):
    id = seq_record.description.split(" ")[0]
    nm.append(id)
    
s2.index = pd.Series(nm)

# keep rows with 'None' in 5th column ('N')
s3 = s2[pd.isnull(s2[(4)])]
keep = s3.index
print(len(keep))

17780


In [9]:
# export FASTA file of only 17780 clean promoter sequences
with open("promoter_seq_clean.fa", "w") as f:
    for seq_record in SeqIO.parse("hg19_lnENr.fa", "fasta"):
        if seq_record.id in keep:
            SeqIO.write([seq_record], f, "fasta")


In [10]:
# import to check
seqrecords = [s for s in SeqIO.parse("promoter_seq_clean.fa", "fasta")]
compute_alphabet(seqrecords)


{'A', 'C', 'G', 'T'}