There are two main data folders, listed below with the number of files in each:

- `ncbi-genomes-2020-11-16`: 335

- `19fna`: 20

In [1]:
import os
import re
import gzip
import pandas as pd

In [2]:
# Show the working directory: the relative "./data/..." paths used in this notebook resolve against it
! pwd

/Users/yanhans/Documents/GitHub/Antibiotic_Resistance_Prediction


In [3]:
def get_all_file_name(rpath_raw):
    """
    List the names of all entries found directly inside a folder.
    Input:
        rpath_raw : path of the folder, e.g. relative to the working dir
                /Antibiotic_Resistance_Prediction
                e.x. "./data/part1/ncbi-genomes-2020-11-16/"
    Output:
        a string list with one name per entry, exactly as os.listdir reports
        them (every entry is included, not only genome files)
    """
    return os.listdir(rpath_raw)

In [4]:
# rpath_raw is the relative path to the folder holding the raw files.
# Keep the trailing "/": file names are appended to it by plain string
# concatenation when the files are read later on.
rpath_raw = "./data/part1/ncbi-genomes-2020-11-16/"
# part1 holds the name of every entry in that folder
part1 = get_all_file_name(rpath_raw)

In [5]:
# Number of entries found in the raw data folder
len(part1)

338

In [6]:
# Print every entry whose name does not start with "GCA_";
# these are presumably auxiliary files rather than genome files
for name in part1:
    if not re.match("^GCA_.*", name):
        print(name)

.DS_Store
md5checksums.txt
README.txt


In [7]:
# Keep only the entries whose name starts with "GCA_".
# Filtering (instead of calling .remove() on three fixed names) lets this cell
# be re-run safely and does not fail when an auxiliary file such as .DS_Store
# is absent on another machine.
part1 = [name for name in part1 if re.match("^GCA_.*", name)]

In [8]:
# Number of entries left after the auxiliary files were dropped
len(part1)

335

In [9]:
def create_table(path):
    """
    This function creates the sequences record table from a gzip-compressed file
    !!! Make sure that ">" presents at the first of each sequence
    !!! ">" is the separator for each sequence record; any text in front of
        the first ">" is discarded
    !!! Make sure that the sequence name is separate from the sequence
        content by a line break
    !!! Examine using the print_header.sh
    Input:
        path : path of a gzip-compressed, ASCII-encoded file containing many sequences
    Output:
        sequences_table : a 2d python list [[seqname1, "ATCG"], ....]
                          A record that has a name but no sequence lines
                          yields [seqname, ""].
    Raises:
        UnicodeDecodeError : if the file contains non-ASCII bytes
    """
    # Text mode decodes for us and turns "\r\n" line endings into "\n",
    # so no "\r" ends up inside names or sequences
    with gzip.open(path, 'rt', encoding='ascii') as my_file:
        my_file_contents = my_file.read()

    # split by > and drop the chunk in front of the first ">"
    # (an empty string for a well-formed file)
    records = my_file_contents.split(">")[1:]

    sequences_table = []

    for record in records:
        # partition never fails: a name without a following line break
        # simply gets an empty sequence instead of raising ValueError
        seqname, _, seq = record.partition("\n")

        # join the sequence lines into one string
        sequences_table.append([seqname, seq.replace("\n", "")])

    return sequences_table

In [12]:
def length_validation_check(sequences_table):
    """
    This function checks whether the processed sequence length matches the original record
    !!! Only names containing "NODE_<digits>_length_<digits>" declare a length;
        every other record is counted as "unprovided" and is not checked
    The findings are printed, nothing is returned.
    Input:
        sequences_table : output from create table function, [[seqname, sequence], ...]
    """
    # Raw string: "\d" must reach the regex engine as a digit class, not be
    # treated as a (deprecated) string escape
    declared_length = re.compile(r"NODE_\d*_length_(\d*)")

    unvalid_case = 0
    unprovided = 0
    for record in sequences_table:
        found = declared_length.search(record[0])
        # No match, or a match with no digits after "length_": nothing to compare.
        # Testing this explicitly replaces a bare except that hid every error.
        if found is None or not found.group(1):
            unprovided += 1
            continue
        if len(record[1]) != int(found.group(1)):
            unvalid_case += 1
            print("Invaild : ",record[0])
    if unvalid_case == 0:
        print("All records length valid! {} unprovided.".format(unprovided))
    else:
        print("Invalid found!")
    return

In [13]:
# 'table' is not created by any earlier cell, so build it here: parse the first
# file of the folder to try the validation helpers before the full run
table = create_table(rpath_raw + part1[0])
length_validation_check(table)

All records length valid! 0 unprovided.


In [14]:
def content_validation_check(sequences_table):
    """
    This function checks whether every sequence uses only the letters A, C, G, N and T
    Prints "Entries valid" when that holds; otherwise prints the unexpected
    characters, so that a bad file is not mistaken for a check that never ran.
    Nothing is returned.
    Input:
        sequences_table : output from create table function, [[seqname, sequence], ...]
    """
    allowed = {'A', 'C', 'G', 'N', 'T'}
    seen = set()
    for record in sequences_table:
        # update() consumes the string directly, no intermediate list or set needed
        seen.update(record[1])
    unexpected = seen - allowed
    if not unexpected:
        print("Entries valid")
    else:
        print("Entries invalid! Unexpected characters: {}".format(sorted(unexpected)))
    return

In [15]:
# Check that the parsed sequences contain only the letters A, C, G, N and T
content_validation_check(table)

Entries valid


In [16]:
def generate_filename(rpath_encode, filename1, filename2):
    """
    Build the csv file name used to store one encoded sequence
    Input:
        rpath_encode : folder prefix, e.g. './data/part2/encoded_data2/'
                       (no path separator is inserted, so it should end with "/")
        filename1 : name of the source file, e.g. "xx.fasta"
        filename2 : name of the sequence, e.g. "NODE_1xx"
    Output:
        the parts glued together as <rpath_encode><filename1>_<filename2>.csv
        e.g. "./data/part2/encoded_data2/xx.fasta_NODE_1xx.csv"
    """
    pieces = [rpath_encode, filename1, '_', filename2, '.csv']
    return "".join(pieces)

In [17]:
# rpath_encode is the relative path to the encoded data folder
rpath_encode = './data/part1/encoded_data1/'
# Create the folder up front: writing a csv into a missing folder fails,
# and exist_ok=True keeps this cell safe to re-run
os.makedirs(rpath_encode, exist_ok=True)

In [18]:
# Re-check the working directory: the relative rpath_encode resolves against it
! pwd

/Users/yanhans/Documents/GitHub/Antibiotic_Resistance_Prediction


In [19]:
def create_encoded_files(sequences_table, rpath_encode, filename1):
    """
    Encode every sequence of one source file and store each as its own csv file
    Input:
        sequences_table: 2d python list [[filename2, sequence],...]
        rpath_encode: folder on the disk that receives the csv files
        filename1: name of the source file, used as prefix of every csv name
    """
    for entry in sequences_table:
        # destination first: <rpath_encode><filename1>_<sequence name>.csv
        target = generate_filename(rpath_encode, filename1, entry[0])
        # then encode the sequence and write it out in one go
        save_file(encoding_sequence(entry[1]), target)

In [20]:
def encoding_sequence(sequence):
    """
    Turn a nucleotide string into its one hot encoding matrix
    Input:
        sequence: A length L string containing "A", "T", "C", "G", possible "N"
    Output:
        A two dimensional list, (L, 4), columns in the order A, T, C, G.
        Every character other than upper-case A/T/C/G (e.g. "N") becomes an
        all-zero row. Rows for the same character are the same list object,
        so copy a row before modifying it.
    """
    # One hot encoding, one row template per base
    templates = {
        'A': [1, 0, 0, 0],
        'T': [0, 1, 0, 0],
        'C': [0, 0, 1, 0],
        'G': [0, 0, 0, 1],
    }
    # anything else falls back to the all-zero row
    blank = [0, 0, 0, 0]

    return [templates.get(nucleotide, blank) for nucleotide in sequence]

In [21]:
def save_file(matrix, filename):
    """
    Write an encoded sequence to a csv file on the disk, with the header A,T,C,G
    Input:
        matrix: python 2D list from the encoding_sequence function
                (one row of four values per nucleotide)
        filename:
                destination of the csv file, can be a relative path
                (combines the original file name and sequence name)
    """
    column_names = ['A', 'T', 'C', 'G']
    # index=False keeps the row numbers out of the file
    pd.DataFrame(matrix, columns=column_names).to_csv(filename, index=False)

In [22]:
# Process every file listed in part1: parse it, run both sanity checks,
# then write one encoded csv per sequence record under rpath_encode
for filename1 in part1:
    # rpath_raw ends with "/", so plain concatenation yields the file path
    path = rpath_raw + filename1
    sequences_table = create_table(path)
    # The checks only print their findings; encoding proceeds whatever they report
    length_validation_check(sequences_table)
    content_validation_check(sequences_table)
    create_encoded_files(sequences_table, rpath_encode, filename1)

All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 191 unprovided.
Entries valid
All records length valid! 1 unprovided.
Entries valid
All records length valid! 7 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 4 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 130 unprovided.
Entries valid
All records length valid

All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 

All records length valid! 4 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 6 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 6 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 2 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 0 unprovided.
Entries valid
All records length valid! 14 unprovided.
Entries valid
All records length valid! 5 unprovided.
Entries valid
All records length valid! 0