There are two main data folders, listed below as `folder name: number of entries`.

- ncbi-genomes-2020-11-16:335

- 19fna:20

In [1]:
import os

import pandas as pd

In [121]:
! pwd

/Users/yanhans/Documents/GitHub/Antibiotic_Resistance_Prediction


In [122]:
def get_all_file_name(rpath_raw):
    """
    List the names of all entries located directly inside a folder.

    Input:
        rpath_raw : path of the folder to list, e.g. "./data/part2/19fna";
                a relative path is resolved against the current working dir
    Output:
        a list of strings, one per entry, exactly as reported by
        os.listdir (hidden entries such as ".DS_Store" are included)
    """
    return os.listdir(rpath_raw)

In [126]:
# rpath_raw is the relative path to the folder holding the raw FASTA files
rpath_raw = "./data/part2/19fna/"
part2 = get_all_file_name(rpath_raw)
print(part2)

['CMG27-3.fasta', 'FS46-2.fasta', 'FS55-1.fasta', 'MM114-2.fasta', '.DS_Store', '2019XSD9.fasta', '2019XSD11.fasta', 'CMG11-2.fasta', 'MMS32-1.fasta', '2019XSD8.fasta', '2019XSD10.fasta', 'FS35-1.fasta', '2019XSD6.fasta', 'MM112-2.fasta', 'MM111-1.fasta', '2019XSC8.fasta', 'MMS36-1-1.fasta', 'W1-1.fasta', 'FS38-2.fasta', '2019XSC9.fasta']


In [158]:
# Drop the macOS Finder metadata file. Filtering (instead of list.remove)
# keeps this cell safe to re-run and safe on systems where the file is absent.
part2 = [name for name in part2 if name != '.DS_Store']

In [159]:
part2

['CMG27-3.fasta',
 'FS46-2.fasta',
 'FS55-1.fasta',
 'MM114-2.fasta',
 '2019XSD9.fasta',
 '2019XSD11.fasta',
 'CMG11-2.fasta',
 'MMS32-1.fasta',
 '2019XSD8.fasta',
 '2019XSD10.fasta',
 'FS35-1.fasta',
 '2019XSD6.fasta',
 'MM112-2.fasta',
 'MM111-1.fasta',
 '2019XSC8.fasta',
 'MMS36-1-1.fasta',
 'W1-1.fasta',
 'FS38-2.fasta',
 '2019XSC9.fasta']

In [154]:
def create_table(path):
    """
    Create the sequence record table for one multi-sequence file.

    !!! Make sure that ">" is present at the start of each sequence record
    !!! ">" is the separator between sequence records
    !!! Make sure that the sequence name is separated from the sequence
        content by a line break
    !!! Examine using the print_header.sh
    Input:
        path : path of a file containing many sequences
    Output:
        sequences_table : a 2d python list [[seqname1, "ATCG"], ....]
                a record that has a name but no sequence lines is kept
                with an empty sequence string
    """
    # The context manager closes the file handle even if reading fails
    with open(path) as fasta_file:
        contents = fasta_file.read()

    # Whatever precedes the first ">" is not a record (normally an empty string)
    records = contents.split(">")[1:]

    sequences_table = []
    for record in records:
        # partition never raises: without a line break the whole record is
        # the name and the sequence is empty
        seqname, _, seq = record.partition("\n")

        # join the wrapped sequence lines into one string
        sequences_table.append([seqname, seq.replace("\n", "")])

    return sequences_table

In [131]:
def length_validation_check(sequences_table):
    """
    Compare the length of every parsed sequence with the length written in
    its name and report the result on stdout.

    !!! The fourth "_"-separated field of each sequence name must be the
        integer sequence length
    Input:
        sequences_table : output from the create_table function
    Output:
        None; prints one line per mismatching record, or a single
        confirmation line when every record matches
    """
    all_match = True
    for entry in sequences_table:
        actual_length = len(entry[1])
        # the declared length is read from the record name
        declared_length = int(entry[0].split('_')[3])
        if actual_length == declared_length:
            continue
        all_match = False
        print("Invaild : ", entry[0])
    if all_match:
        print("All records valid!")

In [132]:
# NOTE(review): as far as this notebook shows, sequences_table is only
# assigned inside the processing loop at the end, so this cell fails on a
# fresh kernel when the cells are run top to bottom.
length_validation_check(sequences_table)

All records valid!


In [133]:
def content_validation_check(sequences_table):
    """
    Collect every distinct character that occurs in the sequences.

    Input:
        sequences_table : 2d python list [[seqname, sequence], ...]
    Output:
        a set with the characters found in the second element of all
        records (an empty set for an empty table)
    """
    alphabet = set()
    for entry in sequences_table:
        # update() adds the characters of this sequence in place
        alphabet.update(entry[1])
    return alphabet

In [134]:
# NOTE(review): as far as this notebook shows, sequences_table is only
# assigned inside the processing loop at the end, so this cell fails on a
# fresh kernel when the cells are run top to bottom.
content_validation_check(sequences_table)

{'A', 'C', 'G', 'N', 'T'}

In [147]:
def generate_filename(rpath_encode, filename1, filename2):
    """
    Generate the csv file path for one sequence inside a given folder.

    Input:
        rpath_encode : output folder, e.g. './data/part2/encoded_data2/'
                (the trailing separator is optional)
        filename1 : "xx.fasta"
        filename2 : "NODE_1xx"
    Output:
        name : folder joined with "<filename1>_<filename2>.csv",
               e.g. "./data/part2/encoded_data2/xx.fasta_NODE_1xx.csv"
    """
    # os.path.join inserts a separator only when the folder lacks one, so a
    # folder given without a trailing "/" no longer fuses with the file name.
    # NOTE: an absolute filename1 would make join discard rpath_encode.
    name = os.path.join(rpath_encode, filename1 + '_' + filename2 + '.csv')
    return name

In [148]:
# rpath_encode is the relative path to the encoded data folder; the encoded
# files are written there as "<fasta file name>_<sequence name>.csv"
rpath_encode = './data/part2/encoded_data2/'

In [149]:
def create_encoded_files(sequences_table, rpath_encode, filename1):
    """
    Encode every sequence of one source file and store each encoded matrix
    as its own file under rpath_encode.

    Input:
        sequences_table: 2d python list [[filename2, sequence],...]
        rpath_encode: output folder on the disk
        filename1: name of the source file the records were read from
    Output:
        None; one file is written per record
    """
    for entry in sequences_table:
        # target path built from the source file name and the sequence name
        target = generate_filename(rpath_encode, filename1, entry[0])
        # encode the sequence and write it straight to the target path
        save_file(encoding_sequence(entry[1]), target)

In [135]:
def encoding_sequence(sequence):
    """
    Return the one hot encoding matrix of the given sequence.

    Input:
        sequence: A length L string containing "A", "T", "C", "G", possible "N"
    Output:
        encoding_matrix: A two dimensional list, (L, 4), columns ordered
                A, T, C, G. Any character other than upper-case "A", "T",
                "C", "G" (e.g. "N", or lower-case letters) becomes an
                all-zero row. Every row is an independent list.
    """
    # Tuples serve as immutable templates; list() creates a fresh row for each
    # position so rows never alias each other (mutating one row must not
    # change every other row holding the same nucleotide).
    one_hot = {
        'A': (1, 0, 0, 0),
        'T': (0, 1, 0, 0),
        'C': (0, 0, 1, 0),
        'G': (0, 0, 0, 1),
    }
    unknown = (0, 0, 0, 0)

    return [list(one_hot.get(nucleotide, unknown)) for nucleotide in sequence]

In [94]:
def save_file(matrix, filename):
    """
    Save the encoded matrix to a csv file on the disk.

    Input:
        matrix: python 2D list from the encoding_sequence function
                (four values per row)
        filename:
                combines the original file name and the sequence name
                can be a relative path; a missing parent folder is created
    Output:
        None; the csv has the header A,T,C,G and no index column
    """
    # Only path-like targets have a folder to prepare; anything else (e.g. an
    # open buffer) is passed to to_csv untouched.
    if isinstance(filename, (str, os.PathLike)):
        parent = os.path.dirname(filename)
        if parent:
            # to_csv does not create folders, so make sure the target exists
            os.makedirs(parent, exist_ok=True)

    df = pd.DataFrame(matrix, columns=['A', 'T', 'C', 'G'])
    df.to_csv(filename, index=False)
    return

In [160]:
# Process every file listed in part2: parse it into a record table, run both
# validation checks, then write one encoded csv per sequence record.
for filename1 in part2:
    # rpath_raw ends with "/", so plain concatenation yields the file path
    path = rpath_raw + filename1
    sequences_table = create_table(path)
    # prints either the mismatching record names or a confirmation line
    length_validation_check(sequences_table)
    # shows which characters occur in this file's sequences
    print(content_validation_check(sequences_table))
    create_encoded_files(sequences_table, rpath_encode, filename1)

All records valid!
{'A', 'N', 'T', 'G', 'C'}
All records valid!
{'A', 'N', 'G', 'T', 'C'}
All records valid!
{'A', 'N', 'G', 'T', 'C'}
All records valid!
{'A', 'N', 'T', 'G', 'C'}
All records valid!
{'T', 'G', 'A', 'C'}
All records valid!
{'A', 'T', 'G', 'C'}
All records valid!
{'A', 'N', 'T', 'G', 'C'}
All records valid!
{'A', 'N', 'T', 'G', 'C'}
All records valid!
{'A', 'T', 'G', 'C'}
All records valid!
{'A', 'T', 'G', 'C'}
All records valid!
{'G', 'T', 'A', 'C', 'N'}
All records valid!
{'T', 'G', 'A', 'C'}
All records valid!
{'A', 'T', 'G', 'C'}
All records valid!
{'T', 'G', 'A', 'C', 'N'}
All records valid!
{'A', 'G', 'T', 'C'}
All records valid!
{'T', 'G', 'A', 'C', 'N'}
All records valid!
{'A', 'N', 'G', 'T', 'C'}
All records valid!
{'G', 'T', 'A', 'C', 'N'}
All records valid!
{'A', 'T', 'G', 'C'}
