# Preprocessing Pipeline Prototyping for FASTA Strains
#### Researcher: VMLCODE

## Imports

In [None]:
import pandas as pd
import numpy as np
import os

from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
from itertools import product

In [2]:
def extract_sequences_from_directory(directory):
    """Collect every non-empty FASTA record from the files in a directory.

    Args:
        directory: Path of a directory whose regular files are parsed as
            FASTA with ``SeqIO.parse``.

    Returns:
        A list of ``[sequence, record_id]`` pairs, where ``sequence`` is the
        record's sequence as a ``str``. Files are visited in sorted filename
        order; records keep their order within each file.
    """
    sequences = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the resulting dataset reproducible across machines.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        # Skip sub-directories (e.g. notebook checkpoint folders): they are
        # not FASTA files and cannot be opened for parsing.
        if not os.path.isfile(filepath):
            continue
        for record in SeqIO.parse(filepath, "fasta"):
            seq = str(record.seq)
            if seq:
                sequences.append([seq, record.id])
    return sequences

In [3]:
def populate_amr_fasta_dataset(dir_path, output_path='../Examples/amr_fasta_dataset.csv'):
    """Build the labelled sequence table from FASTA folders and save it as CSV.

    ``dir_path`` is expected to contain sub-directories named ``0`` (not
    resistant) and/or ``1`` (resistant). Every record returned by
    ``extract_sequences_from_directory`` for a sub-directory becomes one row
    labelled with that sub-directory's number. Other entries of ``dir_path``
    are ignored.

    Args:
        dir_path: Directory holding the ``0`` and ``1`` label sub-directories.
        output_path: Destination CSV file. Defaults to the location used by
            the rest of this notebook.

    Returns:
        None. The table is written to ``output_path`` without the index.
    """
    columns = ['sample_id', 'sequence', 'label (not resistant[0]/resistant[1] to Trimethoprim)']
    rows = []
    # Visit the labels in a fixed order so the CSV row order does not depend
    # on the order in which the file system lists the sub-directories.
    for label in (0, 1):
        label_dir = os.path.join(dir_path, str(label))
        if not os.path.isdir(label_dir):
            continue
        for sequence, sample_id in extract_sequences_from_directory(label_dir):
            rows.append([sample_id, sequence, label])
    # Build the frame once instead of growing it one row at a time.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(output_path, index=False)
# Generate the dataset CSV from the FASTA folders under ../Examples.
populate_amr_fasta_dataset("../Examples")

In [4]:
def get_amr_fasta_dataset(csv_path='../Examples/amr_fasta_dataset.csv'):
    """Load the labelled sequence table from CSV.

    Args:
        csv_path: CSV file to read. Defaults to the file written by
            ``populate_amr_fasta_dataset``.

    Returns:
        A DataFrame with the columns stored in the CSV; the label column is
        cast to ``int``.
    """
    label_column = 'label (not resistant[0]/resistant[1] to Trimethoprim)'
    df = pd.read_csv(csv_path)
    # 'sequence' and 'sample_id' are used as read; only the label needs an
    # explicit integer dtype.
    df[label_column] = df[label_column].astype(int)
    return df

# Load the saved dataset; as the cell's last expression its value is shown below.
get_amr_fasta_dataset()

Unnamed: 0,sample_id,sequence,label (not resistant[0]/resistant[1] to Trimethoprim)
0,CP133856.1,TAACTCCCTATAATGCGCCACCACTGACACGGAACAACGGCAAACA...,0
1,CP173596.1,TTTCGCCCGGATGGATGTCCTGTACGGATAACTGGAACATAGTTCT...,0
2,CP173597.1,TTTTACTGCGCCGGCTGACGCGGCGCGGCAGGAACGCTGCCTGTGG...,0
3,CP173595.1,ACCTCCCGGAGTTTGTTCGGGACCATCCACCGCTCCATCTCACGGA...,0
4,CP173593.1,TCCGGATGATCGGATTCGACAGTGAAAGTTTTATCTGTCGCATTTG...,0
5,CP173594.1,ACTTCTTTGACGGCTTTTGGCGTTGCAGCGCGCGTTTCAGAAGTAC...,0
6,CP119740.1,GTGTCACTTTCGCTTTGGCAGCAGTGTCTTGCCCGATTGCAGGATG...,1
7,CP119741.1,TGGTTTATGGGGCCCTCTCCCTGATTCGCATGTCGTGCTTTTTCTT...,1
8,CP119742.1,GGGGCTGGAATGTCAACGTATCCCGGGTGATGGATAACGCCACACA...,1
9,CP119743.1,TACACCAATTAGGTAAAGTTATTTTTAAGTATCGAGGCAACTTTCA...,1


In [5]:
def get_kmers(sequence, k):
    """Return every overlapping substring of length ``k`` in ``sequence``.

    The windows are produced left to right with a step of one, so a sequence
    of length ``n`` yields ``n - k + 1`` k-mers (none when ``k > n``).
    """
    window_count = len(sequence) - k + 1
    kmers = []
    for start in range(window_count):
        kmers.append(sequence[start:start + k])
    return kmers

In [24]:
def get_all_kmers_dict(k=4):
    """Return a zero-initialised counter for every DNA k-mer of length ``k``.

    Args:
        k: Length of the k-mers. Defaults to 4, which yields the 256 keys
            ``'AAAA'`` .. ``'TTTT'``.

    Returns:
        A dict mapping each of the ``4 ** k`` strings over ``A``, ``C``,
        ``G``, ``T`` (in ``itertools.product`` order) to ``0``.
    """
    bases = ['A', 'C', 'G', 'T']
    return dict.fromkeys((''.join(p) for p in product(bases, repeat=k)), 0)

In [23]:
def get_kmer_frequencies(sequence, k):
    """Count how often each DNA k-mer of length ``k`` occurs in ``sequence``.

    Args:
        sequence: Nucleotide string to scan with overlapping windows.
        k: Length of the k-mers; must be at least 1.

    Returns:
        A dict with one entry for each of the ``4 ** k`` strings over ``A``,
        ``C``, ``G``, ``T`` (in ``itertools.product`` order), mapped to its
        number of occurrences. Windows containing any other character
        (e.g. ``N`` or lowercase letters) are not counted.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    # The set of keys has to match the requested k; a fixed 4-mer table would
    # leave every count at zero for any other k.
    kmer_dict = dict.fromkeys((''.join(p) for p in product('ACGT', repeat=k)), 0)
    # Slide the window directly over the sequence rather than first building
    # a list of all k-mers, which is costly for genome-sized inputs.
    for start in range(len(sequence) - k + 1):
        kmer = sequence[start:start + k]
        if kmer in kmer_dict:
            kmer_dict[kmer] += 1
    return kmer_dict

### Dataset 1 (Name: KMer Frequency)

##### Example:
- **AAAA**: 3474 (Frequency of every KMer)  
- **AAAC**: 5938  
- **....**: ....  

##### Additional Information:
- **GC_content**: Fraction of guanine (G) and cytosine (C) bases in the sequence (a value between 0 and 1, as returned by `gc_fraction`)  
- **Length**: Length of the chain  
- **AT_ratio**: Proportion of adenine and thymine  
- **Label**: Metadata for forensic analysis of the model  

In [None]:
def get_gc_content(sequence):
    """Return the GC content of ``sequence`` as computed by ``gc_fraction``.

    Thin wrapper around ``Bio.SeqUtils.gc_fraction``; the value is passed
    through unchanged (per the Biopython docs a float between 0 and 1 —
    confirm against the installed version).
    """
    return gc_fraction(sequence)

In [None]:
def create_amr_fasta_dataset():
    """Build the k-mer frequency feature table for the AMR dataset.

    Loads the table returned by ``get_amr_fasta_dataset`` and, for every row,
    replaces the raw ``sequence`` column with its 4-mer counts (from
    ``get_kmer_frequencies``) plus a ``gc_content`` column (from
    ``get_gc_content``). All other columns of the row are kept.

    Returns:
        A DataFrame with one row per input row: the original non-sequence
        columns, one column per 4-mer, and ``gc_content``.
    """
    df = get_amr_fasta_dataset()

    # Drop the sequence column once, outside the loop, instead of copying the
    # whole frame for every row.
    metadata_rows = df.drop(columns=['sequence']).to_dict('records')

    new_rows = []
    for metadata, sequence in zip(metadata_rows, df['sequence']):
        # Start from the row's metadata, then append the derived features.
        row_data = dict(metadata)
        row_data.update(get_kmer_frequencies(sequence, 4))
        row_data['gc_content'] = get_gc_content(sequence)
        new_rows.append(row_data)

    return pd.DataFrame(new_rows)

# Build the k-mer feature table; as the cell's last expression its value is shown below.
create_amr_fasta_dataset()

Unnamed: 0,sample_id,label (not resistant[0]/resistant[1] to Trimethoprim),AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,AACT,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
0,CP133856.1,0,39781,27787,25540,29277,24288,22226,26645,17618,...,21262,20831,21556,29443,14657,18574,25439,30797,26786,40314
1,CP173596.1,0,39464,27783,25162,28696,24123,22306,26742,17481,...,21372,20663,21275,29594,14551,18521,24878,30790,26679,40089
2,CP173597.1,0,905,519,522,584,461,371,429,395,...,372,415,472,487,250,314,421,512,480,727
3,CP173595.1,0,687,431,352,462,479,289,356,269,...,317,413,261,358,177,297,379,701,370,837
4,CP173593.1,0,621,418,421,490,425,366,435,340,...,414,531,457,639,308,495,448,686,607,985
5,CP173594.1,0,410,299,237,323,338,281,228,217,...,162,303,198,271,123,231,371,362,300,556
6,CP119740.1,1,39325,27425,25492,28477,24167,22237,26531,17743,...,20889,20833,21518,29558,14951,18701,24831,30815,26594,39933
7,CP119741.1,1,2466,1258,1628,1665,1099,960,951,949,...,839,1173,1250,1071,724,928,1512,1551,1289,2235
8,CP119742.1,1,959,537,471,534,520,371,431,325,...,308,484,309,342,145,372,449,733,384,997
9,CP119743.1,1,941,513,573,676,560,411,484,376,...,386,456,563,567,361,368,491,566,594,771
