# Preprocessing Pipeline Prototyping for FASTA Strains
#### Researcher: VMLCODE

## Import Reviews

In [53]:
import pandas as pd
import numpy as np
import os

from Bio import SeqIO
from Bio.SeqUtils import gc_fraction, molecular_weight
from itertools import product

In [2]:
def extract_sequences_from_directory(directory):
    """Collect every non-empty FASTA record found in the files of ``directory``.

    Entries are visited in sorted name order so the result is reproducible
    across runs and platforms (``os.listdir`` order is arbitrary). Entries
    that are not regular files (e.g. sub-directories) are skipped instead of
    being handed to the FASTA parser, which cannot open them.

    Args:
        directory: Path of the folder holding the FASTA files.

    Returns:
        A list of ``[sequence, record_id]`` pairs, where ``sequence`` is the
        record's sequence as a plain string.
    """
    sequences = []
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            continue
        for record in SeqIO.parse(filepath, "fasta"):
            seq = str(record.seq)
            if seq:
                sequences.append([seq, record.id])
    return sequences

In [3]:
def populate_amr_fasta_dataset(dir_path, output_path='../Examples/amr_fasta_dataset.csv'):
    """Build the labelled sequence CSV from the ``0`` and ``1`` sub-folders.

    Sequences found under ``dir_path/0`` get label 0 and those under
    ``dir_path/1`` get label 1. A missing label folder is skipped. All rows
    are gathered first and the DataFrame is built once, instead of growing
    it one row at a time.

    Args:
        dir_path: Folder that contains the ``0`` and ``1`` sub-folders.
        output_path: Destination CSV file. Defaults to the path that was
            previously hard-coded.
    """
    columns = ['sample_id', 'sequence', 'label (not resistant[0]/resistant[1] to Trimethoprim)']
    rows = []
    for label in (0, 1):
        label_dir = os.path.join(dir_path, str(label))
        if not os.path.isdir(label_dir):
            continue
        # extract_sequences_from_directory yields [sequence, record_id] pairs.
        for sequence, sample_id in extract_sequences_from_directory(label_dir):
            rows.append([sample_id, sequence, label])
    pd.DataFrame(rows, columns=columns).to_csv(output_path, index=False)
# Build ../Examples/amr_fasta_dataset.csv from the labelled FASTA folders under ../Examples.
populate_amr_fasta_dataset("../Examples")

In [4]:
def get_amr_fasta_dataset(path='../Examples/amr_fasta_dataset.csv'):
    """Load the labelled sequence CSV and normalise the label column to int.

    Args:
        path: CSV file to read. Defaults to the path that was previously
            hard-coded.

    Returns:
        A DataFrame with at least the ``sample_id``, ``sequence`` and label
        columns; the label column is cast to ``int``.

    Raises:
        KeyError: If any of the required columns is absent from the CSV.
    """
    label_column = 'label (not resistant[0]/resistant[1] to Trimethoprim)'
    df = pd.read_csv(path)

    # Fail early with a readable message rather than relying on no-op
    # self-assignments to surface a missing column.
    missing = [name for name in ('sequence', 'sample_id', label_column) if name not in df.columns]
    if missing:
        raise KeyError(f"dataset is missing required column(s): {missing}")

    df[label_column] = df[label_column].astype(int)
    return df

# Display the loaded dataset; the table below is this cell's output.
get_amr_fasta_dataset()

Unnamed: 0,sample_id,sequence,label (not resistant[0]/resistant[1] to Trimethoprim)
0,CP133856.1,TAACTCCCTATAATGCGCCACCACTGACACGGAACAACGGCAAACA...,0
1,CP173596.1,TTTCGCCCGGATGGATGTCCTGTACGGATAACTGGAACATAGTTCT...,0
2,CP173597.1,TTTTACTGCGCCGGCTGACGCGGCGCGGCAGGAACGCTGCCTGTGG...,0
3,CP173595.1,ACCTCCCGGAGTTTGTTCGGGACCATCCACCGCTCCATCTCACGGA...,0
4,CP173593.1,TCCGGATGATCGGATTCGACAGTGAAAGTTTTATCTGTCGCATTTG...,0
5,CP173594.1,ACTTCTTTGACGGCTTTTGGCGTTGCAGCGCGCGTTTCAGAAGTAC...,0
6,CP119740.1,GTGTCACTTTCGCTTTGGCAGCAGTGTCTTGCCCGATTGCAGGATG...,1
7,CP119741.1,TGGTTTATGGGGCCCTCTCCCTGATTCGCATGTCGTGCTTTTTCTT...,1
8,CP119742.1,GGGGCTGGAATGTCAACGTATCCCGGGTGATGGATAACGCCACACA...,1
9,CP119743.1,TACACCAATTAGGTAAAGTTATTTTTAAGTATCGAGGCAACTTTCA...,1


In [5]:
def get_kmers(sequence, k):
    """Return every overlapping substring of length ``k`` in ``sequence``.

    Args:
        sequence: The string to slide the window over.
        k: Window length; must be at least 1.

    Returns:
        The k-mers in left-to-right order. The list is empty when
        ``sequence`` is shorter than ``k``.

    Raises:
        ValueError: If ``k`` is smaller than 1 (a non-positive window would
            otherwise silently yield empty or truncated slices).
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    return [sequence[i:i + k] for i in range(len(sequence) - k + 1)]

In [24]:
def get_all_kmers_dict(k=4):
    """Return a dict mapping every possible DNA k-mer to a zero count.

    Args:
        k: Length of the k-mers to enumerate. Defaults to 4, the length that
            was previously hard-coded.

    Returns:
        A dict with ``4 ** k`` keys over the alphabet A, C, G, T, in
        lexicographic order, each initialised to 0.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    bases = ['A', 'C', 'G', 'T']
    return {''.join(letters): 0 for letters in product(bases, repeat=k)}

In [23]:
def get_kmer_frequencies(sequence, k):
    """Count how often each DNA k-mer occurs in ``sequence``.

    The vocabulary is every string of length ``k`` over A, C, G, T, so the
    result always has ``4 ** k`` entries. Previously the vocabulary was fixed
    to 4-mers regardless of ``k``, which made every count 0 for ``k != 4``.

    Args:
        sequence: Sequence string to scan with an overlapping window.
        k: Length of the k-mers to count; must be at least 1.

    Returns:
        A dict mapping each possible k-mer to its number of occurrences.
        Windows containing any other character (e.g. ``N``) are not counted.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    kmer_dict = {''.join(letters): 0 for letters in product('ACGT', repeat=k)}
    for start in range(len(sequence) - k + 1):
        kmer = sequence[start:start + k]
        # Only windows made purely of A/C/G/T are part of the vocabulary.
        if kmer in kmer_dict:
            kmer_dict[kmer] += 1
    return kmer_dict

### Dataset 1 (Name: K-mer Frequency)

##### Example:
- **AAAA**: 3474 (Frequency of every KMer)  
- **AAAC**: 5938  
- **....**: ....  

##### Additional Information:
- **GC_content**: Fraction of guanine (G) and cytosine (C) bases in the sequence (a value between 0 and 1, as returned by `gc_fraction`)  
- **Length**: Length of the chain  
- **AT_ratio**: Proportion of adenine and thymine  
- **Label**: Metadata for forensic analysis of the model  

In [36]:
def get_gc_content(sequence):
    """Return the GC fraction of ``sequence`` as computed by Biopython's ``gc_fraction``."""
    gc_value = gc_fraction(sequence)
    return gc_value

In [40]:
def get_sequence_length(sequence):
    """Return the number of characters in ``sequence``."""
    n_bases = len(sequence)
    return n_bases

In [41]:
def get_at_ratio(sequence):
    """Return the share of adenine and thymine bases in ``sequence``.

    Counting is case-insensitive so that lower-case (soft-masked) bases are
    included rather than silently ignored.

    Args:
        sequence: Sequence string.

    Returns:
        ``(A + T) / len(sequence)``, or 0 for an empty sequence.
    """
    total = len(sequence)
    if total == 0:
        return 0
    normalised = sequence.upper()
    return (normalised.count('A') + normalised.count('T')) / total

In [51]:
def get_molecular_weight(sequence):
    """Return the monoisotopic molecular weight of ``sequence`` treated as DNA.

    Delegates to Biopython's ``molecular_weight``.
    """
    weight_options = {'seq_type': 'DNA', 'monoisotopic': True}
    return molecular_weight(sequence, **weight_options)

In [None]:
# NOTE: this cell repeats the get_at_ratio definition from an earlier cell;
# when the cells run top to bottom, this later definition is the one in effect.
def get_at_ratio(sequence):
    """Return the share of adenine and thymine bases in ``sequence``.

    Counting is case-insensitive so that lower-case (soft-masked) bases are
    included rather than silently ignored.

    Args:
        sequence: Sequence string.

    Returns:
        ``(A + T) / len(sequence)``, or 0 for an empty sequence.
    """
    total = len(sequence)
    if total == 0:
        return 0
    normalised = sequence.upper()
    return (normalised.count('A') + normalised.count('T')) / total

In [None]:
def get_sequence_length(sequence):
    """Return the number of characters in ``sequence``."""
    n_bases = len(sequence)
    return n_bases

In [None]:
def get_molecular_weight(sequence):
    """Return the monoisotopic molecular weight of ``sequence`` treated as DNA.

    Delegates to Biopython's ``molecular_weight``.
    """
    weight_options = {'seq_type': 'DNA', 'monoisotopic': True}
    return molecular_weight(sequence, **weight_options)

In [54]:
def create_amr_fasta_dataset():
    """Build the k-mer frequency feature table from the labelled sequence CSV.

    Each input row yields one output row containing, in order: the original
    columns except ``sequence``, the 4-mer counts, and the ``gc_content``,
    ``sequence_length``, ``at_ratio`` and ``molecular_weight`` features.

    Returns:
        A DataFrame with one feature row per input sequence.
    """
    df = get_amr_fasta_dataset()

    # The sequence-free view is the same for every row, so compute it once
    # instead of copying the whole frame on each iteration.
    metadata = df.drop(columns=['sequence'])

    new_rows = []
    for position, sequence in enumerate(df['sequence']):
        # Start with the original row (excluding 'sequence')
        row_data = metadata.iloc[position].to_dict()

        # Add k-mer frequencies, then the scalar sequence features
        row_data.update(get_kmer_frequencies(sequence, 4))
        row_data['gc_content'] = get_gc_content(sequence)
        row_data['sequence_length'] = get_sequence_length(sequence)
        row_data['at_ratio'] = get_at_ratio(sequence)
        row_data['molecular_weight'] = get_molecular_weight(sequence)

        new_rows.append(row_data)

    return pd.DataFrame(new_rows)

# Dataset 1: per-sequence 4-mer counts plus the scalar sequence features.
df1 = create_amr_fasta_dataset()

In [56]:
# Show (rows, columns) of the feature table.
df1.shape

(15, 262)

### Dataset 2 (FCGR IMAGE 2D-MATRIX)