# Metagenomic Classification Data Processing

This notebook creates the datasets needed to train the metagenomic classification models. The datasets come from the paper [Deep learning models for bacteria taxonomic classification of metagenomic data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6069770/) by 
Fiannaca et al. The dataset is available at [this repo](https://github.com/IcarPA-TBlab/MetagenomicDC).

#### Amplicon Sequencing Dataset

This dataset was generated by simulating amplicon sequencing on the 16S rRNA hypervariable region of several bacterial species. Sequencing is simulated using the following primers: `CCTACGGGAGGCAGCAG` and `CCGTCAATTCMTTTRAGT`.

#### Shotgun Sequencing Dataset

This dataset was generated by simulating short-read shotgun sequencing on the same 16S rRNA region using the [Grinder](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3384353/) tool.

In [1]:
# Reload the autoreload extension and have edited modules re-imported automatically.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from fastai import *
from fastai.text import *
from Bio import Seq
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import FeatureLocation, CompoundLocation
import networkx as nx

In [3]:
# Make the directory three levels up importable so the project-local `utils` module resolves.
# NOTE(review): `sys` is not imported explicitly in this notebook; presumably it is
# provided by the fastai star imports above - confirm.
sys.path.append("../../..")
from utils import *

In [4]:
# Directory holding the input FASTA files; the output CSVs are written here as well.
# NOTE(review): hard-coded absolute local path - adjust to your own data location before running.
path = Path('F:/genome/bacterial genomes/')

# Shotgun Data

In [5]:
# Open the shotgun reads file as a FASTA record iterator (records are consumed in the next cell).
fname = '16S-reads.fa'
fasta = SeqIO.parse(path/fname, 'fasta')

In [6]:
# Materialise every record from the parser so they can be counted and iterated over.
fs = list(fasta)

In [7]:
# Number of FASTA records read from '16S-reads.fa'.
len(fs)

28224

In [8]:
# Raw sequence string of every FASTA record.
seqs = [str(record.seq) for record in fs]

# Label of every record: the text following the literal word 'description' in the
# header line, with the first character after it dropped and surrounding double
# quotes stripped.
names = [
    record.description.split('description')[1][1:].strip('"')
    for record in fs
]

In [9]:
# One label is collected per record, so this should equal len(fs).
len(names)

28224

In [10]:
# Number of distinct labels (stored below as the 'Species' column).
len(set(names))

100

In [11]:
# Two-column table: the read sequence and its species label.
df = pd.DataFrame(seqs, columns=['Sequence']).assign(Species=names)

In [12]:
# Keep only non-empty sequences made up exclusively of the four unambiguous bases.
# A subset test is used rather than set equality so that a read which happens to
# lack one of the four bases is not discarded along with the ambiguous ones.
valid_bases = set('ATGC')
df_clean = df[df.Sequence.map(lambda x: bool(x) and set(x.upper()) <= valid_bases)]
df_clean.shape

(27218, 2)

In [13]:
def partition_data(df, random_state=None):
    """Randomly split `df` into train, validation and test subsets.

    With n = len(df), the training set gets int(n * 0.93 * 0.93) rows, the
    validation set gets int(n * 0.93) minus the training size, and every
    remaining row goes to the test set (roughly 86.5% / 6.5% / 7%).

    Sampled rows are removed from the pool by index label, so `df` is
    expected to have unique index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split. It is not modified.
    random_state : optional
        Forwarded to `DataFrame.sample` for both draws. Pass a fixed value to
        make the split reproducible; the default `None` keeps the original
        unseeded behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        `(train_df, valid_df, test_df)`, each carrying an extra 'set' column
        holding 'train', 'valid' or 'test' respectively.
    """
    n_rows = len(df)
    train_size = int(n_rows*0.93*.93)
    valid_size = int(n_rows*0.93) - train_size

    train_df = df.sample(train_size, random_state=random_state)
    test_val = df.drop(train_df.index)
    valid_df = test_val.sample(valid_size, random_state=random_state)
    test_df = test_val.drop(valid_df.index)

    # assign() builds new frames, so the split label is never written into
    # the caller's frame or into an intermediate selection.
    train_df = train_df.assign(set='train')
    valid_df = valid_df.assign(set='valid')
    test_df = test_df.assign(set='test')

    return (train_df, valid_df, test_df)

In [14]:
# Split each species separately so every species contributes rows to all three sets.
trains, vals, tests = [], [], []

for species_name in df_clean['Species'].unique():
    per_species = df_clean[df_clean['Species'] == species_name]
    species_train, species_valid, species_test = partition_data(per_species)
    trains.append(species_train)
    vals.append(species_valid)
    tests.append(species_test)

In [15]:
# Stitch the per-species pieces back together into one frame per split.
train_df, valid_df, test_df = (
    pd.concat(pieces) for pieces in (trains, vals, tests)
)

In [16]:
# (rows, columns) of the train, validation and test frames.
train_df.shape, valid_df.shape, test_df.shape

((23493, 3), (1771, 3), (1954, 3))

In [17]:
# Stack the three splits into a single table; the 'set' column records each row's split.
data_df = pd.concat([train_df, valid_df, test_df])

In [18]:
# Write the labelled shotgun table next to the input data; index=False leaves the row index out of the file.
data_df.to_csv(path/'s16_shotgun_sequences.csv', index=False)

# Amplicon Data

In [5]:
# Parse the amplicon FASTA file and load every record into a list.
fname = '16S-trimmed.fa'
fasta = SeqIO.parse(path/fname, 'fasta')
fs = list(fasta)

In [6]:
# Number of FASTA records read from '16S-trimmed.fa'.
len(fs)

28000

In [7]:
# Raw sequence string of every FASTA record.
seqs = [str(record.seq) for record in fs]

# Label of every record: the text following the literal word 'description' in the
# header line, with the first character after it dropped and surrounding double
# quotes stripped.
names = [
    record.description.split('description')[1][1:].strip('"')
    for record in fs
]

In [8]:
# One label is collected per record, so this should equal len(fs).
len(names)

28000

In [9]:
# Number of distinct labels (stored below as the 'Species' column).
len(set(names))

96

In [10]:
# Two-column table: the read sequence and its species label.
df = pd.DataFrame(seqs, columns=['Sequence']).assign(Species=names)

In [11]:
# Keep only non-empty sequences made up exclusively of the four unambiguous bases.
# A subset test is used rather than set equality so that a read which happens to
# lack one of the four bases is not discarded along with the ambiguous ones.
valid_bases = set('ATGC')
df_clean = df[df.Sequence.map(lambda x: bool(x) and set(x.upper()) <= valid_bases)]
df_clean.shape

(27223, 2)

In [12]:
def partition_data(df, random_state=None):
    """Randomly split `df` into train, validation and test subsets.

    With n = len(df), the training set gets int(n * 0.93 * 0.93) rows, the
    validation set gets int(n * 0.93) minus the training size, and every
    remaining row goes to the test set (roughly 86.5% / 6.5% / 7%).

    Sampled rows are removed from the pool by index label, so `df` is
    expected to have unique index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split. It is not modified.
    random_state : optional
        Forwarded to `DataFrame.sample` for both draws. Pass a fixed value to
        make the split reproducible; the default `None` keeps the original
        unseeded behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        `(train_df, valid_df, test_df)`, each carrying an extra 'set' column
        holding 'train', 'valid' or 'test' respectively.
    """
    n_rows = len(df)
    train_size = int(n_rows*0.93*.93)
    valid_size = int(n_rows*0.93) - train_size

    train_df = df.sample(train_size, random_state=random_state)
    test_val = df.drop(train_df.index)
    valid_df = test_val.sample(valid_size, random_state=random_state)
    test_df = test_val.drop(valid_df.index)

    # assign() builds new frames, so the split label is never written into
    # the caller's frame or into an intermediate selection.
    train_df = train_df.assign(set='train')
    valid_df = valid_df.assign(set='valid')
    test_df = test_df.assign(set='test')

    return (train_df, valid_df, test_df)

In [13]:
# Split the whole cleaned amplicon table in a single call.
# NOTE(review): the shotgun section calls partition_data once per species, whereas this
# split is drawn across all species at once, so per-species proportions are not
# guaranteed here - confirm this difference is intended.
train_df, valid_df, test_df = partition_data(df_clean)

In [14]:
# (rows, columns) of the train, validation and test frames.
train_df.shape, valid_df.shape, test_df.shape

((23545, 3), (1772, 3), (1906, 3))

In [15]:
# Stack the three splits into a single table; the 'set' column records each row's split.
data_df = pd.concat([train_df, valid_df, test_df])

In [16]:
# Write the labelled amplicon table next to the input data; index=False leaves the row index out of the file.
data_df.to_csv(path/'s16_amplicon_sequences.csv', index=False)