# Parse .ann Files
Code to read and parse `.ann` files and convert the data to a pandas DataFrame that can be used for classification.

### Download Data

In [None]:
# Download the two .ann files from the CSB5/atminter GitHub repository
# (wget saves them into the current working directory by default).
# In the second URL, %23 is the URL-encoded form of the '#' character.
!wget https://raw.githubusercontent.com/CSB5/atminter/master/data/train_test_data/collated_train.ann
!wget https://raw.githubusercontent.com/CSB5/atminter/master/data/train_test_data/lactobacillus_acidophilus%23escherichia_coli.ann

### Convert .ann files to pandas dataframe

In [None]:
import pandas as pd
import numpy as np
import os

In [42]:
# Column names given to the parsed records, in the order the fields appear
# inside each record.
SCHEMA = [
    "label",
    "title",
    "abstract",
]

# Name of the CSV file the final DataFrame is written to.
OUTPUT_FILE_NAME = "AE_Data.csv"

In [24]:
def read_ann_file(filePath, record_size=3, encoding='utf-8'):
  """Read an .ann file and group its lines into fixed-size records.

  The file is read line by line and consecutive lines are grouped into
  chunks of ``record_size`` lines each.

  Args:
    filePath: Path of the .ann file to read.
    record_size: Number of consecutive lines that form one record.
      Defaults to 3.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so
      the result does not depend on the platform's default encoding.

  Returns:
    A list of records. Each record is a list of raw lines (trailing
    newline characters are kept). If the number of lines is not a multiple
    of ``record_size``, the last record is shorter than the others. An
    empty file yields an empty list.

  Raises:
    ValueError: If ``record_size`` is smaller than 1.
  """
  if record_size < 1:
    raise ValueError('record_size must be at least 1')

  with open(filePath, encoding=encoding) as f:
    lines = f.readlines()

  # Step through the lines one record at a time; slicing never overruns,
  # so a trailing partial record is returned as-is.
  return [lines[i:i + record_size] for i in range(0, len(lines), record_size)]

def convert_np_array_to_pdf(arr, schema=SCHEMA):
  """Build a pandas DataFrame from a sequence of records.

  Args:
    arr: Row-wise data, one entry per record (for example the list of
      records returned by ``read_ann_file``).
    schema: Column names for the resulting frame. Defaults to ``SCHEMA``.

  Returns:
    A pandas DataFrame whose columns are named after ``schema``.
  """
  frame = pd.DataFrame(data=arr, columns=schema)
  return frame

# Files to parse.
# NOTE(review): the '/content/' prefix presumably targets a Google Colab
# working directory - confirm, or adjust when running elsewhere.
input_file_list = [
    '/content/collated_train.ann',
    '/content/lactobacillus_acidophilus#escherichia_coli.ann'
]


# Parse each file into its own DataFrame and concatenate them all once at the
# end. DataFrame.append was removed in pandas 2.0, and appending inside the
# loop copies the accumulated frame on every iteration.
parsed_frames = []
for input_file in input_file_list:
  np_arr = read_ann_file(input_file)

  pdf = convert_np_array_to_pdf(np_arr)

  parsed_frames.append(pdf)

# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row numbers.
output_df = pd.concat(parsed_frames, ignore_index=True)

output_df.head()

Unnamed: 0,label,title,abstract
0,>F\n,antimicrobial impacts of essential oils on foo...,the antimicrobial activity of twelve essential...
1,>F\n,purification and characterization of a cystein...,antimicrobial peptide (amp) crustin is a type ...
2,>F\n,telavancin activity tested against gram-positi...,objectives: to reassess the activity of telava...
3,>F\n,the in vitro antimicrobial activity of cymbopo...,background: it is well known that cymbopogon (...
4,>F\n,screening currency notes for microbial pathoge...,fomites are a well-known source of microbial i...


In [30]:
# Show (number of rows, number of columns) of the combined DataFrame.
output_df.shape

(4145, 3)

In [26]:
# List the distinct raw label strings to see which variants need cleaning.
output_df['label'].unique()

array(['>F\n', '>T\n', '>F#(but others)\n', '>F#GENUS\n', '>F#others\n',
       '>T#GUTSPECIFIC\n', '>T#NOTSOLID\n',
       '>F#I::METABOLICINTERACTION\n', '>F#I::Exclusiviity\n',
       '>F#OPPOSITE\n', '>F#FAILURE\n', '>F#AGG\n', '>f\n', '>F#OPP\n',
       '>F <NO EVIDENCE TOWARDS, claims that no inhibition>\n',
       '>F#AMBIGUOUS\n', '>F#Take into account virulence decreases?\n',
       '@F\n', '>F#see if bacteriocin\n', '>F#ATT\n'], dtype=object)

### Clean label

Looking at the labels, the second character decides whether the label is 1 (`T`, in either case) or 0 (anything else).
Some extra text has spilled over into the label field; it needs to be ignored.

In [34]:
def clean_label(x):
  """Convert a raw label string into a binary class.

  Only the second character of the label is inspected; everything else
  (the leading marker and any text after the second character) is ignored.

  Args:
    x: Raw label string, e.g. '>T\\n' or '>F#GENUS\\n'.

  Returns:
    1 if the second character is 'T' or 't', otherwise 0. Labels with fewer
    than two characters also yield 0.
  """
  # Slice rather than index: x[1:2] is '' for labels shorter than two
  # characters, so they fall into class 0 instead of raising IndexError.
  # Only the sliced character is uppercased, so the comparison always looks
  # at the second character of the original string.
  second_char = x[1:2]

  return 1 if second_char.upper() == 'T' else 0


# Derive the binary class of every row from its raw label text and store it
# in a new 'cleaned_label' column.
output_df['cleaned_label'] = output_df['label'].apply(clean_label)

output_df.head()

Unnamed: 0,label,title,abstract,cleaned_label
0,>F\n,antimicrobial impacts of essential oils on foo...,the antimicrobial activity of twelve essential...,0
1,>F\n,purification and characterization of a cystein...,antimicrobial peptide (amp) crustin is a type ...,0
2,>F\n,telavancin activity tested against gram-positi...,objectives: to reassess the activity of telava...,0
3,>F\n,the in vitro antimicrobial activity of cymbopo...,background: it is well known that cymbopogon (...,0
4,>F\n,screening currency notes for microbial pathoge...,fomites are a well-known source of microbial i...,0


In [37]:
# Count how many rows fall into each cleaned class (0 / 1).
output_df['cleaned_label'].value_counts()

0    3851
1     294
Name: cleaned_label, dtype: int64

In [38]:
# Replace the raw text label with the cleaned 0/1 label.
# The guard makes this cell safe to re-run: once 'cleaned_label' has been
# renamed to 'label', running the drop again would delete the cleaned labels
# (and the rename would silently do nothing).
if 'cleaned_label' in output_df.columns:
  output_df = (
      output_df
      .drop(columns=['label'])
      .rename(columns={'cleaned_label': 'label'})
  )
output_df.head()

Unnamed: 0,title,abstract,label
0,antimicrobial impacts of essential oils on foo...,the antimicrobial activity of twelve essential...,0
1,purification and characterization of a cystein...,antimicrobial peptide (amp) crustin is a type ...,0
2,telavancin activity tested against gram-positi...,objectives: to reassess the activity of telava...,0
3,the in vitro antimicrobial activity of cymbopo...,background: it is well known that cymbopogon (...,0
4,screening currency notes for microbial pathoge...,fomites are a well-known source of microbial i...,0


In [43]:
# Write the final DataFrame to disk as CSV, leaving out the row index.
output_df.to_csv(path_or_buf=OUTPUT_FILE_NAME, index=False)