In [1]:
import pandas as pd
import seaborn as sns
import glob
from numpy import genfromtxt
# from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
import numpy as np
# https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html
import json
from collections import OrderedDict
import os
import re
import logging
import multiprocessing
from functools import partial
from datetime import datetime
from Bio import SeqIO
from Bio.Seq import Seq
import gzip
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold

  import pandas.util.testing as tm


### Create map from gene ID to cluster ID

In [2]:
# panta input directory
# pantain_dir = '/data/hoan/amromics/prediction/data/Ecoli1936/prokkatest/'
pantain_dir = '/data/hoan/amromics/prediction/data/Ecoli1936/prokka/'
# panta output dir
# pantaout_dir = '/data/hoan/amromics/prediction/output/pantaEcoli1936aligntest/'
pantaout_dir = '/data/hoan/amromics/prediction/output/pantaEcoli1936align_v4/'

In [3]:
with open(pantaout_dir + 'annotated_clusters.json', 'r') as JSON:
    json_dict = json.load(JSON)
# data = json.loads('/data/hoan/amromics/prediction/output/pantaEcoli1936aligntest/clusters.json')[0]

In [4]:
# json_dict

In [5]:
gene2clusterdict = {}
for key in json_dict:
    if len(json_dict[key])==0:
        gene2clusterdict[key] = key
    for gene in json_dict[key]['gene_id']:
        gene2clusterdict[gene] = key

### Find all AMR genes

In [6]:
def parse_gff_AMRgene_finder(gff_fh, sample_id, min_protein_len=40):
    # gene_annotation = OrderedDict()
    # gene_position = OrderedDict()    
    # suffix = 1
    # bed_records = []
    # gene_index = 0
    seq_id = None
    min_cds_len = 3 * min_protein_len
    gene_list = []
    
    for line in gff_fh:            
        if line.startswith('##FASTA'):
            #Done reading gff, move on to reading fasta
            break

        if line[0] == '#':
            continue
        line = line.strip()
        #print(line)
        cells = line.split('\t')
        if cells[2] != 'CDS':
            continue
        if 'BARRGD' not in cells[8]:
            continue
        start = int(cells[3])
        end = int(cells[4])
        length = end - start + 1
        if length < min_cds_len:
            continue
        if length % 3 != 0:
            continue
        cells[0] = cells[0].replace('-','_') #make sure seq_id has no -
        
        if seq_id != cells[0]:
            seq_id = cells[0]
            gene_index = 0

        # strand = cells[6]
        tags = cells[8].split(';')
        gene_id = None
        gene_name = ''
        gene_product = ''
        for tag in tags:
            if tag.startswith('ID='):
                gene_id = tag[3:]
            elif tag.startswith('gene='):                    
                gene_name = tag[5:]
                gene_name = re.sub(r'\W', '_', gene_name)
            elif tag.startswith('product='):                    
                gene_product = tag[8:]
        if gene_id == None:
            continue

        # Ensure gene_id is in the format of sample_id-seq_id-gene_tag
        if not gene_id.startswith(sample_id + '-'):
            gene_id = sample_id + '-' + gene_id

        if not gene_id.startswith(sample_id + '-' + seq_id + '-'):
            gene_id = sample_id + '-' + seq_id + '-' + gene_id[len(sample_id)+1:]

        gene_list.append(gene_id)
    
    return gene_list

In [7]:
# def parse_alignment(gff_fh):
#     sample_list = []
#     seq_list = []
#     index = 0
#     for line in gff_fh:            
#         if line[0] == '>':
#             if index >= 1:
#                 seq_list.append(seq)
#             index+=1
#             sample_list.append(line.split('-')[0][1:])
#             seq = ''
#         else:
#             seq += line[:-1]
#             # seq_list.append(line)
#     seq_list.append(seq)
#     return sample_list, seq_list

In [8]:
amr_gene = []
for data_dir in glob.glob(pantain_dir + '*.gff'):
    # print(data_dir)
    in_fh = open(data_dir)
    sample_id = data_dir.split('/')[-1][:-4]
    amr_gene += parse_gff_AMRgene_finder(in_fh, sample_id)
    in_fh.close()

In [9]:
amr_gene[:3], len(amr_gene)

(['SAMEA2204230.contig-SAMEA2204230.contig00001-KJJADFBE_00063',
  'SAMEA2204230.contig-SAMEA2204230.contig00001-KJJADFBE_00095',
  'SAMEA2204230.contig-SAMEA2204230.contig00001-KJJADFBE_00151'],
 119509)

## TODO: Map genes back to cluster IDs

In [10]:
amr_clusterID = [gene2clusterdict[gene] for gene in amr_gene]
amr_clusterID = list(set(amr_clusterID))

In [11]:
len(amr_clusterID)

In [15]:
sample_isolate = pd.read_csv('/data/hoan/amromics/prediction/data/Ecoli1936metafiles/sample_isolate.csv')
sample_isolate.head(2)
sample2isolate = {}
for idx in range(len(sample_isolate.index)):
    sample2isolate[sample_isolate.iloc[idx,0]+'.contig'] = sample_isolate.iloc[idx,1]

In [20]:
pa_matrix = pd.read_csv('/data/hoan/amromics/prediction/output/pantaEcoli1936align_v4/gene_presence_absence.Rtab', sep='\t', index_col=0).T

In [21]:
metadata = pd.read_csv('PanPred/test_data/Metadata.csv')
metadata = metadata.set_index(metadata['Isolate'])
isolate_index = [sample2isolate[sample] for sample in pa_matrix.index]
metadata_panta = metadata.loc[isolate_index]

In [22]:
# sel = VarianceThreshold(threshold=0)
# pa_matrix = sel.fit_transform(pa_matrix)

In [23]:
pa_matrix

Gene,groups_0,namA,groups_2,groups_3,groups_4,groups_5,groups_6,groups_7,groups_8,groups_9,...,groups_74779,groups_74780,groups_74781,groups_74782,traI_2_16929,groups_74784,groups_74785,groups_74786,groups_74787,groups_74788
SAMEA2204229.contig,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SAMEA2204230.contig,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SAMEA2204231.contig,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SAMEA2204232.contig,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SAMEA2204233.contig,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMEA3531855.contig,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SAMEA3531856.contig,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SAMEA3531869.contig,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SAMEA3531871.contig,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [106]:
column_sum = pa_matrix.sum(axis=0)
L = list(column_sum)

In [107]:
core_genes = []
for idx in range(len(L)):
    if L[idx] > 1653*0.99:
        core_genes.append(pa_matrix.columns[idx])

In [108]:
len(core_genes)

2114

In [111]:
core_genes[:5]

['rsmI', 'ssb_1', 'rpnA', 'rluF', 'groups_20580']

### Compute label encoder for core genes

In [113]:
with open(pantaout_dir + 'samples.json', 'r') as JSON:
    sample_dict = json.load(JSON)
sample2integerindex = {}
for idx in range(len(sample_dict)):
    sample2integerindex[sample_dict[idx]['id']] = idx
n_samples = len(sample_dict)

In [114]:
# sample2integerindex

In [115]:
# codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
#          'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '-']
# le = preprocessing.LabelEncoder()
# le.fit(codes)
# le.transform(['-', 'P'])

In [116]:
amr_mat = None;
start_idx = [0];
# end_idx = [];
idx = 0
for gene in core_genes:
    alignment_dir = pantaout_dir + 'clusters/' + gene +'/'+gene+'.faa.aln.gz'
    # https://www.biostars.org/p/710/
    # fasta_sequences = SeqIO.parse(open(alignment_dir),'fasta')
    # for fasta in fasta_sequences:
    #     name, sequence = fasta.id, str(fasta.seq)
    #     print(name)
    #     print(sequence)
    codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
             'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '-']
    le = preprocessing.LabelEncoder()
    le.fit(codes)
    mat = None
    index = 0
    index_set = []
    with gzip.open(alignment_dir, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            name, sequence = record.id, str(record.seq)
            sample_id = name.split('-')[0]
            # print(name)
            # print(sequence)
            # print(1 + le.transform([*sequence]))
            if index == 0:
                mat = np.zeros((n_samples, len(sequence)))
            index += 1
            mat[sample2integerindex[sample_id],:] = 1 + le.transform([*sequence])
            index_set.append(sample2integerindex[sample_id])
            # print(record.id)
    if idx==0:
        amr_mat = mat
        idx += 1
    else:
        # ## Run feature selection
        variant_thres = 0.1
        vs = True
        if len(index_set) >= int(n_samples*0.01):
            try:
                sel = VarianceThreshold(variant_thres)
                sel.fit(mat[index_set,:])
            except ValueError:
                vs = False
            if vs:
                mat = mat[:, sel.variances_>variant_thres]
                if mat.shape[0] > 0:
                    start_idx += [start_idx[-1] + mat.shape[1]]
                    amr_mat = np.append(amr_mat, mat, axis=1)
end_idx = start_idx[1:]
start_idx = start_idx[:-1]

ValueError: zero-dimensional arrays cannot be concatenated

In [None]:
amr_mat.shape

In [None]:
# np.save(pantaout_dir + 'amrlabelencodermat_VarianceThreshold.npy', amr_mat) # save numpy array
np.save(pantaout_dir + 'amrlabelencodermat_VT10.npy', amr_mat) # save numpy array
# np.save(pantaout_dir + 'amrlabelencodermat.npy', amr_mat) # save numpy array

In [7]:
# load data
amr_mat = np.load(pantaout_dir + 'amrlabelencodermat_VT10.npy')

In [12]:
# mapping = dict()
# for i in range(22):
#     mapping[i] = i
# def one_hot_encode2(seq):
#     seq2 = [mapping[i] for i in seq]
#     return np.eye(22)[seq2].flatten()

In [13]:
# amr_mat = amr_mat.astype(int)
# amr_matOnehot = None
# for idx in range(amr_mat.shape[0]):
#     if idx == 0:
#         amr_matOnehot = one_hot_encode2(amr_mat[idx,:])
#     else:
#         amr_matOnehot = np.vstack([amr_matOnehot, one_hot_encode2(amr_mat[idx,:])])

In [14]:
# amr_matOnehot.shape, amr_mat.shape