In [1]:
import pandas as pd
import seaborn as sns
import glob
from numpy import genfromtxt
# from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
import numpy as np
import json
from collections import OrderedDict
import os
import re
import logging
import multiprocessing
from functools import partial
from datetime import datetime
from Bio import SeqIO
from Bio.Seq import Seq
import gzip
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
# https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html

### Find SNPs info

In [2]:
pantaout_dir = '/data/hoan/amromics/prediction/output/pantaEcoli1936align_v9/'
with open(pantaout_dir + 'samples.json', 'r') as JSON:
    sample_dict = json.load(JSON)
sample2integerindex = {}
for idx in range(len(sample_dict)):
    sample2integerindex[sample_dict[idx]['id']] = idx
n_samples = len(sample_dict)
amrgene_annotation = pd.read_pickle(pantaout_dir + 'SNPsCoreGeneFullSubmission_metadata.pkl')

In [3]:
amrgene_annotation.head(2) # index in SNPs matrix

Unnamed: 0,gene,start_index,end_index
0,namA,0,8341
1,vgrG1,8341,10348


In [4]:
snpID = 'yfcR@192641' # 192641 corresponds to the index in the SNPs matrix

In [5]:
def getSNPPositionIndex(snpID):
    ## Compute the SNP matrix
    geneID = snpID.split('@')[0]
    start_index = amrgene_annotation.loc[amrgene_annotation['gene']==geneID].iloc[0,1]
    snpPositionWithinGene = int(snpID.split('@')[1]) - start_index
    alignment_dir = pantaout_dir + 'clusters/' + geneID +'/'+geneID+'.faa.aln.gz'
    codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
             'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '-', 'X']
    le = preprocessing.LabelEncoder()
    le.fit(codes)
    mat = None; index = 0; index_set = []
    with gzip.open(alignment_dir, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            name, sequence = record.id, str(record.seq)
            sample_id = name.split('-')[0]
            if index == 0:
                mat = np.zeros((n_samples, len(sequence)))
            index += 1
            mat[sample2integerindex[sample_id],:] = 1 + le.transform([*sequence])
            index_set.append(sample2integerindex[sample_id])

    variant_thres = 0
    vs = True
    try:
        sel = VarianceThreshold(variant_thres)
        sel.fit(mat[index_set,:])
    except ValueError:
        vs = False
    if vs:
        mat = mat[:, sel.variances_>variant_thres]
        
    ## Compute the index
    count = 0
    truePosition = None
    boolVec = sel.variances_>variant_thres
    for idx in range(len(boolVec)):
        if boolVec[idx]:
            count += 1
        if count == snpPositionWithinGene:
            truePosition = count
    positionIndex = geneID +'@'+str(truePosition)
    return(positionIndex)

In [6]:
positionIndex = getSNPPositionIndex(snpID)

In [7]:
positionIndex

'yfcR@69'

### Find Kmer info

In [8]:
kmerID = 'VSLHINQGEI'

In [9]:
from pangraph.utils import parse_gff_AMRgene_finder, parse_gff_AMRgene_finder_BARRGD_ID
with open(pantaout_dir + 'annotated_clusters.json', 'r') as JSON:
    json_dict = json.load(JSON)
gene2clusterdict = {}
for key in json_dict:
    if len(json_dict[key])==0:
        gene2clusterdict[key] = key
    for gene in json_dict[key]['gene_id']:
        gene2clusterdict[gene] = key

In [10]:
amr_gene = []
pantain_dir = '/data/hoan/amromics/prediction/data/Ecoli1936/prokka/'
for data_dir in glob.glob(pantain_dir + '*.gff'):
    # print(data_dir)
    in_fh = open(data_dir)
    sample_id = data_dir.split('/')[-1][:-4]
    amr_gene += parse_gff_AMRgene_finder(in_fh, sample_id)
    in_fh.close()

In [11]:
amr_clusterID = [gene2clusterdict[gene] for gene in amr_gene]
amr_clusterID = list(set(amr_clusterID))

In [12]:
amr_gene[0:1], len(amr_clusterID), amr_clusterID[0:4]

(['SAMEA2204230.contig-SAMEA2204230.contig00001-KJJADFBE_00063'],
 216,
 ['atoS', 'tet_X_', 'aph_3___IIa', 'tetA_58__2'])

In [13]:
def getAMRPositionIndex(kmerID):
    ksize = 10; # k = 10 for protein, 20 for DNA
    topAMRList = [];
    pairdata = []
    positionIndexVec = []
    computed_gene_cluster = amr_clusterID
    for idx in range(len(computed_gene_cluster)):
        alignment_dir = pantaout_dir + 'clusters/' + computed_gene_cluster[idx] +'/'+computed_gene_cluster[idx]+'.faa.aln.gz'
        # alignment_dir = pantaout_dir + 'clusters/' + computed_gene_cluster[idx] +'/'+computed_gene_cluster[idx]+'.fna.aln.gz'
        clusterBool = False
        with gzip.open(alignment_dir, "rt") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                name, sequence = record.id, str(record.seq)
                sample_id = name.split('-')[0]
                seqraw = sequence.replace('-', '')
                n_kmers = len(seqraw) - ksize + 1
                # print(idx, end =',')
                breakLoop = False
                for i in range(n_kmers):
                    kmer = seqraw[i:i + ksize] 
                    if kmer == kmerID:
                        clusterBool = True
                        pairdata.append((computed_gene_cluster[idx] , sample_id))
                        positionIndexVec.append(computed_gene_cluster[idx]+'@'+str(i)+'@'+str(i+10))
                        # breakLoop = True;
                        # break;
        if clusterBool:
            topAMRList.append(computed_gene_cluster[idx])
    positionIndexVecUnique = np.unique(positionIndexVec)
    positionAMRIndex = ''
    for v in positionIndexVecUnique:
        positionAMRIndex = positionAMRIndex + v + ';'
    positionAMRIndex = positionAMRIndex[:-1]
    return(positionAMRIndex)

In [14]:
positionIndex = getAMRPositionIndex(kmerID)

In [15]:
positionIndex

'tetA_58__7_00656@23@33'

In [16]:
def getTypeVariant(variantID):
    if '@' in variantID:
        return ('SNP')
    elif '_' in variantID:
        return ('PAGene')
    elif len(variantID) != 10:
        return ('PAGene')
    else:
        return ('AMRKmer')

In [17]:
# Opening JSON file
scoreInputFileDir = 'data/TopFeatures/gain_ft_importance.json'
# 'data/TopFeatures/split_ft_importance.json'
with open(scoreInputFileDir) as json_file:
    gainImportance = json.load(json_file)

In [18]:
# gainImportance

In [19]:
# for v in gainImportance['AMP'][0]:
#     print(v, getTypeVariant(v))

In [20]:
def Reverse(lst):
   new_lst = lst[::-1]
   return new_lst

In [21]:
dfVec = []
nFeatures = 10
count = 0
for antibiotics in gainImportance:
    # count += 1
    # if count == 3:
    #     break
    print('Reverse the list')
    featureList = Reverse(gainImportance[antibiotics][0])
    scoreList = Reverse(gainImportance[antibiotics][1])
    print(antibiotics)
    for i in range(nFeatures):
        print(i, end = ',')
        feature = featureList[i]
        score = scoreList[i]
        featureType = getTypeVariant(feature)
        positionIndex = None
        if featureType == 'SNP':
            positionIndex = getSNPPositionIndex(feature)
        elif featureType == 'AMRKmer':
            positionIndex =  getAMRPositionIndex(feature)
        else:
            positionIndex = feature
        dfVec.append([antibiotics, feature, featureType, i, score, positionIndex])

Reverse the list
AMC
0,1,2,3,4,5,6,7,8,9,Reverse the list
AMP
0,1,2,3,4,5,6,7,8,9,Reverse the list
AMX
0,1,2,3,4,5,6,7,8,9,Reverse the list
CET
0,1,2,3,4,5,6,7,8,9,Reverse the list
CIP
0,1,2,3,4,5,6,7,8,9,Reverse the list
CTX
0,1,2,3,4,5,6,7,8,9,Reverse the list
CTZ
0,1,2,3,4,5,6,7,8,9,Reverse the list
CXM
0,1,2,3,4,5,6,7,8,9,Reverse the list
GEN
0,1,2,3,4,5,6,7,8,9,Reverse the list
TBM
0,1,2,3,4,5,6,7,8,9,Reverse the list
TMP
0,1,2,3,4,5,6,7,8,9,

In [22]:
df = pd.DataFrame(dfVec)
df.columns = ['antibiotics', 'featureName', 'variantType', 'rank', 'featureScore', 'positionIndex']

In [23]:
if 'gain' in scoreInputFileDir:
    df.to_csv('data/TopFeatures/gain_feature_annotation.csv', index=False)
else:
    df.to_csv('data/TopFeatures/split_feature_annotation.csv', index=False)

### Copy clusters to directory

In [24]:
df_file = pd.read_csv('data/TopFeatures/gain_feature_annotation.csv')

In [25]:
df_file.head(2)

Unnamed: 0,antibiotics,featureName,variantType,rank,featureScore,positionIndex
0,AMC,DRKRLLISLG,AMRKmer,0,1236.919297,oleI@203@213
1,AMC,FAMAHIVTLT,AMRKmer,1,445.742985,emrB_1@369@379


In [26]:
# for i in range(len(df_file.index)):
#     if (df_file.iloc[i,2]=='AMRKmer') or (df_file.iloc[i,2]=='SNP'):
#         positionIndex = df_file.iloc[i,5].split(';')
#         for cluster in positionIndex:
#             gene = cluster.split('@')[0]
#             print(gene, end = ',')
#             alignment_dir = pantaout_dir + 'clusters/' + gene +'/'+gene+'.faa.aln.gz'
#             os.system('cp '+ alignment_dir + ' data/TopFeatures/featureClusters')

In [27]:
gene_annotation = pd.read_csv(pantaout_dir + 'gene_annotation.csv')

In [28]:
# gene_annotation

In [29]:
amr_gene = []; AMR_anno = [];
pantain_dir = '/data/hoan/amromics/prediction/data/Ecoli1936/prokka/'
for data_dir in glob.glob(pantain_dir + '*.gff'):
    # print(data_dir)
    in_fh = open(data_dir)
    sample_id = data_dir.split('/')[-1][:-4]
    amr_gene_sample, AMR_anno_sample= parse_gff_AMRgene_finder_BARRGD_ID(in_fh, sample_id)
    amr_gene += amr_gene_sample
    AMR_anno += AMR_anno_sample
    in_fh.close()

In [30]:
geneID2NCBI = {}
for gene, itemID in AMR_anno:
    geneID2NCBI[gene] = itemID

In [31]:
clusterID = 'emrB_1'
NCBIList = []
for gene in json_dict[clusterID]['gene_id']:
    if gene in geneID2NCBI:
        print(gene2clusterdict[gene], geneID2NCBI[gene])
        NCBIList.append(geneID2NCBI[gene])

emrB_1 NG_048029.1
emrB_1 NG_055981.1
emrB_1 NG_055981.1
emrB_1 NG_055981.1
emrB_1 NG_055981.1
emrB_1 NG_055981.1
emrB_1 NG_048121.1


In [32]:
from collections import Counter

def most_frequent_element(lst):
    counts = Counter(lst)
    most_common = counts.most_common(1)
    return most_common[0][0]

In [33]:
# Example usage:
print(most_frequent_element(NCBIList))  # Output: 3

NG_055981.1


In [46]:
def get_NCBIID(clusterID):
    NCBIList = []
    for gene in json_dict[clusterID]['gene_id']:
        if gene in geneID2NCBI:
            NCBIList.append(geneID2NCBI[gene])
    if len(NCBIList) > 0:
        return(most_frequent_element(NCBIList))
    else:
        return ('')

In [47]:
get_NCBIID(clusterID)

'NG_055981.1'

In [50]:
np.unique(gene_annotation.loc[gene_annotation['gene_name'] == clusterID]['gene_product'].values) # many gene same with the same cluster

array(['Colistin resistance protein EmrB',
       'Multidrug export protein EmrB'], dtype=object)

In [51]:
most_frequent_element(gene_annotation.loc[gene_annotation['gene_name'] == clusterID]['gene_product'].values)

'Colistin resistance protein EmrB'

In [37]:
# most_frequent_element([1,1,2,2,2])

In [38]:
df_file.head(2)

Unnamed: 0,antibiotics,featureName,variantType,rank,featureScore,positionIndex
0,AMC,DRKRLLISLG,AMRKmer,0,1236.919297,oleI@203@213
1,AMC,FAMAHIVTLT,AMRKmer,1,445.742985,emrB_1@369@379


In [39]:
# antibiotics, featureID (Name), variantType, rank, featureScore, Gene location (position Index), Gene, Gene Function, NCBI ID

In [77]:
df_full = []
for i in range(len(df_file.index)):
    positionIndex = df_file.iloc[i, 5].split("@")
    print(i,end=',')
    if len(positionIndex) == 3:
        gene = positionIndex[0]
        position = positionIndex[1]+':'+positionIndex[2]
    elif len(positionIndex) == 2:
        gene = positionIndex[0]
        position = positionIndex[1]
    else:
        gene = positionIndex[0]
        position = ''
    NCBIID = get_NCBIID(gene)
    if gene in gene_annotation['gene_name'].values:
        function = most_frequent_element(gene_annotation.loc[gene_annotation['gene_name'] == gene]['gene_product'].values)
    else:
        function = ''
    if gene in amr_clusterID:
        AMRCluster = 'Yes'
    else:
        AMRCluster = 'No'
    row = [df_file.iloc[i, 0], df_file.iloc[i, 1], df_file.iloc[i, 2], int(df_file.iloc[i, 3]) + 1, df_file.iloc[i, 4], position,gene,function,NCBIID, AMRCluster]
    df_full.append(row)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,

In [78]:
df = pd.DataFrame(df_full)
df.columns = ['Antibiotics', 'Feature ID', 'Variant Type', 'Rank', 'Feature Score', 'Gene Location', 'Gene', 'Gene Function', 'NCBI ID', 'AMR Gene']

In [79]:
df.to_csv('revision/feature_annotation.csv', index=None)

In [80]:
gene in gene_annotation['gene_name']

False

In [1]:
# amr_clusterID