In [None]:
import pandas as pd
import seaborn as sns
import glob
from numpy import genfromtxt
# from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
import numpy as np
import json
from collections import OrderedDict
import os
import re
import logging
import multiprocessing
from functools import partial
from datetime import datetime
from Bio import SeqIO
from Bio.Seq import Seq
import gzip
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
# https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html

In [None]:
# ### panta input directory (prokka)
# pantain_dirdb = '/data/hoan/amromics/prediction/data/Ecoli1936/prokkatest/sub1/'
# pantain_dir = '/data/hoan/amromics/prediction/data/Ecoli1936/prokkatest/sub2/'
# ### panta output directory
# pantaout_dirdb = '/data/hoan/amromics/prediction/output/pantaaddtestOld/'
# pantaout_dir = '/data/hoan/amromics/prediction/output/pantaaddtest/'

In [None]:
### panta input directory (prokka)
# Directory globbed for '*.gff' annotation files of the database (training) samples.
# All four paths are later joined to file names by plain string concatenation,
# so each one has to end with '/'.
pantain_dirdb = '/data/hoan/amromics/prediction/data/Ecoli1936/prokka_train/train1/'
# Input directory of the new (test) samples.
pantain_dir = '/data/hoan/amromics/prediction/data/Ecoli1936/prokka_test/test1/'
### panta output directory
# Database run: annotated_clusters.json, gene_presence_absence.Rtab, samples.json
# and clusters/<id>/<id>.faa.aln.gz are read from here.
pantaout_dirdb = '/data/hoan/amromics/prediction/output/pantatrain1/'
# Test run: the same files are read from here, and every .npy/.pkl produced by
# this notebook (including the db-derived ones) is written here.
pantaout_dir = '/data/hoan/amromics/prediction/output/pantatest1/'

## Find correspondence (injection) from db clusters to test clusters
#### if they have the same representative gene

In [None]:
# Load the cluster annotation of the test run and index it in both directions:
# cluster ID -> representative gene ID and representative gene ID -> cluster ID.
with open(pantaout_dir + 'annotated_clusters.json', 'r') as annotation_file:
    json_dict = json.load(annotation_file)

cluster2representativegenedict = {
    cluster_id: annotation['representative']
    for cluster_id, annotation in json_dict.items()
}
# Should two clusters share a representative, the later cluster wins.
representativegene2clusterdict = {
    representative: cluster_id
    for cluster_id, representative in cluster2representativegenedict.items()
}

In [None]:
# Same two lookups for the database run.
with open(pantaout_dirdb + 'annotated_clusters.json', 'r') as annotation_file:
    json_dictdb = json.load(annotation_file)

cluster2representativegenedictdb = {}
representativegenedict2clusterdb = {}
for cluster_id, annotation in json_dictdb.items():
    representative = annotation['representative']
    cluster2representativegenedictdb[cluster_id] = representative
    # later clusters overwrite earlier ones that share the representative
    representativegenedict2clusterdb[representative] = cluster_id

In [None]:
# for key in json_dict:
#     print(key, json_dict[key])
#     break;

In [None]:
## PA matrix of new samples
pa_matrix = pd.read_csv(pantaout_dir+'gene_presence_absence.Rtab', sep='\t', index_col=0).T

In [None]:
## PA matrix of db
# After the transpose the notebook treats rows as samples and columns as gene clusters.
pa_matrixdb = pd.read_csv(pantaout_dirdb+'gene_presence_absence.Rtab', sep='\t', index_col=0).T
n_samples = pa_matrixdb.shape[0]
n_genes = pa_matrixdb.shape[1]
# Column sum of the presence/absence table for every cluster.
colsumdb = pa_matrixdb.sum()
# Core clusters: column sum of at least 99.99% of the db sample count (threshold used for E. coli).
# Selected with a boolean mask; looking values up as `colsumdb[idx]` with an integer
# on this label-indexed Series relies on the deprecated positional fallback.
core_gene_clusterdb = colsumdb.index[colsumdb >= 0.9999*n_samples].tolist()

In [None]:
# Cluster IDs in column order of the test and db presence/absence tables.
clusters = pa_matrix.columns.tolist()
clustersdb = pa_matrixdb.columns.tolist()

In [None]:
# Map every db cluster to the test cluster that holds the db cluster's representative gene.
#   1. Preferred: the gene is also the representative of a test cluster.
#   2. Fallback: the gene is an ordinary member of a test cluster.
# The member lookup table is built once; rescanning every cluster's gene list for each
# unmatched representative is quadratic. `setdefault` keeps the first cluster that
# lists a gene, which is the cluster a front-to-back scan would have found.
gene2clusterdict = {}
for test_cluster, annotation in json_dict.items():
    for gene in annotation.get('gene_id', ()):
        gene2clusterdict.setdefault(gene, test_cluster)

clustersdb2clustersdict = {}
unmapped_clustersdb = []
for db_cluster in clustersdb:
    repidxdb = cluster2representativegenedictdb[db_cluster]
    if repidxdb in representativegene2clusterdict:
        clustersdb2clustersdict[db_cluster] = representativegene2clusterdict[repidxdb]
    elif repidxdb in gene2clusterdict:
        clustersdb2clustersdict[db_cluster] = gene2clusterdict[repidxdb]
    else:
        # No test cluster contains this gene: the db cluster stays out of the mapping.
        unmapped_clustersdb.append(db_cluster)

if unmapped_clustersdb:
    # Any later lookup of these clusters in clustersdb2clustersdict raises KeyError.
    logging.warning("%d db clusters have no corresponding test cluster (first: %s)",
                    len(unmapped_clustersdb), unmapped_clustersdb[0])

In [None]:
# Reorder the test presence/absence columns so that column i is the test cluster
# matched to db cluster i (a test cluster can appear more than once).
dbclusterindex = list(map(clustersdb2clustersdict.__getitem__, clustersdb))
# PA matrix of test set
pa_matrixnew = pa_matrix.loc[:, dbclusterindex]

In [None]:
# Note that the cluster name can be different but they share the rep gene

### Presence and absence matrix of db and test

In [None]:
pa_matrixdb.head(2)

In [None]:
pa_matrixnew.head(2)

# Feature engineering

## Find all AMR clusters (db)

In [None]:
from pangraph.utils import parse_gff_AMRgene_finder

In [None]:
# Collect the output of parse_gff_AMRgene_finder for every GFF file of the db samples.
# The collected items are later used as gene IDs (keys of gene2clusterdictdb).
amr_gene = []
for data_dir in glob.glob(pantain_dirdb + '*.gff'):
    # Sample ID = file name without the 4-character '.gff' suffix.
    sample_id = data_dir.split('/')[-1][:-4]
    # The context manager closes the file even when the parser raises.
    with open(data_dir) as in_fh:
        amr_gene += parse_gff_AMRgene_finder(in_fh, sample_id)

In [None]:
amr_gene[:3], len(amr_gene)

In [None]:
## Create map from gene ID to cluster ID (db)
with open(pantaout_dirdb + 'annotated_clusters.json', 'r') as JSON:
    json_dictdb = json.load(JSON)

gene2clusterdictdb = {}
for key in json_dictdb:
    if len(json_dictdb[key])==0:
        # Empty annotation: map the cluster ID to itself and skip the member loop,
        # which would otherwise fail on the missing 'gene_id' entry.
        gene2clusterdictdb[key] = key
        continue
    # Every member gene points back to the cluster that lists it.
    for gene in json_dictdb[key]['gene_id']:
        gene2clusterdictdb[gene] = key

In [None]:
#### Map genes back to cluster IDs
# De-duplicate and sort. A plain list(set(...)) of strings comes out in a different
# order in every interpreter session, which makes the processing order and the
# displayed preview irreproducible.
amr_clusterID = sorted({gene2clusterdictdb[gene] for gene in amr_gene})

In [None]:
len(amr_clusterID), amr_clusterID[0:4]

## Compute K-mer of AMR clusters (db)

In [None]:
from pangraph.utils import binary_label
from sklearn.feature_selection import mutual_info_classif, chi2

In [None]:
# # Read representative sequence
# from Bio import SeqIO
# genecluster2representativeseq = {}
# with open(pantaout_dirDB+'representative_clusters_prot.fasta') as handle:
#     for record in SeqIO.parse(handle, "fasta"):
#         name, sequence = record.id, str(record.seq)
#         genecluster2representativeseq[name] = sequence
#         # print(name,'----', sequence)

In [None]:
with open(pantaout_dirdb + 'samples.json', 'r') as samples_file:
    sample_dictdb = json.load(samples_file)
# Matrix row of a db sample = position of its entry in samples.json.
sample2integerindexdb = {
    entry['id']: position for position, entry in enumerate(sample_dictdb)
}
n_samplesdb = len(sample_dictdb)

In [None]:
computed_gene_cluster = amr_clusterID;

In [None]:
# Enumerate every k-mer of every protein sequence in the AMR cluster alignments (db).
#   kmer_list : all k-mers seen, duplicates included
#   pairdata  : (row index of the sample, k-mer) for each occurrence
ksize = 10  # k = 10 for protein, 20 for DNA
kmer_list = []
pairdata = []
for cluster_id in computed_gene_cluster:
    # Protein alignment of the cluster; the DNA alignment would be '<cluster>.fna.aln.gz'.
    alignment_dir = pantaout_dirdb + 'clusters/' + cluster_id + '/' + cluster_id + '.faa.aln.gz'
    with gzip.open(alignment_dir, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # The text before the first '-' of the record ID is used as the sample ID.
            sample_id = record.id.split('-')[0]
            # Remove alignment gaps before cutting k-mers.
            seqraw = str(record.seq).replace('-', '')
            # Not prefixed with the cluster ID, so identical k-mers from different
            # clusters share one feature. Sequences shorter than ksize yield nothing.
            sequence_kmers = [seqraw[start:start + ksize]
                              for start in range(len(seqraw) - ksize + 1)]
            kmer_list.extend(sequence_kmers)
            pairdata.extend((sample2integerindexdb[sample_id], kmer)
                            for kmer in sequence_kmers)

In [None]:
# Distinct k-mers in sorted order. The position in this list becomes the column of
# the k-mer matrix and the order of the saved feature index, so it must not depend
# on set iteration order, which differs between interpreter sessions for strings.
unique_kmer = sorted(set(kmer_list))

In [None]:
# Column number of every distinct k-mer.
kmer2index = {kmer: column for column, kmer in enumerate(unique_kmer)}

In [None]:
kmer_matrix = np.zeros((n_samplesdb, len(unique_kmer)))

In [None]:
# Binary presence encoding: a cell is set to 1 when the sample contains the k-mer at
# least once; repeated (sample, k-mer) pairs just write the same 1 again.
pair_rows = np.fromiter((row for row, _ in pairdata), dtype=np.intp, count=len(pairdata))
pair_cols = np.fromiter((kmer2index[kmer] for _, kmer in pairdata), dtype=np.intp, count=len(pairdata))
kmer_matrix[pair_rows, pair_cols] = 1

In [None]:
kmer_matrix.shape

In [None]:
# Keep only k-mer columns whose variance over the db samples exceeds 0.01
# (0.05 was tried earlier).
selector = VarianceThreshold(threshold=0.01).fit(kmer_matrix)
kmer_matrix_VT = selector.transform(kmer_matrix)

In [None]:
# (1653, 59580)
kmer_matrix_VT.shape

In [None]:
np.save(pantaout_dir + 'KmerEncoderAMRGenesSubmissiondb.npy', kmer_matrix_VT) # save numpy array

In [None]:
# Names of the k-mers that survived the variance filter, in matrix column order.
kept_kmer_mask = selector.get_support()
kmerindexdb = np.array(unique_kmer)[kept_kmer_mask]

In [None]:
np.save(pantaout_dir + 'KmerEncoderAMRGenesSubmissiondb_index.npy', kmerindexdb) # save numpy array

In [None]:
kmer_matrixdbdf = pd.DataFrame(kmer_matrix_VT, columns = kmerindexdb)

In [None]:
# ### Check the difference between metadata and matrix
# diffindex = set(pa_matrix.index).difference(set(metadata.index))
# # diffindex
# newindex = [val + '0' if val in diffindex else val for val in pa_matrix.index]
# pa_matrix.index = newindex
# len(set(pa_matrix.index).difference(set(metadata.index)))

In [None]:
# ### Export refined metadata
# metadata_panta = metadata.loc[list(pa_matrix.index)]
# newcolumn = [item.replace("_", "@") for item in metadata_panta.columns]
# metadata_panta.columns = newcolumn
# metadata_panta.to_csv("/data/hoan/amromics/prediction/data/Kpmetadata_final.csv", index=False)

In [None]:
kmer_matrix_VT, kmer_matrix_VT.shape, kmerindexdb, kmerindexdb.shape

## Compute K-mer of AMR clusters (test)

In [None]:
with open(pantaout_dir + 'samples.json', 'r') as samples_file:
    sample_dict = json.load(samples_file)
# Matrix row of a test sample = position of its entry in samples.json.
sample2integerindex = dict(
    (entry['id'], position) for position, entry in enumerate(sample_dict)
)
# From here on n_samples is the number of test samples.
n_samples = len(sample_dict)

In [None]:
computed_gene_cluster = [clustersdb2clustersdict[cluster] for cluster in amr_clusterID];

In [None]:
# Same k-mer enumeration as for the db, now over the test alignments of the matched
# AMR clusters, with rows taken from sample2integerindex.
ksize = 10  # k = 10 for protein, 20 for DNA
kmer_list = []
pairdata = []
for cluster_id in computed_gene_cluster:
    # Protein alignment of the cluster; the DNA alignment would be '<cluster>.fna.aln.gz'.
    cluster_dir = pantaout_dir + 'clusters/' + cluster_id + '/'
    alignment_dir = cluster_dir + cluster_id + '.faa.aln.gz'
    with gzip.open(alignment_dir, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # The text before the first '-' of the record ID is used as the sample ID.
            sample_id = record.id.split('-')[0]
            seqraw = str(record.seq).replace('-', '')  # strip alignment gaps
            last_start = len(seqraw) - ksize
            start = 0
            while start <= last_start:
                kmer = seqraw[start:start + ksize]
                kmer_list.append(kmer)
                pairdata.append((sample2integerindex[sample_id], kmer))
                start += 1

In [None]:
unique_kmer = list(set(kmer_list))

In [None]:
# Column number of every distinct test k-mer.
kmer2index = dict(zip(unique_kmer, range(len(unique_kmer))))

In [None]:
kmer_matrix = np.zeros((n_samples, len(unique_kmer)))

In [None]:
# Binary presence encoding for the test samples: 1 when the sample contains the
# k-mer at least once; repeated pairs rewrite the same 1.
n_pairs = len(pairdata)
pair_rows = np.fromiter((pair[0] for pair in pairdata), dtype=np.intp, count=n_pairs)
pair_cols = np.fromiter((kmer2index[pair[1]] for pair in pairdata), dtype=np.intp, count=n_pairs)
kmer_matrix[pair_rows, pair_cols] = 1

In [None]:
kmer_matrix.shape

In [None]:
kmer_matrixdf = pd.DataFrame(kmer_matrix, columns=unique_kmer)

In [None]:
newkmerindexdb = list(set(kmerindexdb).difference(set(unique_kmer)))

In [None]:
# Give the test table an all-zero column for every db k-mer that never occurs in the
# test samples. The columns are attached in one concat; inserting thousands of new
# columns one after another fragments the frame. Columns that already exist (cell
# re-run) are left as they are, so the cell stays idempotent.
absent_kmers = [kmer for kmer in newkmerindexdb if kmer not in kmer_matrixdf.columns]
if absent_kmers:
    zero_columns = pd.DataFrame(0, index=kmer_matrixdf.index, columns=absent_kmers)
    kmer_matrixdf = pd.concat([kmer_matrixdf, zero_columns], axis=1)

In [None]:
kmer_matrixdftest = kmer_matrixdf[kmerindexdb]

In [None]:
kmer_matrixdftest.shape

In [None]:
kmer_matrix_VT = kmer_matrixdftest.values
np.save(pantaout_dir + 'KmerEncoderAMRGenesSubmission.npy', kmer_matrix_VT) # save numpy array

In [None]:
np.save(pantaout_dir + 'KmerEncoderAMRGenesSubmission_index.npy', kmerindexdb) # save numpy array

In [None]:
kmer_matrixdftest.head(2)

In [None]:
kmerindexdb

### Label encoder for AMR clusters (not considered)

In [None]:
# computed_gene_cluster = amr_clusterID;

In [None]:
# with open(pantaout_dir + 'samples.json', 'r') as JSON:
#     sample_dict = json.load(JSON)
# sample2integerindex = {}
# for idx in range(len(sample_dict)):
#     sample2integerindex[sample_dict[idx]['id']] = idx
# n_samples = len(sample_dict)

In [None]:
# amr_mat = None;
# start_idx = [0];
# pass_gene_cluster = [];
# for idx in range(len(computed_gene_cluster)):
#     alignment_dir = pantaout_dir + 'clusters/' + computed_gene_cluster[idx] +'/'+computed_gene_cluster[idx]+'.faa.aln.gz'
#     codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
#              'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '-', 'X']
#     le = preprocessing.LabelEncoder()
#     le.fit(codes)
#     mat = None; index = 0; index_set = []
#     with gzip.open(alignment_dir, "rt") as handle:
#         for record in SeqIO.parse(handle, "fasta"):
#             name, sequence = record.id, str(record.seq)
#             sample_id = name.split('-')[0]
#             if index == 0:
#                 mat = np.zeros((n_samples, len(sequence)))
#             index += 1
#             mat[sample2integerindex[sample_id],:] = 1 + le.transform([*sequence])
#             index_set.append(sample2integerindex[sample_id])
#             # print(record.id)
#     if idx==0:
#         pass_gene_cluster.append(computed_gene_cluster[idx])
#         start_idx += [start_idx[-1] + mat.shape[1]]
#         amr_mat = mat
#     else:
#         # ## Run feature selection
#         # variant_thres = 0.05
#         variant_thres = 0
#         vs = True
#         if len(index_set) >= int(n_samples*0.01):
#             try:
#                 sel = VarianceThreshold(variant_thres)
#                 sel.fit(mat[index_set,:])
#             except ValueError:
#                 vs = False
#             if vs:
#                 mat = mat[:, sel.variances_>variant_thres]
#                 if mat.shape[0] > 0:
#                     pass_gene_cluster.append(computed_gene_cluster[idx])
#                     start_idx += [start_idx[-1] + mat.shape[1]]
#                     amr_mat = np.append(amr_mat, mat, axis=1)
# end_idx = [start_idx[idx]-1 for idx in range(1, len(start_idx))]
# start_idx = start_idx[:-1]

In [None]:
# amr_mat.shape

In [None]:
# # metadata_panta = pd.read_csv("/data/hoan/amromics/prediction/data/Kametadata_final.csv")
# metadata_panta = pd.read_csv("data/Ecoli1936metafiles/metadata_final.csv")
# mutual_mat = []
# for idx in range(2, metadata_panta.shape[1]):
#     y_class = metadata_panta.iloc[:,idx].values
#     print(metadata_panta.columns[idx])
#     y, nonenan_index = binary_label(y_class) # v6
#     pa_matrix_new = amr_mat[nonenan_index, ]
#     y_new = y[nonenan_index].astype(int)
#     scores, pvalue = chi2(pa_matrix_new, y_new)
#     mutual_mat.append(scores)
# mutual_mat = np.array(mutual_mat)
# mutual_mat_mean = mutual_mat.mean(axis=0)

In [None]:
# top_features = np.argsort(mutual_mat_mean)[::-1][:20000]
# kmer_matrix_VT_top_features = amr_mat[:,top_features]
# kmer_matrix_VT_top_features.shape

In [None]:
# # np.save(pantaout_dir + 'amrlabelencodermat_top10kgenes_v9.npy', amr_mat) # save numpy array
# # np.save(pantaout_dir + 'KpAMRGeneLabelEncoderMat.npy', amr_mat) # save numpy array
# outdata_name = 'KpAMRGeneLabelEncoderMatTop20k'
# np.save(pantaout_dir + outdata_name + '.npy', kmer_matrix_VT_top_features) # save numpy array
# # outdata_name = 'genes_fold_' + str(fold_idx)
# # np.save(pantaout_dir + outdata_name + '.npy', amr_mat) # save numpy array

In [None]:
# amrgene_annotation = pd.DataFrame({'gene': pass_gene_cluster, 'start_index': start_idx, 'end_index': end_idx})
# amrgene_annotation.to_csv(pantaout_dir + outdata_name + '_geneindex.csv', index=None)

## Compute label encoder for core gene clusters (db)

In [None]:
computed_gene_cluster = core_gene_clusterdb;

In [None]:
# Alphabet of the protein alignments: the 20 standard amino acids, the gap symbol '-'
# and 'X'. Any other character in a sequence makes le.transform raise.
codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '-', 'X']
# LabelEncoder numbers its classes in sorted order (scikit-learn behaviour), so '-'
# receives code 0. The encoding loops add 1 to every code, which leaves 0 for the
# zero-initialised rows of samples that have no sequence in a cluster.
le = preprocessing.LabelEncoder()
le.fit(codes)

In [None]:
# Label-encode the protein alignments of the db core-gene clusters into one matrix.
#   rows    : db samples (sample2integerindexdb); a row stays 0 when the sample has
#             no sequence in the cluster
#   columns : named '<cluster>@<i>', where i counts the alignment columns at which
#             the cluster representative is not a gap
# Per-cluster blocks are collected and concatenated once at the end; calling
# np.append inside the loop copies the whole accumulated matrix for every cluster.
amr_mat = None;
positional_index = [];
encoded_blocks = []
variant_thres = 0
min_present = int(n_samplesdb*0.01)  # clusters seen in fewer samples are skipped
for idx in range(len(computed_gene_cluster)):
    cluster_id = computed_gene_cluster[idx]
    alignment_dir = pantaout_dirdb + 'clusters/' + cluster_id +'/'+cluster_id+'.faa.aln.gz'
    mat = None; index_set = []; selected_location = []
    with gzip.open(alignment_dir, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            name, sequence = record.id, str(record.seq)
            sample_id = name.split('-')[0]
            if mat is None:
                # Allocated on the first record, which fixes the alignment width.
                mat = np.zeros((n_samplesdb, len(sequence)))
            # +1 keeps 0 free for "sample has no sequence in this cluster".
            mat[sample2integerindexdb[sample_id],:] = 1 + le.transform([*sequence])
            index_set.append(sample2integerindexdb[sample_id])
            if name == cluster2representativegenedictdb[cluster_id]:
                # Alignment columns where the representative carries a residue.
                selected_location = [pos for pos, char in enumerate(sequence) if char != '-']
    if mat is None:
        # The alignment file contained no record: nothing to encode.
        continue
    mat = mat[:, selected_location] #only select the position where representative sequence is not "-"
    if idx==0:
        # The first cluster is kept unfiltered, as before.
        encoded_blocks.append(mat)
        positional_index += [cluster_id +'@'+ str(i) for i in range(mat.shape[1])]
        continue
    if len(index_set) < min_present:
        continue
    try:
        # Variance is measured only over the samples that have a sequence here.
        sel = VarianceThreshold(variant_thres)
        sel.fit(mat[index_set,:])
    except ValueError:
        # Raised by the fit, e.g. when no column varies: skip the cluster.
        continue
    bool_vec = sel.variances_>variant_thres
    encoded_blocks.append(mat[:, bool_vec])
    positional_index += [cluster_id +'@'+ str(i) for i in range(len(bool_vec)) if bool_vec[i]]
if encoded_blocks:
    amr_mat = np.concatenate(encoded_blocks, axis=1)

In [None]:
amr_mat.shape, len(positional_index)

In [None]:
snpmatrixdbdf = pd.DataFrame(amr_mat, columns = positional_index)

In [None]:
snpmatrixdbdf.head(2)

In [None]:
# amrgene_annotation = pd.DataFrame({'gene': pass_gene_cluster, 'start_index': start_idx, 'end_index': end_idx})
# # amrgene_annotation.to_csv(pantaout_dir + outdata_name + '_geneindex.csv', index=None)

In [None]:
# np.save(pantaout_dir + 'SNPsCoreGeneFullSubmission.npy', amr_mat) # save numpy array
# # amr_mat = np.load(pantaout_dir + 'SNPsCoreGeneFullSubmission.npy') 

In [None]:
snpmatrixdbdf.to_pickle(pantaout_dir + 'SNPsCoreGeneFullSubmission.pkl')
# amrgene_annotation = pd.read_pickle(pantaout_dir + 'SNPsCoreGeneFullSubmission_metadata.pkl')

In [None]:
# metadata_panta = pd.read_csv("/data/hoan/amromics/prediction/data/Kametadata_final.csv")
metadata_panta = pd.read_csv("data/Ecoli1936metafiles/metadata_final.csv")

In [None]:
metadata_panta.head(2)

In [None]:
samples_list = list(sample2integerindexdb.keys())

In [None]:
sample_isolate = pd.read_csv('/data/hoan/amromics/prediction/data/Ecoli1936metafiles/sample_isolate.csv')
# Key: first column + '.contig' (the form in which sample IDs are looked up below);
# value: second column (used as isolate name).
sample2isolate = {
    sample_name + '.contig': isolate
    for sample_name, isolate in zip(sample_isolate.iloc[:, 0], sample_isolate.iloc[:, 1])
}

In [None]:
# sample2isolate

In [None]:
isolate_list = [sample2isolate[key] for key in samples_list]

In [None]:
# Metadata rows of the db isolates, ordered like isolate_list. isolate_list follows the
# db sample index, so row i of this frame describes row i of the db feature matrices.
# A plain `.isin` filter keeps the order of the metadata file instead.
# NOTE(review): the scoring cell indexes the feature matrix with row positions returned
# by binary_label for this frame; this presumes the two are row-aligned - confirm.
isolate_position = {}
for position, isolate in enumerate(isolate_list):
    isolate_position.setdefault(isolate, position)
metadata_pantanew = metadata_panta.loc[metadata_panta['Isolate'].isin(isolate_list)]
# Stable sort: a file that is already in sample order comes out unchanged.
row_order = np.argsort(metadata_pantanew['Isolate'].map(isolate_position).to_numpy(), kind='stable')
metadata_pantanew = metadata_pantanew.iloc[row_order]
if len(metadata_pantanew) != len(isolate_list):
    # Missing or duplicated isolates shift the rows against the feature matrix.
    logging.warning("metadata has %d rows for %d db samples; rows are not aligned one-to-one",
                    len(metadata_pantanew), len(isolate_list))

In [None]:
amr_mat = snpmatrixdbdf.values

In [None]:
# Chi-square score of every SNP column against every label column of the metadata
# (columns from position 2 onwards), then the mean score per SNP column.
mutual_mat = []
for column_position, column_name in enumerate(metadata_pantanew.columns[2:], start=2):
    y_class = metadata_pantanew.iloc[:, column_position].values
    print(column_name)
    y, nonenan_index = binary_label(y_class) # v6
    pa_matrix_new = amr_mat[nonenan_index, ]
    if pa_matrix_new.shape[0] == 0:
        # No usable rows for this label column: it contributes no scores.
        continue
    y_new = y[nonenan_index].astype(int)
    scores, pvalue = chi2(pa_matrix_new, y_new)
    mutual_mat.append(scores)
mutual_mat = np.array(mutual_mat)
mutual_mat_mean = mutual_mat.mean(axis=0)

In [None]:
# Rank the features by mean chi-square score, best first, and keep at most 10000.
# np.argsort places NaN at the end of the ascending order, so reversing it would
# rank NaN-scored features first; NaN is therefore treated as the worst score.
ranking_scores = np.where(np.isnan(mutual_mat_mean), -np.inf, mutual_mat_mean)
top_features = np.argsort(ranking_scores)[::-1][:10000]
kmer_matrix_VT_top_features = amr_mat[:,top_features]
kmer_matrix_VT_top_features.shape

In [None]:
snp_features_name = snpmatrixdbdf.columns[top_features]

In [None]:
snpmatrixdbfinaldf = snpmatrixdbdf.iloc[:,top_features]

In [None]:
# np.save(pantaout_dir + 'SNPsCoreGeneTop10KSubmission.npy', kmer_matrix_VT_top_features) # save numpy array

In [None]:
# np.save(pantaout_dir + 'SNPsCoreGeneTop10KSubmission_index.npy', snp_features_name) # save numpy array

In [None]:
snpmatrixdbfinaldf.head(2)

In [None]:
snp_features_name

## Compute label encoder for core gene clusters (test)

In [None]:
computed_gene_cluster = [clustersdb2clustersdict[cluster] for cluster in core_gene_clusterdb];

In [None]:
# Same alphabet and encoder as in the db section, rebuilt here: the 20 standard amino
# acids, the gap symbol '-' and 'X'.
codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '-', 'X']
# Fitting on the identical list yields the identical class-to-code assignment, so test
# values are encoded the same way as db values.
le = preprocessing.LabelEncoder()
le.fit(codes)

In [None]:
# Db cluster IDs behind the selected SNP features (feature names are '<cluster>@<position>') ...
selected_db_clusters = [feature.split('@')[0] for feature in snp_features_name]
snp_features_name_unique = np.unique(selected_db_clusters)
# ... and the test clusters matched to them.
snp_features_name_test_unique = list(map(clustersdb2clustersdict.__getitem__, snp_features_name_unique))

In [None]:
# Label-encode the test alignments of the core clusters that carry a selected SNP feature.
# Columns are named after the db cluster ('<db cluster>@<i>') so they can be selected with
# the db feature names; i counts the alignment columns at which the test representative
# is not a gap.
# NOTE(review): the positions only line up with the db features when both runs use the
# same representative sequence for the cluster - confirm for clusters matched through
# the member-gene fallback.
amr_mat = np.empty(shape=[n_samples, 0])
positional_index = [];
# Set membership instead of scanning the list for every cluster.
wanted_clusters = set(snp_features_name_test_unique)
# Blocks are concatenated once after the loop; np.append inside the loop copies the
# whole accumulated matrix for every cluster.
encoded_blocks = [amr_mat]
for idx in range(len(computed_gene_cluster)):
    cluster_id = computed_gene_cluster[idx]
    if cluster_id not in wanted_clusters:
        continue
    alignment_dir = pantaout_dir + 'clusters/' + cluster_id +'/'+cluster_id+'.faa.aln.gz'
    mat = None; selected_location = []
    with gzip.open(alignment_dir, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            name, sequence = record.id, str(record.seq)
            sample_id = name.split('-')[0]
            if mat is None:
                # Allocated on the first record, which fixes the alignment width.
                mat = np.zeros((n_samples, len(sequence)))
            # +1 keeps 0 free for "sample has no sequence in this cluster".
            mat[sample2integerindex[sample_id],:] = 1 + le.transform([*sequence])
            if name == cluster2representativegenedict[cluster_id]:
                selected_location = [pos for pos, char in enumerate(sequence) if char != '-']
    if mat is None:
        # The alignment file contained no record: nothing to encode.
        continue
    mat = mat[:, selected_location] #only select the position where representative sequence is not "-"
    positional_index += [core_gene_clusterdb[idx] +'@'+ str(i) for i in range(mat.shape[1])] # use the same name as database
    encoded_blocks.append(mat)
amr_mat = np.concatenate(encoded_blocks, axis=1)

In [None]:
amr_mat.shape, len(positional_index)

In [None]:
snpmatrixtestdf = pd.DataFrame(amr_mat, columns = positional_index)

In [None]:
snpmatrixtestfinaldf = snpmatrixtestdf[snp_features_name]

In [None]:
snpmatrixtestfinaldf.shape

In [None]:
snpmatrixtestfinaldf.head(2)

In [None]:
snpmatrixdbfinaldf.head(2)

## Export data

In [None]:
# Write every feature table to the test output directory.
# NOTE(review): the k-mer and SNP tables are built in samples.json order and relabelled
# here with the row labels of the presence/absence tables; this presumes both orders
# agree - confirm.

# PA matrix
pa_matrixdb.to_pickle(pantaout_dir + 'PAmatrixdb.pkl')
pa_matrixnew.to_pickle(pantaout_dir + 'PAmatrix.pkl')

export_groups = (
    # AMR Kmer
    ((kmer_matrixdbdf, pa_matrixdb.index, 'AMRKmerdb.pkl'),
     (kmer_matrixdftest, pa_matrixnew.index, 'AMRKmer.pkl')),
    # SNPs
    ((snpmatrixdbfinaldf, pa_matrixdb.index, 'SNPmatrixdb.pkl'),
     (snpmatrixtestfinaldf, pa_matrixnew.index, 'SNPmatrix.pkl')),
)
for group in export_groups:
    # Relabel both tables of a group first, then write both.
    for frame, row_labels, _ in group:
        frame.index = row_labels
    for frame, _, file_name in group:
        frame.to_pickle(pantaout_dir + file_name)

In [None]:
# Sanity check: every db sample must have the same row index in the test sample table.
# A db sample that is missing from the test table counts as a mismatch instead of
# raising KeyError, and the success message is printed only when nothing mismatched.
sample_ids_match = all(
    sample2integerindex.get(key) == sample2integerindexdb[key]
    for key in sample2integerindexdb
)
if sample_ids_match:
    print("It is okay")
else:
    print("Sample IDs do not match! Please check");