In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import os
import subprocess

In [2]:
# Read the sample sheet workbook; `df` is the sample table the following cells build on.
sample_sheet_path = 'GSP_Sample_Sheet.xlsx'
df = pd.read_excel(sample_sheet_path)

In [3]:
# Map the numeric sample codes of the sample sheet onto complete_name / genepid.
# The code is taken from the trailing "_<3-4 digits>" suffix of `name_meta`;
# names without such a suffix end up with code 0.
meta = pd.read_csv('../data/metadata_fixed.csv', index_col=0)
code_suffix = meta['name_meta'].str.extract(r"_(\d{3,4})$")
meta['sample_code'] = code_suffix.fillna(0).astype(int)

# Inner join: only sample-sheet rows whose code is present in the metadata are kept.
meta_lookup = meta[['sample_code', 'complete_name', 'genepid']]
df = df.merge(meta_lookup, left_on='Sample Code', right_on='sample_code')

In [4]:
target_file = 'GSP_samples_data.xlsx'

# Append mode (and `if_sheet_exists`) requires an existing workbook and the
# openpyxl engine. On a first run the workbook does not exist yet, so create it
# instead of failing with FileNotFoundError.
if os.path.exists(target_file):
    writer = pd.ExcelWriter(target_file, engine='openpyxl', mode='a', if_sheet_exists='replace')
else:
    writer = pd.ExcelWriter(target_file, engine='openpyxl', mode='w')

# `writer` is intentionally left open: later cells add more sheets to the same
# workbook and the last cell of the notebook closes it.
df.to_excel(writer, sheet_name='sample_metadata', index=False)

# PanRes metadata

In [5]:
# PanRes annotations in long format (columns used below: gene, variable, value);
# the first line of the file is skipped.
panres_meta = pd.read_csv('../data/panres_annotations.tsv', sep='\t', skiprows=1)

# Remove the version suffix as a literal string. Without regex=False the result
# depends on the pandas version (as a regex, the dots would match any character).
panres_meta['gene'] = panres_meta['gene'].str.replace('_v1.0.1', '', regex=False)

In [6]:
# Genes whose 'database' annotation is 'resfinder'
resfinder_genes = panres_meta.query("variable == 'database' & value == 'resfinder'")['gene'].unique().tolist()
is_resfinder_gene = panres_meta['gene'].isin(resfinder_genes)

# FASTA header rows ('fa_name') of those genes that start with 'resfinder'
resfinder_headers = panres_meta.loc[
    (is_resfinder_gene) & (panres_meta['variable'] == 'fa_name') & (panres_meta['value'].str.startswith('resfinder'))
]

# One 'class' column per gene; multiple class annotations are joined with "/"
resfinder_classes = panres_meta.loc[
    (is_resfinder_gene) & (panres_meta['variable'] == 'class')
].pivot_table(index='gene', columns='variable', values='value', aggfunc="/".join)

# Gene length per gene, stored under its own column name
resfinder_lengths = panres_meta.loc[
    (is_resfinder_gene) & (panres_meta['variable'] == 'gene_length')
].drop(columns='variable').rename(columns={'value': 'gene_length'})

resfinder_fa = (
    resfinder_headers
    .merge(resfinder_classes, left_on='gene', right_index=True)
    .merge(resfinder_lengths, on='gene')
)

# Strip the literal 'resfinder|' prefix. As a regex, 'resfinder|' is an
# alternation with the empty string and would leave the '|' in place.
resfinder_fa['value'] = resfinder_fa['value'].str.replace('resfinder|', '', regex=False)

resfinder_fa.drop(columns=['variable']).rename(columns={'value': 'fasta_header'}).to_excel(writer, sheet_name='resfinder_metadata', index=False)

In [7]:
# Genes whose 'resistance_type' annotation is 'biocide'
biocide_genes = panres_meta.query("variable == 'resistance_type' & value == 'biocide'")['gene'].unique().tolist()
is_biocide_gene = panres_meta['gene'].isin(biocide_genes)

# Gene length per gene, stored under its own column name
biocide_lengths = panres_meta.loc[
    (is_biocide_gene) & (panres_meta['variable'] == 'gene_length')
].drop(columns='variable').rename(columns={'value': 'gene_length'})

# FASTA header rows ('fa_name') of those genes that start with 'megares'
biocide_fa = panres_meta.loc[
    (is_biocide_gene) & (panres_meta['variable'] == 'fa_name') & (panres_meta['value'].str.startswith('megares'))
].merge(biocide_lengths, on='gene')

# Strip the literal 'megares|' prefix. As a regex, 'megares|' is an alternation
# with the empty string and would leave the '|' in place.
biocide_fa['value'] = biocide_fa['value'].str.replace('megares|', '', regex=False)

# The gene name is the last '|'-separated field of the header
biocide_fa['gene_name'] = biocide_fa['value'].str.extract(r"\|(\w+)$")

# The compound is the "<something>_resistance" token of the header, optionally
# with a parenthesised part directly before "_resistance"
pattern = r"\b(\w+(?:_\w+)*_\(\w+\)_resistance\b|\b\w+(?:_\w+)*_resistance)\b"
biocide_fa['compound'] = biocide_fa['value'].str.extract(pattern)[0].values

biocide_fa.drop(columns=['variable']).rename(columns={'value': 'fasta_header'}).to_excel(writer, sheet_name='biocide_metadata', index=False)

# Find count data

In [8]:
# Index every .mapstat file by its file name without the extension.
mapstat_dir = os.path.join('..', 'data', 'mapstats')
panres_mapstats = {}
for mapstat_path in glob.glob(os.path.join(mapstat_dir, '*.mapstat')):
    sample_name = os.path.basename(mapstat_path).replace('.mapstat', '')
    panres_mapstats[sample_name] = mapstat_path

In [9]:
def read_mapstat(file):
    """Read one .mapstat file.

    Parameters
    ----------
    file : str
        Path to the .mapstat file. The file name without the '.mapstat'
        extension is used as the sample's complete_name.

    Returns
    -------
    df : pandas.DataFrame
        The tabular part of the file (the first 6 lines are skipped), with the
        '# refSequence' column renamed to 'refSequence' and a 'complete_name'
        column added.
    tot_fragment : pandas.DataFrame
        One-row frame with the columns 'complete_name' and 'total_fragments'.

    Raises
    ------
    ValueError
        If no line containing '## fragment' is found, or its last
        tab-separated field is not an integer.
    """
    # file name without extension == complete_name of the sample
    fname = os.path.basename(file).replace('.mapstat', '')

    # Total fragment count: last tab-separated field of the '## fragment' line.
    # Read in Python rather than through `grep` in a shell, so that paths with
    # spaces or shell metacharacters work and no external tool is required.
    total_fragments = None
    with open(file) as handle:
        for line in handle:
            if '## fragment' in line:
                total_fragments = int(line.strip().split('\t')[-1])
                break
    if total_fragments is None:
        raise ValueError(f"No '## fragment' header line found in {file}")

    # read tabular data
    df = pd.read_csv(file, sep='\t', skiprows=6)
    df['complete_name'] = fname

    # rename first column to remove the leading '# '
    df = df.rename(columns={'# refSequence': 'refSequence'})

    # one-row summary frame
    tot_fragment = pd.DataFrame([fname, total_fragments], index=['complete_name', 'total_fragments']).T
    return df, tot_fragment

def parse_mapstat(file, biocide_fa, resfinder_fa):
    """Annotate one .mapstat file with the ResFinder and biocide gene tables.

    Parameters
    ----------
    file : str
        Path of the .mapstat file, passed on to ``read_mapstat``.
    biocide_fa, resfinder_fa : pandas.DataFrame
        Annotation tables; each needs a 'gene' column (matched against
        'refSequence') and a 'gene_length' column convertible to int.

    Returns
    -------
    tot_fragments : pandas.DataFrame
        The summary frame from ``read_mapstat`` extended with the number of
        distinct genes hit and the summed length-adjusted counts per table.
    resfinder_mapstat, biocide_mapstat : pandas.DataFrame
        Hits joined to the respective annotation table, with an added
        'fragmentCountAln_adj' column (fragmentCountAln per 1000 units of
        gene_length).
    """
    mapstat_df, tot_fragments = read_mapstat(file)
    hits = mapstat_df[['complete_name', 'refSequence', 'fragmentCountAln']]

    def _annotate(annotation):
        # inner join on the reference name, then scale the counts by gene length / 1000
        merged = hits.merge(annotation, left_on='refSequence', right_on='gene')
        length_in_thousands = merged['gene_length'].astype(int) / 1e3
        merged['fragmentCountAln_adj'] = merged['fragmentCountAln'] / length_in_thousands
        return merged

    resfinder_mapstat = _annotate(resfinder_fa)
    biocide_mapstat = _annotate(biocide_fa)

    # insertion order of this dict fixes the column order of the summary frame
    summary_values = {
        'n_resfinder_genes': resfinder_mapstat['refSequence'].nunique(),
        'n_biocide_genes': biocide_mapstat['refSequence'].nunique(),
        'total_resfinder_counts': resfinder_mapstat['fragmentCountAln_adj'].sum(),
        'total_biocide_counts': biocide_mapstat['fragmentCountAln_adj'].sum(),
    }
    for column, value in summary_values.items():
        tot_fragments[column] = value

    return tot_fragments, resfinder_mapstat, biocide_mapstat


# Per-sample results, collected in sample-sheet order
all_tots, resfinder_mapstats, biocide_mapstats = [], [], []

# Only the samples listed in the sample sheet are parsed (not every file on disk)
complete_names = df['complete_name'].values
n_samples = len(complete_names)
for position, complete_name in enumerate(complete_names, start=1):
    # carriage return keeps the progress counter on a single line
    print(f" {position}/{n_samples} ", end='\r')

    sample_totals, sample_resfinder, sample_biocide = parse_mapstat(
        panres_mapstats[complete_name],
        biocide_fa=biocide_fa,
        resfinder_fa=resfinder_fa,
    )

    all_tots.append(sample_totals)
    resfinder_mapstats.append(sample_resfinder)
    biocide_mapstats.append(sample_biocide)


 49/49 

## Extract total counts

In [10]:
# Kingdom-level mOTUs counts; only the 'Bacteria' column is used here.
motu_kingdom_counts = pd.read_csv('../motus_counts/motus_agg_pad/kingdom_motus_pad_agg_counts.csv')

# Join on genepid, then keep complete_name plus the renamed bacterial total.
sample_keys = df[['complete_name', 'genepid']]
bacteria_by_genepid = motu_kingdom_counts[['genepid', 'Bacteria']]
bacterial_counts = (
    sample_keys
    .merge(bacteria_by_genepid, on='genepid')
    .drop(columns='genepid')
    .rename(columns={'Bacteria': 'total_bacteria_counts'})
)

In [11]:
# One row per sample: per-file totals joined with the bacterial counts.
per_sample_totals = pd.concat(all_tots)
total_counts = per_sample_totals.merge(bacterial_counts, on='complete_name')

# Count totals and gene tallies go to separate sheets of the shared workbook.
count_columns = ['complete_name', 'total_fragments', 'total_bacteria_counts', 'total_resfinder_counts', 'total_biocide_counts']
gene_columns = ['complete_name', 'n_resfinder_genes', 'n_biocide_genes']

total_counts[count_columns].to_excel(writer, sheet_name='total_counts', index=None)
total_counts[gene_columns].to_excel(writer, sheet_name='total_genes_found', index=None)

## ResFinder counts

In [12]:
# Per sample and class: number of distinct genes hit and summed length-adjusted counts.
resfinder_hits = pd.concat(resfinder_mapstats)
resfinder_class_counts = resfinder_hits.groupby(['complete_name', 'class']).agg(
    {'gene': 'nunique', 'fragmentCountAln_adj': 'sum'}
)

# Wide table (samples x classes) of the gene tallies, written to the shared workbook.
class_gene_matches = (
    resfinder_class_counts[['gene']]
    .reset_index()
    .pivot_table(index='complete_name', columns='class', values='gene')
)
class_gene_matches.to_excel(writer, sheet_name='resfinder_class_gene_matches')

In [13]:
# Wide table (samples x classes) of the summed length-adjusted counts, saved as its own workbook.
resfinder_class_wide = (
    resfinder_class_counts
    .reset_index()
    .pivot_table(index='complete_name', columns='class', values='fragmentCountAln_adj', aggfunc='sum')
)
resfinder_class_wide.to_excel('resfinder_class_counts.xlsx')

In [14]:
# Wide table (samples x values of the 'value' column) of length-adjusted counts.
# pivot_table's default aggregation is used for cells with more than one row.
resfinder_gene_wide = (
    pd.concat(resfinder_mapstats)
    .pivot_table(index='complete_name', columns='value', values='fragmentCountAln_adj')
)
resfinder_gene_wide.to_excel('resfinder_gene_counts.xlsx')

## Biocide counts

In [15]:
# Per sample and compound: number of distinct genes hit and summed length-adjusted counts.
biocide_hits = pd.concat(biocide_mapstats)
biocide_compound_counts = (
    biocide_hits
    .groupby(['complete_name', 'compound'])
    .agg({'gene': 'nunique', 'fragmentCountAln_adj': 'sum'})
    .reset_index()
)

# Gene tallies go to the shared workbook ...
compound_gene_matches = biocide_compound_counts.pivot_table(index='complete_name', columns='compound', values='gene')
compound_gene_matches.to_excel(writer, sheet_name='biocide_class_gene_matches')

# ... while the summed counts are saved as their own workbook.
compound_count_wide = biocide_compound_counts.pivot_table(
    index='complete_name', columns='compound', values='fragmentCountAln_adj', aggfunc='sum'
)
compound_count_wide.to_excel('biocide_class_counts.xlsx')

In [16]:
# Wide table (samples x values of the 'value' column) of length-adjusted counts.
# pivot_table's default aggregation is used for cells with more than one row.
biocide_gene_wide = (
    pd.concat(biocide_mapstats)
    .pivot_table(index='complete_name', columns='value', values='fragmentCountAln_adj')
)
biocide_gene_wide.to_excel('biocide_gene_counts.xlsx')

In [17]:
# Close the ExcelWriter opened for target_file; the sheets written through
# `writer` in the cells above belong to that workbook.
writer.close()