## Imports

In [24]:
import os
import pandas as pd
import seaborn as sns
import sys
import matplotlib.pyplot as plt
import numpy as np
import openpyxl

# # Custome functions
# sys.path.append('../scripts/python')
# from taxonomic_composition_tables import adjust_suthaus_2020_df, prepare_data_for_plotting, plot_taxonomic_data

## Global variables

In [20]:
project = 'Suthaus_2023'
cell = 'cell_combined'
marker = 'Full18S'
sim = 'sim_90'
denoise_method = 'dada2'
raw_data = os.path.join('..', 'raw_data')
results_dir_path = os.path.join('..', 'results')
blast_results = os.path.join(results_dir_path, 'tax_assignment_vsearch', project, marker, denoise_method, sim, 'blast')
output_path = os.path.join(results_dir_path, 'final_tables')

# Creating taxonomic table

In [17]:
# get paths

files = os.listdir(blast_results)
paths = [blast_results + '/' + file for file in files]

paths

record = {'otu_id': [], 
          'size_denoised': [],
          'size_clustered': [],
          'kingdom': [], 
          'domain': [], 
          'phyllum': [], 
          'class': [], 
          'order': [], 
          'family': [], 
          'genus': [], 
          'species': [], 
          'closest_match': [],
          'percent_identity': [], 
          'sample': []}
    
for path in paths:
    with open(path, 'rt') as f:
        lines = f.readlines()
        for line in lines:
            record['otu_id'].append(line.split('\t')[0])
            record['size_denoised'].append(line.split('\t')[0].split('size=')[-1].split(';')[0])
            record['size_clustered'].append(line.split('\t')[0].split('seqs=')[-1])
            tax_assign = line.split('\t')[1]
            record['kingdom'].append(tax_assign.split(',')[0].split(':')[1])
            record['domain'].append(tax_assign.split(',')[1].lstrip('d:'))
            record['phyllum'].append(tax_assign.split(',')[2].lstrip('p:'))
            record['class'].append(tax_assign.split(',')[3].lstrip('c:'))
            record['order'].append(tax_assign.split(',')[4].lstrip('o:'))
            record['family'].append(tax_assign.split(',')[5].lstrip('f:'))
            record['genus'].append(tax_assign.split(',')[6].lstrip('g:'))
            record['species'].append(tax_assign.split(',')[7].lstrip('s:'))
            record['closest_match'].append(line.split('\t')[1].split(';')[0].split('.')[0])
            record['percent_identity'].append(line.split('\t')[2])
            record['sample'].append(path.split('/')[-1].lstrip('blast6_').rstrip('.tab'))

        
sum_table = pd.DataFrame.from_dict(record)

In [18]:
sum_table

Unnamed: 0,otu_id,size_denoised,size_clustered,kingdom,domain,phyllum,class,order,family,genus,species,closest_match,percent_identity,sample
0,centroid=8611da65f64225524f93bf014c6976de2ab0e...,47,1,Eukaryota,TSAR,Stramenopiles-Gyrista,Chrysophyceae,Hibberdiales,Hibberdiaceae,Chrysocapsa,Chrysocapsa_wetherbeei,EF165145,95.6,Th38_18S
1,centroid=3b6fcdde430729db31873baca638267ec9b38...,25,1,Eukaryota,TSAR,Rhizaria-Cercozoa,Endomyxa,Vampyrellida,Leptophryidae,Leptophryidae_X,Leptophryidae_XX,KC511080,99.5,Th38_18S
2,centroid=e05afbbd21a37383dafb80e7326bee5f7b0bc...,30,1,Eukaryota,TSAR,Stramenopiles-Gyrista,Chrysophyceae,Hibberdiales,Hibberdiaceae,Chrysocapsa,Chrysocapsa_wetherbeei,EF165145,96.1,Th38_18S
3,centroid=ccaf456376a5db4e20fb9f6e684150e1f7a46...,393,19,Eukaryota,TSAR,Stramenopiles-Gyrista,Eustigmatophyceae,Eustigmatophyceae_X,Eustigmatophyceae_XX,Nannochloropsis,Nannochloropsis_limnetica,JF489988,97.5,Th38_18S
4,centroid=bc999389b5a27d4438971fe8822066a1cff1f...,294,14,Eukaryota,TSAR,Stramenopiles-Gyrista,Raphidophyceae,Raphidophyceae_X,Raphidophyceae_XX,Vacuolaria,Vacuolaria_virescens,U41651,99.6,Th38_18S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1416,centroid=791cea00db76faad88ebfb3498a9901691f24...,2,1,Eukaryota,TSAR,Alveolata-Apicomplexa,Gregarinomorphea,Cryptogregarinorida,Cryptosporidiidae,Cryptosporidium13,Cryptosporidium13_serpentis,AF093502,90.2,NH1_18S
1417,centroid=6713b660f89b156bc521e2dc3b6ab80c940f7...,4,1,Eukaryota,TSAR,Alveolata-Apicomplexa,Gregarinomorphea,Stenophorida,Stenophoridae,Stenophora,Stenophora_robusta,FJ459760,91.1,NH1_18S
1418,centroid=7297285da1abbbd690c36976eb58364d18e60...,3,1,Eukaryota,Archaeplastida,Chlorophyta-Chlorophyta_X,Trebouxiophyceae,Trebouxiophyceae_X,Trebouxiophyceae_XX,Trebouxiophyceae_XXX,Trebouxiophyceae_XXX_sp.,JQ988938,99.6,NH1_18S
1419,centroid=b2012e19c9da28d7e60cef27e1f64f89111c9...,2,1,Eukaryota,Archaeplastida,Chlorophyta-Chlorophyta_X,Ulvophyceae,Ulotrichales,Ulotrichales_X,Hazenia,Hazenia_mirabilis,MF034630,99.7,NH1_18S


In [21]:
# Create the directory if it does not exist
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [25]:
# save the summary table as excel tables

sum_table.to_excel(f'{output_path}/summary_table_{project}_{marker}_{denoise_method}_{sim}.xlsx')

In [None]:
# save the summary table as tsv tables

sum_table.to_csv(f'{otu_results}/OTU_summary_table_{marker}_{cell}_{sim}.tsv', sep = '\t')

In [None]:
# save the filtered summary table as tsv tables

sum_table_filtered.to_csv(f'{otu_results}/OTU_summary_table_{marker}_{cell}_{sim}_filtered.tsv', sep = '\t')