In [None]:
import matplotlib.pyplot as plt
from ete3 import NCBITaxa
import seaborn as sns
import pandas as pd
import numpy as np
import glob
from itertools import repeat, product, chain
from collections import defaultdict
import plotly.express as px
import plotly
import random
import pickle
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm as tqdm
import os 
import shutil
import requests
import subprocess
from datetime import datetime
import collections
import math
import matplotlib.colors as mcolors
from matplotlib.patches import Patch
from sklearn.decomposition import PCA

# show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

# to disable ssl certificate check for downloading ncbi taxonomy
# NOTE(review): this swaps the default HTTPS context of the `ssl` module for an unverified one,
# so it is not limited to the taxonomy download.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
ncbi = NCBITaxa()
# ncbi.update_taxonomy_database()

# base directories (absolute paths on the cluster); every path ends with '/' because
# file names are appended by plain string formatting
home_path = '/cluster/raid/home/f80878961/'
script_path ='{}scripts/'.format(home_path)
working_path = '{}phenotypes/'.format(home_path)
fig_path = '/agsad/f80878961/Data-Raw/23_Livestock-PO/235_Bee_ZBF/planB/figures/'

# KrakenTools helper scripts
extract_kraken_reads_script = '{}KrakenTools/extract_kraken_reads.py'.format(script_path)
create_kreport_script = '{}KrakenTools/make_kreport.py'.format(script_path)

# phenotype data, indexed by the 'N°-Ruche-BeeStrong' column (the column itself is kept)
pheno_df = pd.read_csv('{}phenotype_suisse.csv'.format(working_path), sep=';')
pheno_df.index = pheno_df['N°-Ruche-BeeStrong']

# parse Sonia data: only the first six columns are read
mito_df = pd.read_csv('{}varroa_seq_suisse.csv'.format(working_path), sep=';', usecols=range(6))
# keep the rows whose 'name' compares (as a string) lower than or equal to 'BS18-0185'
mito_df = mito_df[mito_df['name'] <= 'BS18-0185']

## slurm

In [None]:
def write_preprocessing_script(script_fn, array_str, node_str):
    '''
    Write the Slurm array-job script that converts SRA entries into cleaned, gzipped paired FASTQ files.

    script_fn: path of the batch script to write (opened in 'w' mode, so an existing file is overwritten).
    array_str: text substituted for ARRAY_STRING on the `#SBATCH --array=` line.
    node_str: text substituted for NODE_STRING on the `#SBATCH --nodelist=` line.

    The generated script expects four positional arguments: bs_ids_file, sra_path, fastq_path
    and sample_SRA_file. For the line of bs_ids_file selected by SLURM_ARRAY_TASK_ID it
    1. copies every SRA entry found for that id (second field of the grep hits in sample_SRA_file)
       to a per-task scratch directory,
    2. runs fasterq-dump on each entry and concatenates the resulting _1/_2 FASTQ files,
    3. trims poly-G tails with cutadapt (minimum length 50, `-m 50`),
    4. runs FastQC,
    5. gzips both FASTQ files,
    6. moves everything starting with the id to fastq_path and deletes the scratch directory.
    '''
    runstr="""#!/bin/bash -l
#SBATCH --array=ARRAY_STRING
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --nodelist=NODE_STRING
#SBATCH --mem=100g
#SBATCH --time=6:00:00
#SBATCH --job-name=pp
#SBATCH --output=%x_%A_%a.out
#SBATCH --error=%x_%A_%a.err

modulesld
ebld
module use /software/anaconda3/envs/eb/easybuild/modules/all
ml SRA-Toolkit
ml cutadapt
ml FastQC

# file listing all BeeStrong ids (bs_id) for parallel computing
bs_ids_file=$1
sra_path=$2
fastq_path=$3
sample_SRA_file=$4

# work on scratch node is better when lots of I/O operations
node_scratch=/scratch/${USER}/tmp_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}/
mkdir -p $node_scratch

# getting bs_id
bs_id=$(sed -n ${SLURM_ARRAY_TASK_ID}'{p;q}' ${bs_ids_file})

echo "1. find all SRA ids corresponding to the bs_id and copy them to the scrach node"
sra_ids=()
for sra in $(grep $bs_id $sample_SRA_file | cut -f 2); do
    sra_ids+=($sra)
    echo $sra_path$sra
    cp -r $sra_path$sra $node_scratch
done

# initiate fastq files
fastq_1=$node_scratch${bs_id}_1.fastq
fastq_2=$node_scratch${bs_id}_2.fastq
touch $fastq_1
touch $fastq_2

echo "2. concatenate fastq from multiple SRA files and clean"
for sra in "${sra_ids[@]}"; do
    ls $node_scratch$sra
    fasterq-dump $node_scratch$sra -O $node_scratch
    cat ${node_scratch}${sra}_1.fastq >> $fastq_1
    cat ${node_scratch}${sra}_2.fastq >> $fastq_2
    rm -f ${node_scratch}${sra}_1.fastq
    rm -f ${node_scratch}${sra}_2.fastq
done

echo "3. trim poly-G tails"
fastq_1_t=$node_scratch${bs_id}_1_trim.fastq
fastq_2_t=$node_scratch${bs_id}_2_trim.fastq

cutadapt -a "G{10}" -A "G{10}" -m 50 -o $fastq_1_t -p $fastq_2_t $fastq_1 $fastq_2

mv -f $fastq_1_t $fastq_1
mv -f $fastq_2_t $fastq_2

echo "4. FastQC"
fastqc $fastq_1 $fastq_2

echo "5. compress"
gzip -f $fastq_1 $fastq_2

echo "6. copy to home and clean"
mv -f $node_scratch${bs_id}* $fastq_path
rm -rf $node_scratch

echo DONE""".replace("ARRAY_STRING", array_str).replace("NODE_STRING", node_str)
    # the placeholders are replaced textually, so both arguments must be strings
    with open(script_fn, 'w') as outf:
        outf.write(runstr)

def write_bowtie2_script_home(script_fn, array_str):
    '''
    Write the Slurm array-job script that maps paired reads with bowtie2 and keeps the unmapped pairs.

    script_fn: path of the batch script to write (overwritten if it exists).
    array_str: text substituted for ARRAY_STR on the `#SBATCH --array=` line.

    The generated script expects five positional arguments: idx_path, idx_name, fastq_path,
    bowtie2_path and bs_ids_file. The sample id is the line of bs_ids_file selected by
    SLURM_ARRAY_TASK_ID. For that sample it aligns <id>_1.fastq.gz / <id>_2.fastq.gz against the
    index, writes the bowtie2 log to <id>_<idx_name>.out, converts the SAM output to BAM,
    gzips the two unmapped FASTQ files (`--un-conc`) and deletes the SAM file.
    '''
    runstr="""#!/bin/bash -l
#SBATCH --array=ARRAY_STR
#SBATCH --nodes=1
#SBATCH --ntasks=8
#SBATCH --mem=30g
#SBATCH --time=04:00:00
#SBATCH --job-name=bowtie2
#SBATCH --output=%x_%A_%a.out
#SBATCH --error=%x_%A_%a.err

modulesld
ebld
module use /software/anaconda3/envs/eb/easybuild/modules/all
ml Bowtie2
ml SAMtools

idx_path=$1
idx_name=$2
fastq_path=$3
bowtie2_path=$4
bs_ids_file=$5

# getting bs_id
bs_id=$(sed -n ${SLURM_ARRAY_TASK_ID}'{p;q}' ${bs_ids_file})

fastq_1=$fastq_path${bs_id}_1.fastq.gz
fastq_2=$fastq_path${bs_id}_2.fastq.gz
ls -lh $fastq_1
ls -lh $fastq_2

echo "1. bowtie2"
unmapped_prefix=${bowtie2_path}${bs_id}_${idx_name}_unmapped
mapped_prefix=${bowtie2_path}${bs_id}_${idx_name}_mapped
bowtie2 -x  ${idx_path}${idx_name} -p 8 -1 $fastq_1 -2 $fastq_2 --un-conc ${unmapped_prefix}.fastq -S ${mapped_prefix}.sam > ${bowtie2_path}${bs_id}_${idx_name}.out 2>&1

echo "3. SAM to BAM"
samtools view -S -b ${mapped_prefix}.sam > ${mapped_prefix}.bam

echo "4. compress"
gzip -f ${unmapped_prefix}.1.fastq 
gzip -f ${unmapped_prefix}.2.fastq
rm -f ${mapped_prefix}.sam

echo DONE""".replace(
    'ARRAY_STR', array_str)
    # textual substitution: array_str must be a string
    with open(script_fn, 'w') as outf:
        outf.write(runstr)

def write_kraken2_script(kraken2_script_fn, array_str, read_pool, min_hit_grps, confidence):
    '''
    Write the Slurm array-job script that classifies the bee-unmapped read pairs with kraken2.

    kraken2_script_fn: path of the batch script to write (overwritten if it exists).
    array_str: value of the `#SBATCH --array=` line.
    read_pool: label inserted in the output file names (READPOOL placeholder).
    min_hit_grps: value passed to `--minimum-hit-groups`, also used in the file-name tag `mhg<value>`.
    confidence: value passed to `--confidence`; with the dot removed it gives the file-name tag
        `cs<value>` (0.05 -> cs005).

    All four values may be strings or numbers: they are converted with str() before substitution.

    The generated script expects five positional arguments: kdb_path, kdb_name, bowtie2_path,
    kraken2_path and bs_ids_file. It reads <id>_bee_unmapped.{1,2}.fastq.gz from bowtie2_path and
    writes <prefix>.kraken2, <prefix>.k2report and the gzipped classified reads to kraken2_path.

    NOTE(review): the script asks kraken2 for 8 threads while the header requests a single task
    and no --cpus-per-task; confirm that the allocation matches.
    '''
    # str.replace() only accepts strings, so numeric arguments (e.g. 2 and 0.05) would raise TypeError
    array_str = str(array_str)
    read_pool = str(read_pool)
    min_hit_grps = str(min_hit_grps)
    confidence = str(confidence)
    conf_str = confidence.replace('.', '')
    kraken2_runstr="""#!/bin/bash -l
#SBATCH --array=ARRAY_STR
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --mem=400g
#SBATCH --time=02:00:00
#SBATCH --job-name=kraken2
#SBATCH --output=%x_%A_%a.out
#SBATCH --error=%x_%A_%a.err

modulesld
ebld
module use /software/anaconda3/envs/eb/easybuild/modules/all
conda activate planb
which kraken2

kdb_path=$1
kdb_name=$2
bowtie2_path=$3
kraken2_path=$4
bs_ids_file=$5

# getting bs_id
bs_id=$(sed -n ${SLURM_ARRAY_TASK_ID}'{p;q}' ${bs_ids_file})

fastq_1=${bowtie2_path}${bs_id}_bee_unmapped.1.fastq.gz
fastq_2=${bowtie2_path}${bs_id}_bee_unmapped.2.fastq.gz
ls -lh $fastq_1
ls -lh $fastq_2

out_file_id=${kraken2_path}${bs_id}_${kdb_name}_READPOOL_mhgMINHITGROUP_csCONFIDENCESTRING_sf1_rep1
echo $out_file_id

echo '1. kraken2'
kraken2 --threads 8 --db ${kdb_path}${kdb_name} --paired --report-minimizer-data --minimum-hit-groups MINHITGROUP --confidence CONFIDENCESCORE --gzip-compressed --classified-out ${out_file_id}_cseqs#.fq --output ${out_file_id}.kraken2 --report ${out_file_id}.k2report ${fastq_1} ${fastq_2}

echo '2. compress mapped reads fastq'
gzip -f ${out_file_id}_cseqs_1.fq
gzip -f ${out_file_id}_cseqs_2.fq

echo DONE""".replace('ARRAY_STR', array_str).replace('READPOOL', read_pool).replace('MINHITGROUP', min_hit_grps).replace('CONFIDENCESTRING', conf_str).replace('CONFIDENCESCORE', confidence)
    with open(kraken2_script_fn, 'w') as outf:
        outf.write(kraken2_runstr)
        
def get_status_job_array_ids(status='RUNNING', job_name='pp'):
    '''
    Return the array task ids of the Slurm jobs named `job_name` that are in state `status`.

    status: job state passed to `squeue -t` (e.g. 'RUNNING', 'PENDING').
    job_name: job name passed to `squeue -n`.

    Returns a list of ints, in the order reported by squeue. Entries are parsed from job ids of
    the form `<jobid>_<task>` (one task) or `<jobid>_[<spec>]` (grouped tasks, typically pending),
    where <spec> is a comma-separated list of ids and ranges such as `4-6,9`, optionally with a
    `:step` on a range and a trailing `%limit`. Job ids without an underscore (non-array jobs)
    carry no task id and are skipped.
    '''
    command = ['squeue', '-t', status, '-n', job_name, '-o', '%.18i']
    result = subprocess.run(command, capture_output=True, text=True)
    job_array_ids = []
    # the first whitespace-separated token is the column header, the following ones are job ids
    for job_str in result.stdout.split()[1:]:
        _, underscore, task_spec = job_str.partition('_')
        if not underscore:
            # plain (non-array) job id: nothing to report
            continue
        if not task_spec.startswith('['):
            job_array_ids.append(int(task_spec))
            continue
        # to handle pending jobs: drop the brackets and the optional %limit
        # (the closing bracket is still there when no limit was given)
        grouped_spec = task_spec[1:].split('%')[0].rstrip(']')
        for js in grouped_spec.split(','):
            id_range, _, step = js.partition(':')
            first, dash, last = id_range.partition('-')
            if not first.isdigit():
                # tolerate non-numeric pieces (e.g. a truncation marker) instead of crashing
                continue
            if dash:
                job_array_ids.extend(range(int(first), int(last) + 1, int(step) if step else 1))
            else:
                job_array_ids.append(int(first))
    return job_array_ids
    
def write_array_str(array_ids_to_run, parallel_job_nr=100):
    '''
    Build a compact Slurm `--array` specification from a list of task ids.

    Consecutive ids are collapsed into `first-last` ranges so that the SBATCH argument stays
    short, e.g. [1, 2, 3, 5] -> '1-3,5%100'.

    array_ids_to_run: iterable of integer task ids; it is sorted and de-duplicated first.
    parallel_job_nr: maximum number of simultaneously running tasks (the `%N` suffix).

    Returns the specification string. A single id is returned without the `%N` suffix.
    Raises ValueError when no id is given.
    '''
    task_ids = sorted(set(array_ids_to_run))
    if not task_ids:
        raise ValueError('array_ids_to_run is empty: there is no array task to run')
    if len(task_ids) == 1:
        return str(task_ids[0])

    array_str_list = []
    start = prev = task_ids[0]
    for curr in task_ids[1:]:
        if curr != prev + 1:
            # gap found: close the run [start, prev] and open a new one at curr
            array_str_list.append(str(start) if start == prev else '{}-{}'.format(start, prev))
            start = curr
        prev = curr
    # close the last run; here prev is the last id, so no off-by-one correction is needed
    array_str_list.append(str(start) if start == prev else '{}-{}'.format(start, prev))

    return '{}%{}'.format(','.join(array_str_list), parallel_job_nr)

# def write_bowtie2_script(script_fn, array_str):
#     runstr="""#!/bin/bash -l
# #SBATCH --array={}
# #SBATCH --nodes=1
# #SBATCH --ntasks=8
# #SBATCH --mem=30g
# #SBATCH --time=04:00:00
# #SBATCH --job-name=bowtie2
# #SBATCH --output=%x_%A_%a.out
# #SBATCH --error=%x_%A_%a.err
# 
# modulesldv
# ebld
# module use /xsoftware/anaconda3/envs/eb/easybuild/modules/all
# conda activate planb
# ml Bowtie2
# which bowtie2 
# 
# idx_path=$1
# idx_name=$2
# input_path=$3
# output_path=$4
# 
# cd $output_path
# 
# # padding zeros
# printf -v j "%04d" $SLURM_ARRAY_TASK_ID
# sample_id=BS18-${{j}}
# 
# reads_1=$(ls ${{input_path}}${{sample_id}}*_R1.fastq.gz)
# reads_2=$(ls ${{input_path}}${{sample_id}}*_R2.fastq.gz)
# echo $reads_1
# echo $reads_2
# 
# bowtie2 -x  ${{idx_path}}${{idx_name}} -p 8 -1 ${{reads_1}} -2 ${{reads_2}} --un-conc ${{output_path}}${{sample_id}}_${{idx_name}}_unmapped.fastq -S ${{output_path}}${{sample_id}}_${{idx_name}}_mapped.sam > ${{output_path}}${{sample_id}}_${{idx_name}}.out 2>&1""".format(
#     array_str)
#     with open(script_fn, 'w') as outf:
#         outf.write(runstr)
# 
# 
# def write_kraken2_script(kraken2_script_fn, array_str, gzip, read_pool, min_hit_grps, confidence):
#     conf_str = str(confidence).replace('.', '')
#     kraken2_runstr="""#!/bin/bash -l
# #SBATCH --array={}
# #SBATCH --nodes=1
# #SBATCH --ntasks=1
# #SBATCH --mem=400g
# #SBATCH --time=02:00:00
# #SBATCH --job-name=kraken2
# #SBATCH --output=%x_%A_%a.out
# #SBATCH --error=%x_%A_%a.err
# 
# modulesld
# ebld
# module use /software/anaconda3/envs/eb/easybuild/modules/all
# conda activate planb
# which kraken2
# which bracken
# 
# kdb_path=$1
# kdb_name=$2
# input_path=$3
# output_path=$4
# read_1_suffix=$5
# read_2_suffix=$6
# 
# # padding zeros
# printf -v j "%04d" $SLURM_ARRAY_TASK_ID
# sample_id=BS18-${{j}}
# out_file_id=${{output_path}}${{sample_id}}_${{kdb_name}}_{}_mhg{}_cs{}_sf1_rep1
# 
# reads_1=$(ls ${{input_path}}${{sample_id}}${{read_1_suffix}})
# reads_2=$(ls ${{input_path}}${{sample_id}}${{read_2_suffix}})
# # reads_1=$(ls ${{input_path}}${{sample_id}}*_R1.fastq.gz)
# # reads_2=$(ls ${{input_path}}${{sample_id}}*_R2.fastq.gz)
# echo $reads_1
# echo $reads_2
# echo $out_file_id
# 
# kraken2 --threads 8 --db ${{kdb_path}}${{kdb_name}} --paired --report-minimizer-data --minimum-hit-groups {} --confidence {} {} --unclassified-out ${{out_file_id}}_ucseqs#.fq --classified-out ${{out_file_id}}_cseqs#.fq --output ${{out_file_id}}.kraken2 --report ${{out_file_id}}.k2report ${{reads_1}} ${{reads_2}}
# """.format(array_str, read_pool, min_hit_grps, conf_str, min_hit_grps, confidence, gzip)
#     with open(kraken2_script_fn, 'w') as outf:
#         outf.write(kraken2_runstr)
# 
# def write_bracken_script(bracken_script_fn, array_str):
#     bracken_runstr="""#!/bin/bash -l
# #SBATCH --array={}
# #SBATCH --nodes=1
# #SBATCH --ntasks=1
# #SBATCH --mem=20g
# #SBATCH --time=01:00:00
# #SBATCH --job-name=bracken
# #SBATCH --output=%x_%A_%a.out
# #SBATCH --error=%x_%A_%a.err
# 
# modulesld
# ebld
# module use /software/anaconda3/envs/eb/easybuild/modules/all
# conda activate planb
# which bracken
# 
# kdb_path=$1
# kdb_name=$2
# kraken_path=$3
# bracken_path=$4
# j=$SLURM_ARRAY_TASK_ID
# 
# # padding zeros
# printf -v j "%04d" $SLURM_ARRAY_TASK_ID
# file_id=BS18-${{j}}_corent_nonbee_mhg2_cs005_sf1_rep1
# k2report_fn=${{kraken_path}}${{file_id}}.k2report
# 
# echo $k2report_fn
# 
# # species
# bracken -d ${{kdb_path}}${{kdb_name}} -i ${{k2report_fn}} -r 150 -l S -t 10 -o ${{bracken_path}}${{file_id}}_S.bracken -w ${{bracken_path}}${{file_id}}_S.breport
# 
# # genus
# bracken -d ${{kdb_path}}${{kdb_name}} -i ${{k2report_fn}} -r 150 -l G -t 10 -o ${{bracken_path}}${{file_id}}_G.bracken -w ${{bracken_path}}${{file_id}}_G.breport""".format(array_str)
#     with open(bracken_script_fn, 'w') as outf:
#         outf.write(bracken_runstr)

## other

In [None]:
def parse_kreport(file_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r):
    '''
    Read the counts on the first two lines of a sample's .k2report file.

    The file name is built from the arguments (dots are removed from `sf`). A message is printed
    when the file does not exist; opening it then fails as usual.
    Returns (ucseqs_nr, cseqs_nr): the second whitespace-separated field of line 1 and of line 2,
    which the callers use as unclassified and classified read numbers.
    '''
    sf_tag = str(sf).replace('.', '')
    kreport_fn = f'{file_path}BS18-{bs_id_str}_{krakdb}_{readpool}_mhg{mhg}_cs{cs}_sf{sf_tag}_rep{r}.k2report'
    if not os.path.exists(kreport_fn):
        print(f'missing {kreport_fn}')
    with open(kreport_fn, 'r') as report:
        first_two_lines = [report.readline(), report.readline()]
    ucseqs_nr, cseqs_nr = (int(report_line.split()[1]) for report_line in first_two_lines)
    return ucseqs_nr, cseqs_nr

def parse_breport(file_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level, taxa):
    '''
    Return the read numbers of the requested taxa from a sample's .breport file.

    The report is read as a headerless tab-separated table; column 5 holds the taxon names
    (their leading whitespace is stripped) and column 1 the values that are returned.
    taxa: collection of taxon names to look up.
    Returns a dict {taxon name: value of column 1} limited to the names found in `taxa`.
    '''
    sf_tag = str(sf).replace('.', '')
    breport_fn = f'{file_path}BS18-{bs_id_str}_{krakdb}_{readpool}_mhg{mhg}_cs{cs}_sf{sf_tag}_rep{r}_{level}.breport'
    if not os.path.exists(breport_fn):
        print(f'missing {breport_fn}')
    breport_df = pd.read_csv(breport_fn, header=None, sep='\t')
    # names are indented in the report: remove the leading whitespace before matching
    breport_df[5] = [taxon_name.lstrip() for taxon_name in breport_df[5].to_list()]
    is_requested = breport_df[5].isin(taxa)
    requested_rows = breport_df[is_requested]
    return requested_rows.set_index(5)[1].to_dict()


def get_total_read_nr(bowtie2_path, bs_id_str):
    '''Return the integer that starts the first line of the sample's `BS18-<id>_bee.out` file.'''
    bowtie2_log_fn = f'{bowtie2_path}BS18-{bs_id_str}_bee.out'
    with open(bowtie2_log_fn, 'r') as bowtie2_log:
        first_line = bowtie2_log.readline()
    return int(first_line.split()[0])
    
def get_classified_read_nr(bowtie2_path, kraken_path, bracken_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level):
    '''
    Number of reads that ended up classified.

    Computed as the total read number (first line of the bowtie2 log) scaled by the sampling
    fraction `sf`, minus the reads left unclassified by kraken2, minus the reads classified by
    kraken2 that are absent from the Bracken report (kraken2 classified count minus Bracken 'root').
    Scaling the total by `sf` mimics subsampling the reads before the bowtie2 mapping.
    '''
    total_read_nr = get_total_read_nr(bowtie2_path, bs_id_str)
    kraken_counts = parse_kreport(kraken_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r)
    kraken_ucread_nr = kraken_counts[0]
    kraken_cread_nr = kraken_counts[1]
    root_counts = parse_breport(bracken_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level, ['root'])
    # reads that kraken2 classified but that Bracken does not report under 'root'
    bracken_ucread_nr = kraken_cread_nr - root_counts['root']
    return sf * total_read_nr - kraken_ucread_nr - bracken_ucread_nr
    
def get_bracken_df(bowtie2_path, kraken_path, bracken_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level, taxon_filter):
    '''
    Load a sample's .bracken table and add abundance columns.

    Added columns:
    - fraction_total_reads: new_est_reads divided by the number of classified reads;
    - ratio_honeybee: new_est_reads divided by the honey bee read number;
    - log_ratio_honeybee: natural log of that ratio.
    The honey bee read number is the number of classified reads minus the new_est_reads of every
    row whose name is not 'Apis mellifera'.

    taxon_filter: when truthy, only the rows whose taxonomy_id is in it are returned;
        otherwise the full table is returned.
    '''
    sf_tag = str(sf).replace('.', '')
    bracken_fn = f'{bracken_path}BS18-{bs_id_str}_{krakdb}_{readpool}_mhg{mhg}_cs{cs}_sf{sf_tag}_rep{r}_{level}.bracken'
    if not os.path.exists(bracken_fn):
        print(f'missing {bracken_fn}')
    bracken_df = pd.read_csv(bracken_fn, sep='\t')

    # denominator of the relative abundances: reads classified by bowtie2 or by kraken2 + bracken
    classified_read_nr = get_classified_read_nr(bowtie2_path, kraken_path, bracken_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level)

    # honey bee reads = classified reads - reads assigned to anything else
    not_honeybee = bracken_df['name'] != 'Apis mellifera'
    honeybee_read_nr = classified_read_nr - bracken_df[not_honeybee]['new_est_reads'].sum()

    est_reads = bracken_df['new_est_reads'].to_numpy()
    bracken_df['fraction_total_reads'] = est_reads / classified_read_nr
    bracken_df['ratio_honeybee'] = est_reads / honeybee_read_nr
    bracken_df['log_ratio_honeybee'] = np.log(est_reads / honeybee_read_nr)

    # sanity check: the log ratio of absolute abundances equals the log ratio of relative abundances
    log_ratio_absolute = np.round(bracken_df['log_ratio_honeybee'].to_numpy(), 10)
    honeybee_fraction = honeybee_read_nr / classified_read_nr
    log_ratio_relative = np.round(np.log(bracken_df['fraction_total_reads'].to_numpy() / honeybee_fraction), 10)
    assert np.all(log_ratio_absolute == log_ratio_relative)

    if not taxon_filter:
        return bracken_df
    return bracken_df[bracken_df['taxonomy_id'].isin(taxon_filter)]
        

def get_beexact_species_name2taxid():
    '''
    Download the BEExact full-length reference taxonomy and translate its species names to NCBI taxids.

    Every non-empty line of the file is expected to hold a tab-separated lineage in its second
    field, written as `level__taxon` entries separated by ';' with level in d, p, c, o, f, g, s.
    Underscores in taxon names are replaced by spaces.

    Returns the result of `ncbi.get_name_translator` for the species-level names.
    Raises requests.HTTPError when the download fails and AssertionError on an unknown level.
    '''
    url = "https://raw.githubusercontent.com/bdaisley/BEExact/master/full_database/BEEx_v2023.01.30___FL-refs_taxonomy.txt"
    # NOTE(review): verify=False turns off TLS certificate verification for this request; kept
    # because the notebook disables certificate checks on purpose, but confirm it is still needed.
    response = requests.get(url, verify=False, timeout=60)
    # fail on an HTTP error instead of trying to parse the error page as a taxonomy
    response.raise_for_status()

    level2beexact_taxa = {level: set() for level in ['d', 'p', 'c', 'o', 'f', 'g', 's']}

    for line in response.text.split('\n'):
        if not line.strip():
            continue
        for s in line.rstrip(';').split('\t')[1].split(';'):
            level, taxon = s.split('__')
            # a string message: print() would return None and leave the AssertionError empty
            assert level in level2beexact_taxa, 'missing {}'.format(level)
            level2beexact_taxa[level].add(taxon.replace('_', ' '))

    return ncbi.get_name_translator(level2beexact_taxa['s'])


def bray_curtis_dissimilarity(sample1, sample2):
    '''
    Bray-Curtis dissimilarity between two abundance vectors of equal length:
    sum(|sample1 - sample2|) / sum(sample1 + sample2).
    '''
    # work on numpy arrays so that the arithmetic is element-wise
    abundances_1, abundances_2 = np.array(sample1), np.array(sample2)
    absolute_difference = np.abs(abundances_1 - abundances_2).sum()
    total_abundance = (abundances_1 + abundances_2).sum()
    return absolute_difference / total_abundance

# Rarefaction and LoD

In [None]:
# directories used by the rarefaction analysis (all paths end with '/')
kdb_path = '{}krakdb/'.format(working_path)
tmp_path = '{}tmp/rarefaction/'.format(working_path)
bowtie2_path = '{}bowtie2/'.format(tmp_path)
kraken_path = '{}kraken/'.format(tmp_path)
subsample_path = '{}subsamples/'.format(tmp_path)

# load the pickled collection of bacteria taxids
# NOTE(review): pickle.load executes arbitrary code from the file; only load files you produced yourself
with open('{}tmp/bacteria_taxids.pkl'.format(working_path), 'rb') as inf:
    bacteria_taxids = pickle.load(inf)
# cell output: number of taxids loaded
len(bacteria_taxids)

## Remove most honey bee sequences with bowtie2

In [None]:
# array task ids of the samples to map against the bee index
array_str = '62,105,155'
bowtie2_script = '{}bowtie2_bee.run'.format(script_path)
# NOTE(review): `write_bowtie2_script` only exists as commented-out code in the slurm section, so
# this call raises NameError on a fresh kernel. The active variant is `write_bowtie2_script_home`,
# whose generated script expects a fifth argument (bs_ids_file) — confirm which one is intended.
write_bowtie2_script(bowtie2_script, array_str)

In [None]:
# arguments handed to the bowtie2 batch script by the next (bash) cell
idx_path = '{}bee_bt2idx/'.format(working_path)
idx_name = 'bee'
input_path = '{}SwissFastQSample/'.format(working_path)
output_path = '{}tmp/rarefaction/bowtie2/'.format(working_path)

In [None]:
%%bash -s "$bowtie2_script" "$idx_path" "$idx_name" "$input_path" "$output_path"
# $1 = batch script; $2..$5 = idx_path, idx_name, input_path, output_path (forwarded to the script)
# submit from the output directory ($5)
cd $5
sbatch $1 $2 $3 $4 $5

## kraken2 precise and default mode

In [None]:
# BeeStrong ids (same samples as those looked at with Pavian)
# no whitespace: the string is pasted as is after `#SBATCH --array=`
bs_ids_str = '27,62,105,155'

In [None]:
# precise mode
# NOTE(review): this call passes six arguments (including a gzip flag), which matches the
# commented-out variant of write_kraken2_script; the active definition takes five
# (kraken2_script_fn, array_str, read_pool, min_hit_grps, confidence) and raises TypeError here.
kraken2_script = '{}kraken2.run'.format(script_path)
write_kraken2_script(kraken2_script, bs_ids_str, '--gzip-compressed', 'all', '2', '0.05')

# NOTE(review): kdb_name is assigned twice, only the second value ('corent_krakdb') is kept —
# confirm which database name is intended.
kdb_name = 'corent'
kdb_name = 'corent_krakdb'
input_path = '{}SwissFastQSample/'.format(working_path)
output_path = kraken_path

In [None]:
%%bash -s "$kraken2_script" "$kdb_path" "$kdb_name" "$input_path" "$output_path"
# $1 = batch script; $2..$5 = kdb_path, kdb_name, input_path, output_path (forwarded to the script)
# submit from the output directory ($5)
cd $5
sbatch $1 $2 $3 $4 $5

In [None]:
# BeeStrong ids (same samples as those looked at with Pavian), used as the Slurm array specification
bs_ids_str = '27,62,105,155'

In [None]:
# with nonbee read pool
# NOTE(review): six arguments are passed (the third one is the empty gzip flag of the commented-out
# variant) while the active write_kraken2_script takes five — this call raises TypeError as is.
kraken2_script = '{}kraken2.run'.format(script_path)
write_kraken2_script(kraken2_script, bs_ids_str, '', 'nonbee', '2', '0.05')

# arguments handed to the batch script by the next (bash) cell
kdb_name = 'corent'
input_path = bowtie2_path
output_path = kraken_path
read_1_suffix = '_bee_unmapped.1.fastq'
read_2_suffix = '_bee_unmapped.2.fastq'

In [None]:
%%bash -s "$kraken2_script" "$kdb_path" "$kdb_name" "$input_path" "$output_path" "$read_1_suffix" "$read_2_suffix"
# $1 = batch script; $2..$7 = kdb_path, kdb_name, input_path, output_path, read_1_suffix, read_2_suffix
# submit from the output directory ($5)
cd $5
sbatch $1 $2 $3 $4 $5 $6 $7

## subsampling kraken2

In [None]:
# def subsample_kraken2(input_path, bs_id_str, mhg, cs, subsampling_fractions, replicate_nr, output_path):
#     with open('{}BS18-{}_mhg{}_cs{}.kraken2'.format(input_path, bs_id_str, mhg, str(cs).replace('.', '')), 'r') as inf:
#         lines = inf.readlines()
# 
#     for sf in tqdm(subsampling_fractions):
#         line_nr = int(sf * len(lines))
#         for r in range(1, replicate_nr + 1):
#             with open('{}BS18-{}_mhg{}_cs{}_sf{}_rep{}.kraken2'.format(output_path, bs_id_str, mhg, str(cs).replace('.', ''), str(sf).replace('.', ''), r), 'w') as outf:
#                 for l in random.sample(lines, line_nr):
#                     outf.write(l)
# 
# randomly subsample .kraken2 files (each line correspond to a paired read)
# 
# 
# bs_ids_str = '27,62,105,155'
# bs_ids = [int(x) for x in bs_ids_str.split(',')]
# print(bs_ids)
# 
# # subsample precise kraken outputs
# mhg = 2
# cs = '005'
# subsampling_fractions = [round(x, 2) for x in np.arange(0.1, 1, 0.1)]
# replicate_nr = 10
# 
# for bs_id in bs_ids:
#     bs_id_str = '%04d' % bs_id
#     print(bs_id_str)
#     subsample_kraken2(kraken_path, bs_id_str, mhg, cs, subsampling_fractions, replicate_nr, subsample_path)
# 
# # subsample default kraken outputs
# mhg = 2
# cs = '00'
# subsampling_fractions = [round(x, 2) for x in np.arange(0.1, 1, 0.1)]
# replicate_nr = 10
# 
# for bs_id in bs_ids:
#     bs_id_str = '%04d' % bs_id
#     print(bs_id_str)
#     subsample_kraken2(kraken_path, bs_id_str, mhg, cs, subsampling_fractions, replicate_nr, subsample_path)

In [None]:
def write_subsample_script(subsample_script_fn, array_str):
    '''
    Write the Slurm array-job script that runs subsample_kraken2.py once per sample.

    subsample_script_fn: path of the batch script to write (overwritten if it exists).
    array_str: inserted on the `#SBATCH --array=` line; each array task id is used directly
        as the sample id (`--bs_id`).

    The generated script expects six positional arguments: kraken_path, subsample_path, krakdb,
    readpool, mhg and cs, which are forwarded to subsample_kraken2.py.
    '''
    # str.format template: literal shell braces are doubled ({{ }}), the single {} receives array_str
    subsample_runstr="""#!/bin/bash -l
#SBATCH --array={}
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --mem=20g
#SBATCH --time=04:00:00
#SBATCH --job-name=subsample
#SBATCH --output=%x_%A_%a.out
#SBATCH --error=%x_%A_%a.err

modulesld
ebld
module use /software/anaconda3/envs/eb/easybuild/modules/all
conda activate planb

kraken_path=$1
subsample_path=$2
krakdb=$3
readpool=$4
mhg=$5
cs=$6

bs_id=$SLURM_ARRAY_TASK_ID

python ~/scripts/plan-b-omics/bin/subsample_kraken2.py --kraken_path ${{kraken_path}} --bs_id ${{bs_id}} --krakdb ${{krakdb}} --readpool ${{readpool}} --mhg ${{mhg}} --cs ${{cs}} --subsample_path ${{subsample_path}}""".format(array_str)
    with open(subsample_script_fn, 'w') as outf:
        outf.write(subsample_runstr)

In [None]:
# array specification: here the array task ids are the BeeStrong sample numbers themselves
line_nr_str = '27,62,105,155'

subsample_script = '{}subsample.run'.format(script_path)
write_subsample_script(subsample_script, line_nr_str)

# parameters identifying the kraken2 run to subsample (forwarded by the next bash cell)
krakdb = 'corent'
readpool = 'nonbee'
mhg = 2
cs = '005'

In [None]:
%%bash -s "$subsample_script" "$kraken_path" "$subsample_path" "$krakdb" "$readpool" "$mhg" "$cs" 
# $1 = batch script; $2..$7 = kraken_path, subsample_path, krakdb, readpool, mhg, cs
# submit from the subsample directory ($3)
cd $3
sbatch $1 $2 $3 $4 $5 $6 $7

In [None]:
# The original kraken2 output is the "subsample" with sampling fraction 1.0: copy it to the
# subsample directory under the name <prefix>_sf1_rep1.kraken2
bs_ids = list(map(int, bs_ids_str.split(',')))
print(bs_ids)

krakdb, readpool, mhg, cs = 'corent', 'nonbee', 2, '005'

for bs_id in bs_ids:
    bs_id_str = '{:04d}'.format(bs_id)
    file_stem = f'BS18-{bs_id_str}_{krakdb}_{readpool}_mhg{mhg}_cs{cs}'
    shutil.copy(f'{kraken_path}{file_stem}.kraken2',
                f'{subsample_path}{file_stem}_sf1_rep1.kraken2')

# --> could also include in script

## make_kreport and bracken

Create the ktaxonomy file with KrakenTools `make_ktaxonomy.py` (I am fairly sure it was already available in the database).

In [None]:
%%bash
# build mydb_taxonomy.txt in the corent_krakdb directory from its nodes.dmp, names.dmp and seqid2taxid.map
python /cluster/raid/home/f80878961/scripts/KrakenTools/make_ktaxonomy.py --nodes /cluster/raid/home/f80878961/phenotypes/corent_krakdb/nodes.dmp --names /cluster/raid/home/f80878961/phenotypes/corent_krakdb/names.dmp --seqid2taxid /cluster/raid/home/f80878961/phenotypes/corent_krakdb/seqid2taxid.map -o /cluster/raid/home/f80878961/phenotypes/corent_krakdb/mydb_taxonomy.txt 

Create the file for the array job that lists which .kraken2 subsamples to convert to a k2report and to process with Bracken.

In [None]:
# BeeStrong sample numbers, parsed to integers from the comma-separated string
bs_ids_str = '27,62,105,155'
bs_ids = [int(x) for x in bs_ids_str.split(',')]
print(bs_ids)

In [None]:
krakdb = 'corent'
readpool_mhg_cs = [('nonbee', 2, '005')]

subsampling_fractions = [round(fraction, 2) for fraction in np.arange(0.1, 1, 0.1)]
replicate_nr = 10
files_to_bracken_fn = f'{tmp_path}files_to_bracken.txt'

# One line per .kraken2 file to process (file name without directory and extension); the line
# number is later used as the Slurm array task id.
with open(files_to_bracken_fn, 'w') as outf:
    for bs_id in bs_ids:
        bs_id_str = '{:04d}'.format(bs_id)
        for readpool, mhg, cs in readpool_mhg_cs:
            file_stem = f'{subsample_path}BS18-{bs_id_str}_{krakdb}_{readpool}_mhg{mhg}_cs{cs}'

            # every replicate of every subsampling fraction ...
            kraken_fns = []
            for sf in subsampling_fractions:
                for r in range(1, replicate_nr + 1):
                    kraken_fns.append('{}_sf{}_rep{}.kraken2'.format(file_stem, str(sf).replace('.', ''), r))
            # ... followed by the 100% sampling (no replicate)
            kraken_fns.append('{}_sf1_rep1.kraken2'.format(file_stem))

            for fn in kraken_fns:
                assert os.path.exists(fn), 'missing {}'.format(fn)
                outf.write(fn.split('/')[-1].replace('.kraken2', '\n'))

In [None]:
def write_bracken_script(bracken_script_fn, array_str):
    '''
    Write the Slurm array-job script that builds a kreport from a .kraken2 file and runs Bracken on it.

    bracken_script_fn: path of the batch script to write (overwritten if it exists).
    array_str: inserted on the `#SBATCH --array=` line; each array task id is the line number
        of the file id to process in the file list.

    The generated script expects four positional arguments: kdb_path, kdb_name, file_path and
    file_list_path. For the selected file id it runs KrakenTools make_kreport.py on
    <file_path><file_id>.kraken2 (with <kdb_path><kdb_name>/mydb_taxonomy.txt) and then Bracken at
    species (S) and genus (G) level with read length 150 and threshold 10, writing
    <file_id>_S/_G .bracken and .breport files to file_path.
    '''
    # str.format template: literal shell braces are doubled ({{ }}), the single {} receives array_str.
    # The former `echo $bracken_fn` / `echo $breport` lines were dropped: these shell variables
    # were never defined, so they only printed empty lines.
    bracken_runstr="""#!/bin/bash -l
#SBATCH --array={}
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --mem=20g
#SBATCH --time=01:00:00
#SBATCH --job-name=bracken
#SBATCH --output=%x_%A_%a.out
#SBATCH --error=%x_%A_%a.err

modulesld
ebld
module use /software/anaconda3/envs/eb/easybuild/modules/all
conda activate planb
which bracken

kdb_path=$1
kdb_name=$2
file_path=$3
file_list_path=$4
j=$SLURM_ARRAY_TASK_ID

file_id=$(sed -n ${{j}}'{{p;q}}' ${{file_list_path}})
kraken_fn=${{file_path}}${{file_id}}.kraken2
ktaxonomy=${{kdb_path}}${{kdb_name}}/mydb_taxonomy.txt
kreport=${{file_path}}${{file_id}}.kreport

echo $kraken_fn
echo $ktaxonomy
echo $kreport

# create kreports from subsampled kraken files
python /cluster/raid/home/f80878961/scripts/KrakenTools/make_kreport.py -i ${{kraken_fn}} -t ${{ktaxonomy}} -o ${{kreport}}

# species
bracken -d ${{kdb_path}}${{kdb_name}} -i ${{kreport}} -r 150 -l S -t 10 -o ${{file_path}}${{file_id}}_S.bracken -w ${{file_path}}${{file_id}}_S.breport

# genus
bracken -d ${{kdb_path}}${{kdb_name}} -i ${{kreport}} -r 150 -l G -t 10 -o ${{file_path}}${{file_id}}_G.bracken -w ${{file_path}}${{file_id}}_G.breport""".format(array_str)
    with open(bracken_script_fn, 'w') as outf:
        outf.write(bracken_runstr)

In [None]:
# which subsampled kraken files to compute: one array task per line of the file list
# (the file is opened in a `with` block so that the handle is closed after counting)
with open(files_to_bracken_fn) as inf:
    num_lines = sum(1 for _ in inf)
line_nr_str = '1-{}'.format(num_lines)
# line_nr_str = '4-182'
print(line_nr_str)

In [None]:
# write the batch script with one array task per line of the file list
bracken_script = '{}bracken.run'.format(script_path)
write_bracken_script(bracken_script, line_nr_str)

# arguments handed to the batch script by the next (bash) cell
kdb_name = 'corent'
file_path = subsample_path
file_list_path = files_to_bracken_fn

start script from Gamarello

In [None]:
%%bash -s "$bracken_script" "$kdb_path" "$kdb_name" "$file_path" "$file_list_path"
# $1 = batch script; $2..$5 = kdb_path, kdb_name, file_path, file_list_path (forwarded to the script)
# submit from the directory holding the .kraken2 files ($4)
cd $4
sbatch $1 $2 $3 $4 $5

## check expected files


In [None]:
# # one off to add db and read pool params
# files = glob.glob(kraken_path + '*')
# for f in files:
#     fn = f.split('/')[-1]
#     if fn.startswith('BS18'):
#         split_fn = fn.split('_')
#         # insert corent_all
#         if split_fn[1].startswith('mhg'):
#             new_fn = '{}{}_corent_all_{}'.format(kraken_path, split_fn[0], '_'.join(split_fn[1:]))
#             os.rename(f, new_fn)

In [None]:
# check if kraken files are there for these parameter combinations
bs_ids = [27, 62, 105, 155]
krakdb = 'corent'
readpool_mhg_cs = [('all', 2, '00'),  ('all', 2, '005'), ('nonbee', 2, '005'), ('all', 4, '005')]

for bs_id, (readpool, mhg, cs) in product(bs_ids, readpool_mhg_cs):
    bs_id_str = '%04d' % bs_id
    file_prefix = '{}BS18-{}_{}_{}_mhg{}_cs{}'.format(kraken_path, bs_id_str, krakdb, readpool, mhg, cs)
    for extension in ('kraken2', 'k2report'):
        expected_fn = '{}.{}'.format(file_prefix, extension)
        # string message instead of print(): print() returns None, which left the AssertionError empty
        assert os.path.exists(expected_fn), 'missing {}'.format(expected_fn)

In [None]:
# # one off to add db and read pool params
# files = glob.glob(subsample_path + '*')
# for f in files:
#     fn = f.split('/')[-1]
#     if fn.startswith('BS18'):
#         split_fn = fn.split('_')
#         # insert corent_all
#         if split_fn[1].startswith('mhg'):
#             new_fn = '{}{}_corent_all_{}'.format(subsample_path, split_fn[0], '_'.join(split_fn[1:]))
#             os.rename(f, new_fn)

In [None]:
# check if subsample files are there
bs_ids = [27, 62, 105, 155]
krakdb = 'corent'
readpool_mhg_cs = [('all', 2, '00'),  ('all', 2, '005'), ('nonbee', 2, '005'), ('all', 4, '005')]

level = 'S'
subsampling_fractions = [round(x, 2) for x in np.arange(0.1, 1, 0.1)]
replicate_nr = 10

for bs_id, (readpool, mhg, cs) in product(bs_ids, readpool_mhg_cs): 
    bs_id_str = '%04d' % bs_id
    for sf, r in product(subsampling_fractions,  range(1, replicate_nr + 1)):
        bracken_fn = '{}BS18-{}_{}_{}_mhg{}_cs{}_sf{}_rep{}_{}.bracken'.format(subsample_path, bs_id_str, krakdb, readpool, mhg, cs, str(sf).replace('.', ''), r, level)
        # string message instead of print(): print() returns None, which left the AssertionError empty
        assert os.path.exists(bracken_fn), 'missing {}'.format(bracken_fn)

## classification overview


Load this table rather than recompute it as I have zipped the subsample directory

In [None]:
def parse_kreport(file_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r):
    """Return the two read counts found at the top of a ``.kreport`` file.

    The file name is built from the arguments; dots are removed from ``sf``
    (e.g. ``0.5`` becomes ``05``). The second whitespace-separated field of the
    first line is returned first (callers treat it as the unclassified count)
    and the same field of the second line second (treated as the classified
    count). A notice is printed when the file is absent; opening it then
    raises as usual.
    """
    sf_tag = str(sf).replace('.', '')
    # kreport instead of k2report
    kreport_fn = '{}BS18-{}_{}_{}_mhg{}_cs{}_sf{}_rep{}.kreport'.format(file_path, bs_id_str, krakdb, readpool, mhg, cs, sf_tag, r)
    if not os.path.exists(kreport_fn):
        print('missing {}'.format(kreport_fn))
    with open(kreport_fn, 'r') as handle:
        head_lines = [handle.readline(), handle.readline()]
    ucseqs_nr, cseqs_nr = (int(line.split()[1]) for line in head_lines)
    return ucseqs_nr, cseqs_nr
    
# Build the long-format classification overview (one row per sample, Kraken2
# setting and category) for the four benchmark samples.
bs_ids = [27, 62, 105, 155]
krakdb_readpool_mhg_cs2kraken_setting = {
    ('corent', 'all', 2, '00'):'Core-nt DB, all reads, Kraken2 default', 
    # ('corent', 'all', 4, '005'):'Core-nt DB, all reads, Kraken2 very-precise',
    ('corent', 'all', 2, '005'):'Core-nt DB, all reads, Kraken2 precise',
    ('corent', 'nonbee', 2, '005'):'Core-nt DB, non-bee reads, Kraken2 precise'}
sf = 1
r = 1
level = 'S'
taxa = set(('root', 'Eukaryota', 'Apis', 'Fungi', 'Bacteria', 'Archaea', 'Viruses'))

columns = ['Sample id', 'Kraken setting', 'Name', 'Read number']

rows = []
total_read_nrs = []  # per row: unclassified + classified reads of that sample/setting
for bs_id, (krakdb, readpool, mhg, cs) in product(bs_ids, krakdb_readpool_mhg_cs2kraken_setting.keys()):
    bs_id_str = '%04d' % bs_id
    setting = krakdb_readpool_mhg_cs2kraken_setting[(krakdb, readpool, mhg, cs)]
    # Each report is parsed once; the fraction denominator is kept alongside
    # the rows instead of re-reading every kreport in a second pass.
    ucseqs_nr, cseqs_nr = parse_kreport(subsample_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r)
    taxa2seqnr = parse_breport(subsample_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level, taxa)
    archaea_nr = taxa2seqnr.get('Archaea', 0)  # Archaea may be absent from the report

    name2read_nr = [
        ('Unclassified', ucseqs_nr),
        ('Kraken2 only', cseqs_nr - taxa2seqnr['root']),
        ('Other Kraken2 + Bracken', taxa2seqnr['root'] - taxa2seqnr['Eukaryota'] - taxa2seqnr['Bacteria'] - archaea_nr - taxa2seqnr['Viruses']),
        ('Viruses', taxa2seqnr['Viruses']),
        ('Archaea', archaea_nr),
        ('Bacteria', taxa2seqnr['Bacteria']),
        ('Other Eukaryota', taxa2seqnr['Eukaryota'] - taxa2seqnr['Apis'] - taxa2seqnr['Fungi']),
        ('Fungi', taxa2seqnr['Fungi']),
        ('Apis', taxa2seqnr['Apis']),
    ]
    for name, read_nr in name2read_nr:
        rows.append((bs_id_str, setting, name, read_nr))
        total_read_nrs.append(ucseqs_nr + cseqs_nr)

class_df = pd.DataFrame(data=rows, columns=columns)

# calculate fraction: reads of the category / all reads of that sample and setting
read_fraction = class_df['Read number'].to_numpy() / np.array(total_read_nrs)
class_df['Read fraction'] = read_fraction

In [None]:
# Cached copy of the classification overview table built in the previous cell.
class_table_fn = '{}classification_overview.csv'.format(working_path)

# Uncomment to refresh the cache after recomputing the table.
# class_df.to_csv(class_table_fn)

# The first CSV column is the saved row index.
# NOTE(review): read_csv infers dtypes, so zero-padded sample ids such as
# '0027' presumably come back as integers -- confirm before filtering on them.
class_df = pd.read_csv(class_table_fn, index_col=0)

# Samples shown in the plots below.
bs_ids = [27, 62, 105, 155]


In [None]:
class_df

In [None]:
# Colour per read category for the stacked bar plots: greys for reads without a
# domain-level assignment, one hue per taxonomic group otherwise.
cmap = {
    'Unclassified': '#262626',
    'Kraken2 only': '#575757',
    'Other Kraken2 + Bracken': '#949494',
    'Viruses': '#0173b2',
    'Archaea': '#cc78bc',
    'Bacteria': '#de8f05',
    'Other Eukaryota': '#12634c',
    'Fungi': '#029e73',
    'Apis': '#75c8b0'
}

In [None]:
# One stacked bar chart per sample: read numbers per category, one bar per Kraken2 setting.
for bs_id in bs_ids:
    bs_id_str = '%04d' % bs_id
    # Compare on the zero-padded string form. The freshly computed table stores
    # ids as '0027', while the CSV round trip yields integers (27); comparing
    # against the bare int silently selected nothing for the former.
    sample_mask = class_df['Sample id'].astype(str).str.zfill(4) == bs_id_str
    fig = px.bar(class_df[sample_mask], x="Kraken setting", y="Read number", color="Name",
                hover_data=['Name'], barmode = 'stack', color_discrete_map=cmap, title=bs_id_str)
    fig.update_layout(
        autosize=False,
        width=700,
        height=700,
    )
    fig.show()

### explore colors

In [None]:
sns.color_palette('colorblind').as_hex()

In [None]:
print(sns.color_palette('colorblind').as_hex())

In [None]:
light_palette = sns.light_palette('#029e73', n_colors=9)
dark_palette = sns.dark_palette('#029e73', n_colors=9)

In [None]:
dark_palette.as_hex()

In [None]:
print(dark_palette.as_hex())

In [None]:
light_palette

In [None]:
print(light_palette.as_hex())

In [None]:
print(sns.color_palette('colorblind').as_hex())


In [None]:
# Category -> colour mapping (same entries as the mapping defined before the
# colour exploration cells); greys mark reads without a domain-level assignment.
cmap = {
    'Unclassified': '#262626',
    'Kraken2 only': '#575757',
    'Other Kraken2 + Bracken': '#949494',
    'Viruses': '#0173b2',
    'Archaea': '#cc78bc',
    'Bacteria': '#de8f05',
    'Other Eukaryota': '#12634c',
    'Fungi': '#029e73',
    'Apis': '#75c8b0'
}

In [None]:
# NOTE(review): `df` is not assigned in the cells directly above, so this line
# relies on whatever `df` is left in the kernel -- confirm which table is meant.
# Bar colours are taken from `cmap` in insertion order.
df.plot(kind='bar', stacked=True, color=cmap.values())

### plotly [old]

In [None]:
# `go` was only imported in a later cell, so this cell failed on a fresh kernel.
import plotly.graph_objects as go

# for the plotly approach
#dfs = []
#for (mhg, cs) in mhg_cs2kraken_setting.keys():
#    columns = ['Unclassified', 'Kraken2 only', 'Kraken2 + Bracken']
#    rows = []
#    for bs_id_str in bs_ids_str:
#        ucseqs_nr, cseqs_nr = parse_kreport(subsample_path, bs_id_str, mhg, cs, sf, r)
#        taxa2seqnr = parse_breport(subsample_path, bs_id_str, mhg, cs, sf, r, level, taxa)
#        rows.append((ucseqs_nr, cseqs_nr - taxa2seqnr['root'], taxa2seqnr['root']))
#    dfs.append(pd.DataFrame(index=bs_ids_str, data=rows, columns=columns))

#class_df = pd.concat(dfs, axis=1, keys=list(mhg_cs2kraken_setting.values()))

# NOTE(review): the code below indexes `df` by the top-level column keys
# "default" / "precise", i.e. it expects the wide two-level table sketched in
# the commented block above, not the long-format class_df -- confirm before reuse.
df = class_df

# Create a figure with the right layout
fig = go.Figure(
    layout=go.Layout(
        height=600,
        width=1000,
        barmode="relative",
        yaxis_showticklabels=False,
        yaxis_showgrid=False,
        # leave 50% headroom above the tallest stacked bar
        yaxis_range=[0, df.groupby(axis=1, level=0).sum().max().max() * 1.5],
       # Secondary y-axis overlayed on the primary one and not visible
        yaxis2=go.layout.YAxis(
            visible=False,
            matches="y",
            overlaying="y",
            anchor="x",
        ),
        font=dict(size=24),
        legend_x=0,
        legend_y=1,
        legend_orientation="h",
        hovermode="x",
        margin=dict(b=0,t=10,l=0,r=10)
    )
)
# Colours per (setting, read category) pair
colors = {
    "default": {
        "Unclassified": "#F28F1D",
        "Kraken2 only": "#F6C619",
        "Kraken2 + Bracken": "#FADD75",
    },
    "precise": {
        "Unclassified": "#F28F1D",
        "Kraken2 only": "#F6C619",
        "Kraken2 + Bracken": "#FADD75",
    }
}
# Add the traces
for i, t in enumerate(colors):
    for j, col in enumerate(df[t].columns):
        # skip categories that are empty for every sample
        if (df[t][col] == 0).all():
            continue
        fig.add_bar(
            x=df.index,
            y=df[t][col],
            # Set the right yaxis depending on the selected setting (from enumerate)
            yaxis=f"y{i + 1}",
            # Offset the bar trace, offset needs to match the width
            # For categorical traces, each category is spaced by 1
            offsetgroup=str(i),
            offset=(i - 1) * 1/3,
            width=1/3,
            legendgroup=t,
            legendgrouptitle_text=t,
            name=col,
            marker_color=colors[t][col],
            marker_line=dict(width=2, color="#333"),
            hovertemplate="%{y}<extra></extra>"
        )

fig.show()

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Toy table for trying out the grouped/stacked bar layout: rows are indexed by
# US state names, columns are two-level [product, revenue].
index = ["California", "Texas", "Arizona", "Nevada", "Louisiana"]
revenue_columns = ["Revenue1", "Revenue2", "Revenue3"]

# uniform values in [0.25, 1.5)
product1_df = pd.DataFrame(np.random.rand(5, 3) * 1.25 + 0.25, index=index, columns=revenue_columns)
# uniform values in [0.5, 1.5)
product2_df = pd.DataFrame(np.random.rand(5, 3) + 0.5, index=index, columns=revenue_columns)

df = pd.concat([product1_df, product2_df], axis=1, keys=["Product1", "Product2"])

## Richness curves

In [None]:
def parse_kreport(file_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r):
    """Return the two read counts found at the top of a ``.kreport`` file.

    The file name is built from the arguments; dots are removed from ``sf``
    (e.g. ``0.5`` becomes ``05``). The second whitespace-separated field of the
    first line is returned first (callers treat it as the unclassified count)
    and the same field of the second line second (treated as the classified
    count). A notice is printed when the file is absent; opening it then
    raises as usual.
    """
    sf_tag = str(sf).replace('.', '')
    # kreport instead of k2report
    kreport_fn = '{}BS18-{}_{}_{}_mhg{}_cs{}_sf{}_rep{}.kreport'.format(file_path, bs_id_str, krakdb, readpool, mhg, cs, sf_tag, r)
    if not os.path.exists(kreport_fn):
        print('missing {}'.format(kreport_fn))
    with open(kreport_fn, 'r') as handle:
        head_lines = [handle.readline(), handle.readline()]
    ucseqs_nr, cseqs_nr = (int(line.split()[1]) for line in head_lines)
    return ucseqs_nr, cseqs_nr

    
def _compute_species_richness_per_lod(bowtie2_path, subsample_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level, taxon_filter, lods, rows):
    """Append one richness record per limit of detection (LoD) to ``rows``.

    The Bracken table for the given sample/setting/subsample is obtained from
    ``get_bracken_df`` (``subsample_path`` is passed for both of its path
    arguments after ``bowtie2_path``). Richness is the number of table rows
    whose ``fraction_total_reads`` is at least the LoD. Each appended tuple is
    (sample name, Kraken setting label, sf, r, level, lod, richness); the
    setting label is looked up in the module-level
    ``krakdb_readpool_mhg_cs2kraken_setting``.

    Returns the Bracken DataFrame that was parsed.
    """
    bact_df = get_bracken_df(bowtie2_path, subsample_path, subsample_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level, taxon_filter)

    rows.extend(
        (
            'BS18-{}'.format(bs_id_str),
            krakdb_readpool_mhg_cs2kraken_setting[(krakdb, readpool, mhg, cs)],
            sf,
            r,
            level,
            threshold,
            len(bact_df[bact_df['fraction_total_reads'] >= threshold]),
        )
        for threshold in lods
    )

    return bact_df

### Species level

In [None]:
# Rarefaction data at species level: richness per sample, Kraken2 setting,
# subsampling fraction, replicate and limit of detection (LoD).
level = 'S'
bs_ids = [27, 62, 105, 155]
krakdb_readpool_mhg_cs2kraken_setting = {
    ('corent', 'all', 2, '00'):'Core-nt DB, all reads, Kraken2 default', 
    # ('corent', 'all', 4, '005'):'Core-nt DB, all reads, Kraken2 very-precise',
    ('corent', 'all', 2, '005'):'Core-nt DB, all reads, Kraken2 precise',
    ('corent', 'nonbee', 2, '005'):'Core-nt DB, non-bee reads, Kraken2 precise'}

subsampling_fractions = [round(x, 2) for x in np.arange(0.1, 1, 0.1)]
replicate_nr = 10

#lods = [5e-05, 4e-05, 3e-05, 2e-05, 2.5e-05, 1e-05]
# lods = [0.00001, 0.00002, 0.00003, 0.00004, 0.00005]
# lods = [0.000001, 0.000009, 0.00001, 0.000011, 0.000015, 0.000019, 0.00002]
lods = [0.000001, 0.000005, 0.00001, 0.00005, 0.0001]

rows = []
for bs_id, (krakdb, readpool, mhg, cs) in product(bs_ids, krakdb_readpool_mhg_cs2kraken_setting.keys()):
    bs_id_str = '%04d' % bs_id
    # Materialise the combinations so tqdm knows the total; a bare product()
    # iterator has no length and the bar shows neither percentage nor ETA.
    sf_rep_combinations = list(product(subsampling_fractions, range(1, replicate_nr + 1)))
    for sf, r in tqdm(sf_rep_combinations):
        bact_df = _compute_species_richness_per_lod(bowtie2_path, subsample_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level, bacteria_taxids, lods, rows)

    # 100% sampling (no replicate)
    bact_df = _compute_species_richness_per_lod(bowtie2_path, subsample_path, bs_id_str, krakdb, readpool, mhg, cs, 1, 1, level, bacteria_taxids, lods, rows)

rcurve_df = pd.DataFrame(data=rows, columns=['Sample id', 'Kraken setting', 'Sampling fraction', 'Replicate nr', 'Level', 'LoD', 'Richness'])

In [None]:
# Cached species-level rarefaction table.
rcurve_fn = '{}rarefication_curves_S.csv'.format(working_path)

# Uncomment to refresh the cache after recomputing the table.
# rcurve_df.to_csv(rcurve_fn)

# The first CSV column is the saved row index.
rcurve_df = pd.read_csv(rcurve_fn, index_col=0)

# Samples shown in the plots below.
bs_ids = [27, 62, 105, 155]

In [None]:
cmap = dict(zip(lods, sns.color_palette('colorblind', len(lods))))

In [None]:
# Keep only the two "precise" Kraken2 settings (all reads vs. non-bee reads).
precise_settings = {
    'Core-nt DB, all reads, Kraken2 precise',
    'Core-nt DB, non-bee reads, Kraken2 precise',
}
is_precise = rcurve_df['Kraken setting'].isin(precise_settings)
f_rcurve_df = rcurve_df[is_precise]

Unfair comparison of LoD when removing bee reads!
To think about --> tune the total-read function; think again about when to calculate relative abundance (before or after removing bee reads?)

In [None]:
# Rarefaction curves per sample (log-scaled richness), all Kraken2 settings.
for bs_id in bs_ids:
    bs_id_str = '%04d' % bs_id
    # Select the sample of this iteration; the filter used to be hard-coded to
    # 'BS18-0027', so every figure showed the same data under a different title.
    sample_rcurve_df = rcurve_df[rcurve_df['Sample id'] == 'BS18-{}'.format(bs_id_str)]
    sns.lineplot(data=sample_rcurve_df, x='Sampling fraction', y='Richness', hue='LoD', style='Kraken setting', palette=cmap)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.title(bs_id_str)
    plt.yscale('log')
    plt.show()

In [None]:
# Rarefaction curves per sample, restricted to the "precise" settings.
for bs_id in bs_ids:
    bs_id_str = '%04d' % bs_id
    sample_name = 'BS18-{}'.format(bs_id_str)
    sample_rcurve_df = f_rcurve_df[f_rcurve_df['Sample id'] == sample_name]
    sns.lineplot(
        data=sample_rcurve_df,
        x='Sampling fraction',
        y='Richness',
        hue='LoD',
        style='Kraken setting',
        palette=cmap,
    )
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.title(bs_id_str)
    plt.show()

In [None]:
# One colour per sample, then one figure per LoD comparing the samples
# (non-bee reads, precise setting only).
sample_names = ['BS18-%04d' % bs_id for bs_id in bs_ids]
cmap = dict(zip(sample_names, sns.color_palette('colorblind', len(bs_ids))))
for lod in lods:
    is_nonbee_precise = rcurve_df['Kraken setting'] == 'Core-nt DB, non-bee reads, Kraken2 precise'
    at_lod = rcurve_df['LoD'] == lod
    sns.lineplot(data=rcurve_df[is_nonbee_precise & at_lod], x='Sampling fraction', y='Richness', hue='Sample id', palette=cmap)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.title(lod)
    plt.show()

### Genera level

In [None]:
# Rarefaction data at genus level: richness per sample, Kraken2 setting,
# subsampling fraction, replicate and limit of detection (LoD).
level = 'G'
bs_ids = [27, 62, 105, 155]
krakdb_readpool_mhg_cs2kraken_setting = {
    ('corent', 'all', 2, '00'):'Core-nt DB, all reads, Kraken2 default', 
    ('corent', 'all', 4, '005'):'Core-nt DB, all reads, Kraken2 very-precise',
    ('corent', 'all', 2, '005'):'Core-nt DB, all reads, Kraken2 precise',
    ('corent', 'nonbee', 2, '005'):'Core-nt DB, non-bee reads, Kraken2 precise'}

subsampling_fractions = [round(x, 2) for x in np.arange(0.1, 1, 0.1)]
replicate_nr = 10

#lods = [5e-05, 4e-05, 3e-05, 2e-05, 2.5e-05, 1e-05]
# lods = [0.00001, 0.00002, 0.00003, 0.00004, 0.00005]
# lods = [0.000001, 0.000009, 0.00001, 0.000011, 0.000015, 0.000019, 0.00002]
lods = [0.000001, 0.000005, 0.00001, 0.00005, 0.0001]

rows = []
for bs_id, (krakdb, readpool, mhg, cs) in product(bs_ids, krakdb_readpool_mhg_cs2kraken_setting.keys()):
    bs_id_str = '%04d' % bs_id
    # Materialise the combinations so tqdm knows the total; a bare product()
    # iterator has no length and the bar shows neither percentage nor ETA.
    sf_rep_combinations = list(product(subsampling_fractions, range(1, replicate_nr + 1)))
    for sf, r in tqdm(sf_rep_combinations):
        bact_df = _compute_species_richness_per_lod(bowtie2_path, subsample_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level, bacteria_taxids, lods, rows)

    # 100% sampling (no replicate)
    bact_df = _compute_species_richness_per_lod(bowtie2_path, subsample_path, bs_id_str, krakdb, readpool, mhg, cs, 1, 1, level, bacteria_taxids, lods, rows)

rcurve_df = pd.DataFrame(data=rows, columns=['Sample id', 'Kraken setting', 'Sampling fraction', 'Replicate nr', 'Level', 'LoD', 'Richness'])

In [None]:
# Cached genus-level rarefaction table.
rcurve_fn = '{}rarefication_curves_G.csv'.format(working_path)

# Uncomment to refresh the cache after recomputing the table.
# rcurve_df.to_csv(rcurve_fn)

# The first CSV column is the saved row index.
rcurve_df = pd.read_csv(rcurve_fn, index_col=0)

# Samples shown in the plots below.
bs_ids = [27, 62, 105, 155]

In [None]:
# One colour per LoD; genus-level curves per sample for the "precise" settings.
lod_colors = sns.color_palette('colorblind', len(lods))
cmap = dict(zip(lods, lod_colors))
precise_settings = {
    'Core-nt DB, all reads, Kraken2 precise',
    'Core-nt DB, non-bee reads, Kraken2 precise',
}
f_rcurve_df = rcurve_df[rcurve_df['Kraken setting'].isin(precise_settings)]
for bs_id in bs_ids:
    bs_id_str = '%04d' % bs_id
    sample_rcurve_df = f_rcurve_df[f_rcurve_df['Sample id'] == 'BS18-{}'.format(bs_id_str)]
    sns.lineplot(data=sample_rcurve_df, x='Sampling fraction', y='Richness', hue='LoD', style='Kraken setting', palette=cmap)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.title(bs_id_str)
    plt.show()

In [None]:
# One colour per sample, then one figure per LoD comparing the samples
# (non-bee reads, precise setting only).
sample_names = ['BS18-%04d' % bs_id for bs_id in bs_ids]
cmap = dict(zip(sample_names, sns.color_palette('colorblind', len(bs_ids))))
for lod in lods:
    is_nonbee_precise = rcurve_df['Kraken setting'] == 'Core-nt DB, non-bee reads, Kraken2 precise'
    at_lod = rcurve_df['LoD'] == lod
    sns.lineplot(data=rcurve_df[is_nonbee_precise & at_lod], x='Sampling fraction', y='Richness', hue='Sample id', palette=cmap)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.title(lod)
    plt.show()
    

## Overlap with BEExact taxonomy (bacteria associated with bee)

In [None]:
beexact_sp_name2taxid = get_beexact_species_name2taxid()

In [None]:
# Collect, per taxonomic rank prefix (d, p, c, o, f, g, s), the set of taxon
# names that occur in the BEExact taxonomy text held in `file_content`.
level2beexact_taxa = defaultdict(set)
for rank in ['d', 'p', 'c', 'o', 'f', 'g', 's']:
    level2beexact_taxa[rank] = set()

# Loop variables are named `rank` so the notebook-wide `level` ('S' / 'G')
# is not overwritten by running this cell.
for line in file_content.split('\n'):
    if not line: continue
    # second tab-separated field is a ';'-separated lineage of 'rank__taxon' items
    for s in line.rstrip(';').split('\t')[1].split(';'):
        rank, taxon = s.split('__')
        # `print(...)` evaluates to None, so it is not usable as the assertion message
        assert rank in level2beexact_taxa, 'missing {}'.format(rank)
        level2beexact_taxa[rank].add(taxon.replace('_', ' '))

In [None]:
beexact_sp2taxids = ncbi.get_name_translator(level2beexact_taxa['s'])

In [None]:
# List species names that do not translate to exactly one NCBI taxid.
for sp in beexact_sp2taxids:
    taxids = beexact_sp2taxids[sp]
    if len(taxids) == 1:
        continue
    print(sp, taxids)

In [None]:
# All taxids of the BEExact species (a name can map to several taxids).
# `chain` is imported directly from itertools at the top of the notebook; the
# bare module name `itertools` is never imported and raised a NameError here.
beexact_taxids = set(chain.from_iterable(beexact_sp2taxids.values()))

# Species names that could be translated to at least one taxid.
beexact_taxnames = set(beexact_sp2taxids)

In [None]:
beexact_taxnames

In [None]:
# Per sample, Kraken2 setting and LoD: how many detected bacterial species are
# part of the BEExact (bee-associated) taxonomy and how many are not.
bs_ids = [27, 62, 105, 155]
krakdb_readpool_mhg_cs2kraken_setting = {
    ('corent', 'all', 2, '00'):'Core-nt DB, all reads, Kraken2 default', 
    # ('corent', 'all', 4, '005'):'Core-nt DB, all reads, Kraken2 very-precise',
    ('corent', 'all', 2, '005'):'Core-nt DB, all reads, Kraken2 precise',
    ('corent', 'nonbee', 2, '005'):'Core-nt DB, non-bee reads, Kraken2 precise'}
sf = 1
r = 1
level = 'S'
lods = [0.000001, 0.000005, 0.00001, 0.00005, 0.0001]

columns = ['Sample id', 'Kraken setting', 'LoD', 'Set', 'Species number', 'Species fraction']
rows = []
for bs_id, (krakdb, readpool, mhg, cs) in product(bs_ids, krakdb_readpool_mhg_cs2kraken_setting.keys()):
    bs_id_str = '%04d' % bs_id
    setting = krakdb_readpool_mhg_cs2kraken_setting[(krakdb, readpool, mhg, cs)]
    # Same 12-argument call shape as the other get_bracken_df call sites in this
    # notebook (two path arguments after bowtie2_path); the previous call passed
    # only one and hard-coded 'S' instead of `level`.
    # NOTE(review): get_bracken_df is defined outside this section -- confirm the signature.
    bact_df = get_bracken_df(bowtie2_path, subsample_path, subsample_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level, bacteria_taxids)

    for lod in lods:
        lod_bact_df = bact_df[bact_df['fraction_total_reads'] >= lod]
        all_species_nr = len(lod_bact_df)
        beexact_species_nr = len(set(lod_bact_df['taxonomy_id']).intersection(beexact_taxids))
        nonbeexact_species_nr = all_species_nr - beexact_species_nr
        for set_name, species_nr in (('non-BEExact', nonbeexact_species_nr), ('BEExact', beexact_species_nr)):
            # The fraction is undefined (NaN) when no species passes the LoD;
            # this used to raise ZeroDivisionError.
            species_fraction = species_nr / all_species_nr if all_species_nr else np.nan
            rows.append((bs_id_str, setting, str(lod), set_name, species_nr, species_fraction))

beexact_df = pd.DataFrame(data=rows, columns=columns)

In [None]:
# Colours for the two species sets in the BEExact overlap plots.
cmap = {
    'BEExact': '#029e73', 
    'non-BEExact': '#d55e00'
}

In [None]:
# One stacked bar chart per sample and Kraken2 setting: species fraction
# inside / outside BEExact for each LoD.
for bs_id, setting in product(bs_ids, krakdb_readpool_mhg_cs2kraken_setting.values()):
    bs_id_str = '%04d' % bs_id
    is_sample = beexact_df['Sample id'] == bs_id_str
    is_setting = beexact_df['Kraken setting'] == setting

    fig = px.bar(
        beexact_df[is_sample & is_setting],
        x="LoD",
        y="Species fraction",
        color="Set",
        hover_data=['Set'],
        barmode='stack',
        color_discrete_map=cmap,
        title='BS-{} LoD {}'.format(bs_id_str, setting),
    )
    fig.update_layout(autosize=False, width=700, height=700)
    fig.show()

# Back to analyzing all Phenotypes samples

In [None]:
# Directory layout for the full phenotype cohort. Every path ends with '/'
# because file names are appended by plain string formatting.
working_path = '{}phenotypes/'.format(home_path)
fig_path = '/agsad/f80878961/Data-Raw/23_Livestock-PO/235_Bee_ZBF/planB/figures/'

kdb_path = '{}krakdb/'.format(working_path)
tmp_path = '{}tmp/'.format(working_path)
bowtie2_path = '{}bowtie2/'.format(tmp_path)
kraken_path = '{}kraken2/'.format(tmp_path)
bracken_path = '{}bracken/'.format(tmp_path)

# name of the Kraken2 database used below
kdb_name = 'corent'

## Bowtie2 + Kraken2 + Bracken

In [None]:
#array_str = ','.join([str(x) for x in failed_bs_ids])

In [None]:
# SLURM array indices (sample numbers) to (re)submit.
# Samples 26, 68, 102 and 103 have corrupted fastq files and are left out.
array_str = '35,36,37,41,43,44,46,47,49,50,60,62,63,64,74,75,77,80,90,98,101,104,110,113,123,126,128,129,131,138,152,153,154,161,162'

In [None]:
# Use the line below instead to submit the whole cohort.
# array_str = '1-185'
# Write the bowtie2 batch script for the sample indices listed in `array_str`.
bowtie2_script = '{}bowtie2_bee.run'.format(script_path)
write_bowtie2_script(bowtie2_script, array_str)

In [None]:
# Arguments handed to the bowtie2 batch script in the next (bash) cell.
idx_path = '{}bee_bt2idx/'.format(working_path)
idx_name = 'bee'
input_path = '{}SwissFastQSample/'.format(working_path)
output_path = bowtie2_path

In [None]:
%%bash -s "$bowtie2_script" "$idx_path" "$idx_name" "$input_path" "$output_path"
# Positional args: $1 batch script, $2 index dir, $3 index name, $4 input dir, $5 output dir.
# Submit from the output directory (presumably so relative job output lands there);
# stop instead of submitting from the wrong directory when it does not exist.
cd "$5" || exit 1
# Quote every argument so empty values or paths with spaces keep their position.
sbatch "$1" "$2" "$3" "$4" "$5"

In [None]:
# Scan the bowtie2 logs of all samples and collect those that did not finish:
# a complete log is expected to start with '<n> reads; ...'.
failed_bs_ids = []
for bs_id in range(1, 186):
    bs_id_str = '%04d' % bs_id
    log_fn = '{}BS18-{}_bee.out'.format(bowtie2_path, bs_id_str)
    # A missing log also counts as a failed job; opening it unconditionally
    # used to abort the whole scan with FileNotFoundError.
    if not os.path.exists(log_fn):
        print(bs_id, 'missing {}'.format(log_fn))
        failed_bs_ids.append(bs_id)
        continue
    with open(log_fn, 'r') as inf:
        line1 = inf.readline().split()
    if len(line1) < 2 or line1[1] != 'reads;':
        print(bs_id, line1)
        failed_bs_ids.append(bs_id)

In [None]:
# Write the Kraken2 batch script for the sample indices in `array_str`.
# NOTE(review): the trailing arguments look like node list (empty), read pool,
# minimum hit groups and confidence score -- confirm against write_kraken2_script.
kraken2_script = '{}kraken2.run'.format(script_path)
write_kraken2_script(kraken2_script, array_str, '', 'nonbee', '2', '0.05')

# Kraken2 reads the bowtie2 output directory and writes to the kraken2 directory.
input_path = bowtie2_path
output_path = kraken_path
# file name suffixes of the paired fastq files passed to the batch script
read_1_suffix = '_bee_unmapped.1.fastq'
read_2_suffix = '_bee_unmapped.2.fastq'

In [None]:
%%bash -s "$kraken2_script" "$kdb_path" "$kdb_name" "$input_path" "$output_path" "$read_1_suffix" "$read_2_suffix"
# Positional args: $1 batch script, $2 database dir, $3 database name, $4 input dir,
# $5 output dir, $6/$7 suffixes of the paired read files.
# Stop instead of submitting from the wrong directory when the output dir is missing.
cd "$5" || exit 1
# Quote every argument so empty values or paths with spaces keep their position.
sbatch "$1" "$2" "$3" "$4" "$5" "$6" "$7"

In [None]:
# Print every expected Kraken2 output (.kraken2 / .k2report) that is missing
# for the full cohort and the listed parameter combinations.
bs_ids = list(range(1, 186))
krakdb = 'corent'
readpool_mhg_cs = [('nonbee', 2, '005')]

for bs_id, (readpool, mhg, cs) in product(bs_ids, readpool_mhg_cs):
    bs_id_str = '%04d' % bs_id
    file_prefix = '{}BS18-{}_{}_{}_mhg{}_cs{}_sf1_rep1'.format(kraken_path, bs_id_str, krakdb, readpool, mhg, cs)
    for extension in ('kraken2', 'k2report'):
        expected_fn = '{}.{}'.format(file_prefix, extension)
        if os.path.exists(expected_fn):
            continue
        print(expected_fn)

In [None]:
# # add _sf1_rep1 to kraken file names
# bs_ids = list(range(1, 186))
# krakdb = 'corent'
# readpool_mhg_cs = [('nonbee', 2, '005')]
# 
# for bs_id, (readpool, mhg, cs) in product(bs_ids, readpool_mhg_cs):
#     bs_id_str = '%04d' % bs_id
#     old_file_prefix = '{}BS18-{}_{}_{}_mhg{}_cs{}'.format(kraken_path, bs_id_str, krakdb, readpool, mhg, cs)
#     new_file_prefix = '{}BS18-{}_{}_{}_mhg{}_cs{}_sf1_rep1'.format(kraken_path, bs_id_str, krakdb, readpool, mhg, cs)
#     if os.path.exists('{}.kraken2'.format(old_file_prefix)):
#         os.rename('{}.kraken2'.format(old_file_prefix), '{}.kraken2'.format(new_file_prefix))
#     else:
#         print('{}.kraken2'.format(old_file_prefix))
#     if os.path.exists('{}.k2report'.format(old_file_prefix)):
#         os.rename('{}.k2report'.format(old_file_prefix), '{}.k2report'.format(new_file_prefix))
#     else:
#         print('{}.k2report'.format(old_file_prefix))

In [None]:
# SLURM array indices (sample numbers) to run Bracken for.
# NOTE(review): this list has no 37, unlike the array string used for
# bowtie2/Kraken2 above -- confirm that is intended.
array_str = '35,36,41,43,44,46,47,49,50,60,62,63,64,74,75,77,80,90,98,101,104,110,113,123,126,128,129,131,138,152,153,154,161,162'
# Write the Bracken batch script for those indices.
bracken_script = '{}bracken.run'.format(script_path)
write_bracken_script(bracken_script, array_str)

In [None]:
%%bash -s "$bracken_script" "$kdb_path" "$kdb_name" "$kraken_path" "$bracken_path"
# Positional args: $1 batch script, $2 database dir, $3 database name,
# $4 Kraken2 output dir (input), $5 Bracken output dir.
# Stop instead of submitting from the wrong directory when the output dir is missing.
cd "$5" || exit 1
# Quote every argument so empty values or paths with spaces keep their position.
sbatch "$1" "$2" "$3" "$4" "$5"

In [None]:
# # add _sf1_rep1 to bracken file names
# bs_ids = list(range(1, 186))
# krakdb = 'corent'
# readpool_mhg_cs = [('nonbee', 2, '005')]
# levels = ['S', 'G']
# 
# for bs_id, (readpool, mhg, cs), level in product(bs_ids, readpool_mhg_cs, levels):
#     bs_id_str = '%04d' % bs_id
#     old_file_prefix = '{}BS18-{}_{}_{}_mhg{}_cs{}_{}'.format(bracken_path, bs_id_str, krakdb, readpool, mhg, cs, level)
#     new_file_prefix = '{}BS18-{}_{}_{}_mhg{}_cs{}_sf1_rep1_{}'.format(bracken_path, bs_id_str, krakdb, readpool, mhg, cs, level)
#     if os.path.exists('{}.bracken'.format(old_file_prefix)):
#         os.rename('{}.bracken'.format(old_file_prefix), '{}.bracken'.format(new_file_prefix))
#     else:
#         print('{}.bracken'.format(old_file_prefix))
#     if os.path.exists('{}.breport'.format(old_file_prefix)):
#         os.rename('{}.breport'.format(old_file_prefix), '{}.breport'.format(new_file_prefix))
#     else:
#         print('{}.breport'.format(old_file_prefix))

In [None]:
# Print every expected species-level Bracken output (.bracken / .breport) that
# is missing for the full cohort and the listed parameter combinations.
bs_ids = list(range(1, 186))
krakdb = 'corent'
readpool_mhg_cs = [('nonbee', 2, '005')]

for bs_id, (readpool, mhg, cs) in product(bs_ids, readpool_mhg_cs):
    bs_id_str = '%04d' % bs_id
    file_prefix = '{}BS18-{}_{}_{}_mhg{}_cs{}_sf1_rep1'.format(bracken_path, bs_id_str, krakdb, readpool, mhg, cs)
    for extension in ('bracken', 'breport'):
        expected_fn = '{}_S.{}'.format(file_prefix, extension)
        if os.path.exists(expected_fn):
            continue
        print(expected_fn)

## Classification overview

In [None]:
# Long-format classification overview for the whole cohort: one row per colony
# and read category, with read number and read fraction.
missing_ids = {26, 68, 102, 103}
bs_ids = ['%04d' % sample_nr for sample_nr in range(1, 186) if sample_nr not in missing_ids]
krakdb, readpool, mhg, cs, sf, r, level = ('corent', 'nonbee', 2, '005', 1, 1, 'S')

# which high level taxa we care about?
taxa = {'root', 'Eukaryota', 'Apis', 'Varroa', 'Fungi', 'Bacteria', 'Archaea', 'Viruses'}

columns = ['Colony', 'Category', 'Read number']
rows = []
for bs_id in bs_ids:
    ucseqs_nr, cseqs_nr = parse_kreport(kraken_path, bs_id, krakdb, readpool, mhg, cs, sf, r)
    taxa2seqnr = parse_breport(bracken_path, bs_id, krakdb, readpool, mhg, cs, sf, r, level, taxa)
    bs_id_name = 'BS18-{}'.format(bs_id)
    # Archaea and Varroa may be absent from a report; count them as 0 then.
    archaea_nr = taxa2seqnr.get('Archaea', 0)
    varroa_nr = taxa2seqnr.get('Varroa', 0)
    category2read_nr = (
        ('Unclassified', ucseqs_nr),
        ('Kraken2 only', cseqs_nr - taxa2seqnr['root']),
        ('Other Kraken2 + Bracken', taxa2seqnr['root'] - taxa2seqnr['Eukaryota'] - taxa2seqnr['Bacteria'] - archaea_nr - taxa2seqnr['Viruses']),
        ('Viruses', taxa2seqnr['Viruses']),
        ('Archaea', archaea_nr),
        ('Bacteria', taxa2seqnr['Bacteria']),
        ('Other Eukaryota', taxa2seqnr['Eukaryota'] - taxa2seqnr['Apis'] - taxa2seqnr['Fungi'] - varroa_nr),
        ('Fungi', taxa2seqnr['Fungi']),
        ('Varroa', varroa_nr),
        ('Apis', taxa2seqnr['Apis']),
    )
    rows.extend((bs_id_name, category, read_nr) for category, read_nr in category2read_nr)

class_df = pd.DataFrame(data=rows, columns=columns)

# calculate fraction a posteriori
# denominator: number of reads classified by kraken2 + bracken AND by bowtie2 --> to calculate relative abundance
colony2classified_read_nr = {
    'BS18-{}'.format(bs_id): get_classified_read_nr(bowtie2_path, kraken_path, bracken_path, bs_id, krakdb, readpool, mhg, cs, sf, r, level)
    for bs_id in bs_ids
}
read_fraction = (class_df['Read number'] / class_df['Colony'].map(colony2classified_read_nr)).to_numpy()
class_df['Read fraction'] = read_fraction

In [None]:
# Category -> colour mapping for the cohort-wide classification plots; greys
# mark reads without a domain-level assignment.
# NOTE(review): 'Metazoa' has no matching category in the table built above --
# presumably kept for another plot; confirm.
cmap = {
    'Unclassified': '#262626',
    'Kraken2 only': '#575757',
    'Other Kraken2 + Bracken': '#949494',
    'Viruses': '#0173b2',
    'Archaea': '#d55e00',
    'Bacteria': '#de8f05',
    'Other Eukaryota': '#12634c',
    'Fungi': '#029e73',
    'Metazoa': '#cc78bc',
    'Apis': '#75c8b0',
    'Varroa': '#ece133'
}

In [None]:
# Stacked read numbers per colony, coloured by category.
layout_options = dict(autosize=False, width=1500, height=700)
fig = px.bar(
    class_df,
    x="Colony",
    y="Read number",
    color="Category",
    hover_data=['Category'],
    barmode='stack',
    color_discrete_map=cmap,
)
fig.update_layout(**layout_options)
fig.show()

In [None]:
categories={'Viruses','Archaea','Bacteria','Other Eukaryota','Fungi','Varroa'}

In [None]:
class_df

In [None]:
# Stacked read fractions per colony, restricted to the categories of interest.
selected_class_df = class_df[class_df['Category'].isin(categories)]
title_font = {'title': {'font': {'size': 18}}}
fig = px.bar(
    selected_class_df,
    x="Colony",
    y="Read fraction",
    color="Category",
    hover_data=['Category'],
    barmode='stack',
    color_discrete_map=cmap,
)
fig.update_layout(
    autosize=False,
    width=1500,
    height=700,
    xaxis=title_font,
    yaxis=title_font,
    legend={'font': {'size': 16}},
)
fig.show()

In [None]:
# Distribution of read fractions per category across colonies; the red dot marks the mean.
mean_marker = {"marker":"o", "markerfacecolor":"red", "markeredgecolor":"black", "markersize":"5"}
plt.figure(figsize=(8, 6))
sns.boxplot(x='Category', y='Read fraction', data=class_df, palette=cmap,
            showmeans=True, meanprops=mean_marker)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Microbiome and Varroa analyses

In [None]:
# Directory layout for the microbiome / Varroa analyses (all paths end with '/').
tmp_path = '{}tmp/'.format(working_path)
kdb_path = '{}krakdb/'.format(working_path)
bowtie2_path, kraken_path, bracken_path, tlf_path = (
    '{}bowtie2/'.format(tmp_path),
    '{}kraken2/'.format(tmp_path),
    '{}bracken/'.format(tmp_path),
    '{}tlf/'.format(tmp_path),
)

kdb_name = 'corent'

# Taxids used as the bacteria filter; loaded from a locally produced pickle.
bacteria_pickle_fn = '{}bacteria_taxids.pkl'.format(tmp_path)
with open(bacteria_pickle_fn, 'rb') as inf:
    bacteria_taxids = pickle.load(inf)
len(bacteria_taxids)

# BEExact species: name -> taxids, plus the flattened taxid set and the name set.
beexact_sp_name2taxid = get_beexact_species_name2taxid()
beexact_sp_taxids = set(chain(*beexact_sp_name2taxid.values()))
beexact_sp_names = set(beexact_sp_name2taxid)

## Parse Bracken tables

In [None]:
# Concatenate the Bracken tables of all colonies into one long table (no LoD
# applied here, no taxon filter) and pull out the Varroa destructor rows.
missing_ids = {26, 68, 102, 103}
bs_ids = ['%04d' % sample_nr for sample_nr in range(1, 186) if sample_nr not in missing_ids]
lod = 0
# lod = 5e-06
krakdb, readpool, mhg, cs, sf, r, level = ('corent', 'nonbee', 2, '005', 1, 1, 'S')
kept_columns = ['name', 'taxonomy_id', 'fraction_total_reads', 'new_est_reads', 'log_ratio_honeybee', 'ratio_honeybee']
df_list = []
for bs_id in tqdm(bs_ids):
    df = get_bracken_df(bowtie2_path, kraken_path, bracken_path, bs_id, krakdb, readpool, mhg, cs, sf, r, level, {})
    df = df.loc[df['fraction_total_reads'] >= lod, kept_columns]
    # the colony label is the same for every row of this table
    df.insert(0, 'Colony', 'BS18-{}'.format(bs_id))
    df_list.append(df)

display_names = {
    'name': 'Taxon',
    'taxonomy_id': 'TaxID',
    'fraction_total_reads': 'Relative abundance',
    'new_est_reads': 'Read number',
    'ratio_honeybee': 'Ratio Honeybee',
    'log_ratio_honeybee': 'Log Ratio Honeybee',
}
brack_df_nolod = pd.concat(df_list).rename(columns=display_names)

brack_df_varroa = brack_df_nolod[brack_df_nolod['Taxon'] == 'Varroa destructor']

In [None]:
# Keep bacterial taxa whose relative abundance reaches the limit of detection.
lod = 5e-06
above_lod = brack_df_nolod['Relative abundance'] >= lod
is_bacterium = brack_df_nolod['TaxID'].isin(bacteria_taxids)
brack_df = brack_df_nolod[above_lod & is_bacterium]
brack_df

In [None]:
# # for Kevin
# brack_df.to_csv('{}{}_bacteria_abundances_BeeStrong_CH.csv'.format(tlf_path, str(datetime.now().date())), index=False)

## Varroa infestation

In [None]:
# Per colony: counted mites (phenotype table), Varroa destructor abundance from
# Bracken and the mitochondrial ratio from mito_df. Left joins keep every
# colony of the phenotype selection.
analysed_colonies = {'BS18-{}'.format(x) for x in bs_ids}
varroa_df = (
    pheno_df[pheno_df['N°-Ruche-BeeStrong'].isin(analysed_colonies)]
    .loc[:, ['N°-Ruche-BeeStrong', 'Nb varroas/100 bees']]
    .rename(columns={'N°-Ruche-BeeStrong':'Colony'})
)
varroa_df = (
    varroa_df
    .merge(brack_df_varroa, on='Colony', how='left')
    .merge(mito_df.rename(columns={'name':'Colony'}), on='Colony', how='left')
)
varroa_df = varroa_df[['Colony', 'Nb varroas/100 bees', 'Relative abundance', 'Read number', 'Ratio Honeybee', 'Log Ratio Honeybee', 'varroaMitoRatio']]

In [None]:
varroa_df['Nb varroas/100 bees'].to_numpy()

In [None]:
# # my NA means 0 but I don't know what sonia's NA means
# # varroa_df = varroa_df.fillna({'Relative abundance': 0, 'Log Ratio Honeybee': np.log(1e-07)})
# 
# x = varroa_df['Nb varroas/100 bees'].to_numpy()
# y = np.full(len(varroa_df), np.nan)
# y[x > 0] = np.log(x[x > 0])
# varroa_df['Log Nb varroas/100 bees'] = y

### compare measures

Compare my measures (Log Ratio Honeybee and Relative abundance) with Sonia's measure (varroaMitoRatio).

for fair comparison filter rows with NA and 0 values in one of these measures

In [None]:
varroa_df.describe()

In [None]:
varroa_df_f = varroa_df.dropna()

In [None]:
varroa_df_f.describe()

In [None]:
figsize=(10, 8)
sns.set_style('whitegrid')
sns.set_context("talk", font_scale=1.5)

In [None]:
# Correlation (pandas default method) and scatter: counted mites vs. relative abundance.
x_col, y_col = 'Nb varroas/100 bees', 'Relative abundance'
corr_value = varroa_df_f[x_col].corr(varroa_df_f[y_col])
print(corr_value)

plt.figure(figsize=figsize)
sns.scatterplot(x=x_col, y=y_col, data=varroa_df_f)
plt.show()

In [None]:
# Correlation (pandas default method) and scatter: counted mites vs. ratio to honeybee reads.
x_col, y_col = 'Nb varroas/100 bees', 'Ratio Honeybee'
corr_value = varroa_df_f[x_col].corr(varroa_df_f[y_col])
print(corr_value)
plt.figure(figsize=figsize)
sns.scatterplot(x=x_col, y=y_col, data=varroa_df_f)
plt.show()

In [None]:
# Correlation (pandas default method) and scatter: counted mites vs. varroaMitoRatio.
x_col, y_col = 'Nb varroas/100 bees', 'varroaMitoRatio'
corr_value = varroa_df_f[x_col].corr(varroa_df_f[y_col])
print(corr_value)
plt.figure(figsize=figsize)
sns.scatterplot(x=x_col, y=y_col, data=varroa_df_f)
plt.show()

very similar results, but relative abundance is actually better than ratio

In [None]:
# Correlation (pandas default method) and scatter: relative abundance vs. varroaMitoRatio.
x_col, y_col = 'Relative abundance', 'varroaMitoRatio'
corr_value = varroa_df_f[x_col].corr(varroa_df_f[y_col])
print(corr_value)
plt.figure(figsize=figsize)
sns.scatterplot(x=x_col, y=y_col, data=varroa_df_f)
plt.show()

In [None]:
# Scatter and correlation (pandas default method): ratio to honeybee reads vs. varroaMitoRatio.
x_col, y_col = 'Ratio Honeybee', 'varroaMitoRatio'
sns.scatterplot(x=x_col, y=y_col, data=varroa_df_f)
corr_value = varroa_df_f[x_col].corr(varroa_df_f[y_col])
print(corr_value)


In [None]:
# Scatter and correlation (pandas default method): ratio to honeybee reads vs. relative abundance.
x_col, y_col = 'Ratio Honeybee', 'Relative abundance'
sns.scatterplot(x=x_col, y=y_col, data=varroa_df_f)
corr_value = varroa_df_f[x_col].corr(varroa_df_f[y_col])
print(corr_value)

My two measures are extremely similar to each other, more so than either is to Sonia's measure, which makes sense.

### include all samples

In [None]:
varroa_df.shape

In [None]:
# Same comparison on the unfiltered table (rows with missing values included).
x_col, y_col = 'Nb varroas/100 bees', 'Relative abundance'
sns.scatterplot(x=x_col, y=y_col, data=varroa_df)
corr_value = varroa_df[x_col].corr(varroa_df[y_col])
print(corr_value)

### Linear modelling [old]

In [None]:
from sklearn import preprocessing, svm 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 

In [None]:
# aim to predict varroa abundance based on sequence
X = np.array(brack_pheno_df['varroaBrackenFrac']).reshape(-1, 1)
y = np.array(brack_pheno_df['Nb varroas/100 bees']).reshape(-1, 1) 

In [None]:
# Fixed seed so that the split, and therefore the score reported below, is
# reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
regr = LinearRegression() 
regr.fit(X_train, y_train) 
print(regr.score(X_test, y_test)) 

In [None]:
y_pred = regr.predict(X_test) 
plt.scatter(X_test, y_test, color ='b') 
plt.plot(X_test, y_pred, color ='k') 
plt.show() 

In [None]:
regr.predict(np.array([[0.0008]]))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
# RMSE is the square root of the MSE. It is derived with numpy instead of the
# `squared=False` flag so the value is correct regardless of scikit-learn version.
rmse = np.sqrt(mse)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)

In [None]:
# what if we do the same but using 25% lower values as test set

In [None]:
sorted_brack_pheno_df = brack_pheno_df.sort_values(by='Nb varroas/100 bees')

In [None]:
X = np.array(sorted_brack_pheno_df['varroaBrackenFrac']).reshape(-1, 1)
y = np.array(sorted_brack_pheno_df['Nb varroas/100 bees']).reshape(-1, 1) 

In [None]:
idx = X.size // 4

X_train = X[idx:]
X_test = X[:idx]
y_train = y[idx:]
y_test = y[:idx]

In [None]:
regr = LinearRegression() 
regr.fit(X_train, y_train) 
print(regr.score(X_test, y_test)) 

In [None]:
y_pred = regr.predict(X_test) 
plt.scatter(X_test, y_test, color ='b') 
plt.plot(X_test, y_pred, color ='k') 
plt.show() 

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
# RMSE is the square root of the MSE. It is derived with numpy instead of the
# `squared=False` flag so the value is correct regardless of scikit-learn version.
rmse = np.sqrt(mse)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)

Makes sense, because the model is not supposed to work well for low abundance values.
- What if I train AND test the model on higher values?

In [None]:
X = np.array(sorted_brack_pheno_df['varroaBrackenFrac']).reshape(-1, 1)
y = np.array(sorted_brack_pheno_df['Nb varroas/100 bees']).reshape(-1, 1)

# Discard the first quarter of the sorted rows, then train and test on the rest.
idx = X.size // 4
X = X[idx:]
y = y[idx:]

# Fixed seed so that the split, and the score below, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
regr = LinearRegression() 
regr.fit(X_train, y_train) 
print(regr.score(X_test, y_test)) 

In [None]:
y_pred = regr.predict(X_test) 
plt.scatter(X_test, y_test, color ='b') 
plt.plot(X_test, y_pred, color ='k') 
plt.show() 

lower fit it seems 

- let's exclude higher values

In [None]:
sorted_brack_pheno_df = brack_pheno_df.sort_values(by='varroaBrackenFrac')
X = np.array(sorted_brack_pheno_df['varroaBrackenFrac']).reshape(-1, 1)
y = np.array(sorted_brack_pheno_df['Nb varroas/100 bees']).reshape(-1, 1)

# Score 100 different 75/25 splits. Seeding every split with its iteration
# number keeps the splits distinct while making the score distribution
# reproducible across runs.
reg_scores_all = []
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=i)
    regr = LinearRegression()
    regr.fit(X_train, y_train)
    reg_scores_all.append(regr.score(X_test, y_test))

In [None]:
# Number of rows kept after sorting by 'varroaBrackenFrac'; rows beyond this
# (the highest fractions) are excluded from the fit.
n_kept = 160

sorted_brack_pheno_df = brack_pheno_df.sort_values(by='varroaBrackenFrac')
X = np.array(sorted_brack_pheno_df['varroaBrackenFrac']).reshape(-1, 1)[:n_kept]
y = np.array(sorted_brack_pheno_df['Nb varroas/100 bees']).reshape(-1, 1)[:n_kept]

# Score 100 different 75/25 splits. Seeding every split with its iteration
# number keeps the splits distinct while making the score distribution
# reproducible across runs.
reg_scores_nohigh = []
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=i)
    regr = LinearRegression()
    regr.fit(X_train, y_train)
    reg_scores_nohigh.append(regr.score(X_test, y_test))

In [None]:
sns.boxplot(reg_scores_all)

In [None]:
sns.boxplot(reg_scores_nohigh)

In [None]:
np.mean(reg_scores_all)

In [None]:
np.mean(reg_scores_nohigh)

Well, removing the high values seems to decrease the fit.

not sure how comparable however...

very stochastic score...

train on everything 

In [None]:
sorted_brack_pheno_df = brack_pheno_df.sort_values(by='varroaBrackenFrac')
X = np.array(sorted_brack_pheno_df['varroaBrackenFrac']).reshape(-1, 1)[:160]
y = np.array(sorted_brack_pheno_df['Nb varroas/100 bees']).reshape(-1, 1)[:160]
len(X)

In [None]:
len(y)

In [None]:
regr = LinearRegression() 
regr.fit(X, y)

In [None]:
y_pred = regr.predict(X) 
plt.scatter(X, y, color ='b') 
plt.plot(X, y_pred, color ='k') 
plt.show()

In [None]:
# proportion of infected bees estimated as number of varroa per bee
P = (y_pred / 100).flatten()

In [None]:
N = sorted_brack_pheno_df["Nombre d'abeilles"][:160]

In [None]:
P

In [None]:
N

In [None]:
print(len(P))
print(len(N))

In [None]:
D = 1-(1-P)**N

In [None]:
len(D)

In [None]:
D

In [None]:
sns.boxplot(D)

In [None]:
np.mean(D)

In [None]:
1 - sum(sorted_brack_pheno_df['Nb varroas/100 bees'] == 0) / len(brack_pheno_df)

Removing outliers gives results closer to what is expected from theory...

--> possible to have > 1 varroa per bee

In [None]:
sorted_brack_pheno_df

In [None]:
plt.scatter(X[:50], y[:50], color ='b') 
plt.plot(X[:50], y_pred[:50], color ='k') 
plt.show()

In [None]:
plt.scatter(X[:100], y[:100], color ='b') 
plt.plot(X[:100], y_pred[:100], color ='k') 
plt.show()

In [None]:
plt.scatter(X[100:], y[100:], color ='b') 
plt.plot(X[100:], y_pred[100:], color ='k') 
plt.show()

### ONT sample selection [old]

In [None]:
brack_pheno_mito_df

In [None]:
# The bare column name is not valid Python; show a summary of the column instead.
brack_pheno_mito_df['Nb varroas/100 bees'].describe()

In [None]:
bins = sns.histplot(brack_pheno_mito_df['Nb varroas/100 bees'], bins=23)

In [None]:
len(brack_pheno_mito_df['Nb varroas/100 bees'])

In [None]:
brack_pheno_mito_df['Nb varroas/100 bees'].max()

In [None]:
bin_nr, bin_values = np.histogram(brack_pheno_mito_df['Nb varroas/100 bees'], bins=23)
bin_values[0] = -1

In [None]:
bin_nr

In [None]:
random.seed(1234)

In [None]:
# Pick one random colony from every non-empty histogram bin, so that the selected
# samples span the whole range of varroa loads.
# A cell-local generator (same seed as the `random.seed(1234)` cell) makes this
# cell idempotent: re-running it, or calling `random.choice` elsewhere, no longer
# changes which samples are selected.
sample_rng = random.Random(1234)
varroa_load = brack_pheno_mito_df['Nb varroas/100 bees']

selected_samples = []
for i in range(len(bin_nr)):
    if bin_nr[i] == 0:
        continue
    left = bin_values[i]
    right = bin_values[i + 1]
    # Bins are treated as (left, right] here.
    # NOTE(review): np.histogram counts with [left, right) bins, so a value lying
    # exactly on an interior edge is attributed to a different bin than it was
    # counted in -- confirm this cannot leave a counted bin without candidates.
    in_bin = (varroa_load > left) & (varroa_load <= right)
    selected_samples.append(sample_rng.choice(brack_pheno_mito_df[in_bin]['name'].to_list()))

In [None]:
random.choice(brack_pheno_mito_df[(brack_pheno_mito_df['Nb varroas/100 bees']>left) & (brack_pheno_mito_df['Nb varroas/100 bees']<=right)]['name'].to_list())

In [None]:
len(selected_samples)

In [None]:
for s in sorted(selected_samples):
    print(s)

In [None]:
sns.scatterplot(data=brack_pheno_mito_df[brack_pheno_mito_df['name'].isin(selected_samples)], x='Nb varroas/100 bees', y='varroaBrackenFrac')

In [None]:
sns.scatterplot(data=brack_pheno_mito_df[brack_pheno_mito_df['name'].isin(selected_samples)], x='Nb varroas/100 bees', y='varroaMitoRatio')

In [None]:
sns.histplot(data=brack_pheno_mito_df[brack_pheno_mito_df['name'].isin(selected_samples)], x='chrDepth', bins=10)

In [None]:
sns.histplot(data=brack_pheno_mito_df, x='chrDepth', bins=10)

In [None]:
pheno_df

In [None]:
sns.histplot(pheno_df[pheno_df['N°-Ruche-BeeStrong'].isin(selected_samples)]['Nb varroas/100 bees'], bins=10)

## Misc

### Absolute, Relative abundances, and Log ratios

In [None]:
sns.lmplot(data=brack_df, y='Relative abundance', x='new_est_reads')
plt.show()
print(brack_df['Relative abundance'].corr(brack_df['new_est_reads']))

In [None]:
ax = sns.scatterplot(y=brack_df['Log Ratio Honeybee'], x=np.log(brack_df['new_est_reads']))
plt.show()
print(brack_df['Log Ratio Honeybee'].corr(np.log(brack_df['new_est_reads'])))

In [None]:
ax = sns.scatterplot(y=brack_df['Log Ratio Honeybee'], x=np.log(brack_df['Relative abundance']))
plt.show()
print(brack_df['Log Ratio Honeybee'].corr(np.log(brack_df['Relative abundance'])))

--> both approaches should be fine then

### explo BC

In [None]:
pivot_relab_df = brack_df.pivot(index='Colony', columns='Taxon', values='Relative abundance').fillna(0)

In [None]:
pivot_absab_df = brack_df.pivot(index='Colony', columns='Taxon', values='new_est_reads').fillna(0)

In [None]:
bray_curtis_dissimilarity(pivot_absab_df.loc['BS18-0001'].to_numpy(), pivot_absab_df.loc['BS18-0002'].to_numpy())

In [None]:
bray_curtis_dissimilarity(pivot_relab_df.loc['BS18-0001'].to_numpy(), pivot_relab_df.loc['BS18-0002'].to_numpy())

In [None]:
pivot_absab_df.loc['BS18-0001'] > 0

In [None]:
np.all(np.array(pivot_absab_df.loc['BS18-0001'] > 0) == np.array(pivot_relab_df.loc['BS18-0001'] > 0))

In [None]:
def bray_curtis_dissimilarity(sample1, sample2):
    """Return the Bray-Curtis dissimilarity between two abundance vectors.

    The value is sum(|a - b|) / sum(a + b), computed element-wise over the two
    samples. Both inputs are converted to numpy arrays first, so plain lists
    are accepted as well.
    """
    a, b = np.array(sample1), np.array(sample2)
    total_difference = np.abs(a - b).sum()
    total_abundance = (a + b).sum()
    return total_difference / total_abundance


In [None]:
s1 = [0, 1, 5]
s2 = [1, 0, 10]

In [None]:
sample1, sample2 = s1, s2
sample1 = np.array(sample1)
sample2 = np.array(sample2)

In [None]:
np.sum(np.abs(sample1 - sample2))

In [None]:
np.sum(sample1 + sample2)

In [None]:
np.log10(1e-07)

In [None]:
1/10000000

In [None]:
bray_curtis_dissimilarity(s1, s2)

In [None]:
# s1 and s2 are plain lists, which do not support subtraction; convert them first.
np.abs(np.array(s1) - np.array(s2))

## Colony-centric e.g., Richness

In [None]:
figsize=(10, 8)
sns.set_style('whitegrid')
sns.set_context("talk", font_scale=1.5)

In [None]:
def shannon_diversity(group):
    """Return the Shannon diversity index (natural log) of one group of taxa.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single colony; must contain a 'Relative abundance' column.

    Returns
    -------
    float
        -sum(p * ln(p)) over the taxa, where p is each taxon's abundance
        divided by the group's total abundance.
    """
    abundances = group['Relative abundance']
    proportions = abundances / abundances.sum()
    # A taxon with zero abundance contributes nothing to the index, but
    # 0 * log(0) evaluates to nan and would poison the whole sum, so drop it.
    proportions = proportions[proportions > 0]
    return -np.sum(proportions * np.log(proportions))

In [None]:
# Per-colony richness = number of taxa rows recorded for that colony.
taxon_counts = brack_df.groupby('Colony').agg({'Taxon' : ['count']})['Taxon']

colony_df = pd.DataFrame({
    'Colony' : taxon_counts.index,
    'Richness' : taxon_counts['count'],
    'Shannon diversity': list(brack_df.groupby('Colony').apply(shannon_diversity).reset_index(name='Shannon diversity')['Shannon diversity'])
}).reset_index(drop=True)
# One read count per row of colony_df. The previous `if bs_id not in missing_ids`
# filter tested a stale variable rather than the loop variable, and a filtered
# list could not be assigned as a column anyway (length mismatch), so it is dropped.
colony_df['Total read nr'] = [get_total_read_nr(bowtie2_path, col.split('-')[1]) for col in colony_df['Colony']]

# add traits
colony_df = colony_df.merge(pheno_df[['N°-Ruche-BeeStrong', 'Identifiant Reine',
       'Apiary', 'Population type',
       'Calmness (Wabensitz)', 'Gentleness (Sanftmut)',
       'Number of workers',
       'Total worker brood (dm2)',
       'Total honey stores (dm2)',
       'Surface of pollen (dm2)',
       'Nb varroas/100 bees',
       'SMR - % infested cells recapped',
       'pin-test -cleared cells/hour']].rename(columns={'N°-Ruche-BeeStrong':'Colony'}), on='Colony', how='inner')

### Species richness vs. library size

In [None]:
# Per-colony richness computed on the table without the limit-of-detection filter.
taxon_counts_nolod = brack_df_nolod.groupby('Colony').agg({'Taxon' : ['count']})['Taxon']

colony_df_nolod = pd.DataFrame({
    'Colony' : taxon_counts_nolod.index,
    'Richness' : taxon_counts_nolod['count']
}).reset_index(drop=True)
# One read count per row. The previous `if bs_id not in missing_ids` filter tested
# a stale variable rather than the loop variable, and a filtered list could not be
# assigned as a column anyway (length mismatch), so it is dropped.
colony_df_nolod['Total read nr'] = [get_total_read_nr(bowtie2_path, col.split('-')[1]) for col in colony_df_nolod['Colony']]

In [None]:
print(colony_df_nolod['Richness'].corr(colony_df_nolod['Total read nr']))

plt.figure(figsize=figsize)
sns.scatterplot(data=colony_df_nolod, x='Total read nr', y='Richness')
plt.show()

In [None]:
print(colony_df['Richness'].corr(colony_df['Total read nr']))

plt.figure(figsize=figsize)
sns.scatterplot(data=colony_df, x='Total read nr', y='Richness')
plt.show()

In [None]:
sns.lmplot(data=colony_df, x='Total read nr', y='Shannon diversity')
print(colony_df['Shannon diversity'].corr(colony_df['Total read nr']))

### Richness and evenness

Here I want to see whether there are relationships between diversity (richness, evenness) and varroa abundance and other variables
- H:  more varroa, could disturb the microbiome due to immune response

In [None]:
for trait in [
       'Apiary', 'Population type',
       'Calmness (Wabensitz)', 'Gentleness (Sanftmut)']:
    sns.boxplot(data=colony_df, x=trait, y='Richness')
    plt.xticks(rotation=90)
    plt.show()
    sns.boxplot(data=colony_df, x=trait, y='Shannon diversity')
    plt.xticks(rotation=90)
    plt.show()

In [None]:
for trait in [
       'Number of workers',
       'Total worker brood (dm2)',
       'Total honey stores (dm2)',
       'Surface of pollen (dm2)',
       'Nb varroas/100 bees',
       'SMR - % infested cells recapped',
       'pin-test -cleared cells/hour']:
    sns.lmplot(data=colony_df, x=trait, y='Richness')
    plt.show()
    print(colony_df['Richness'].corr(colony_df[trait]))
    sns.lmplot(data=colony_df, x=trait, y='Shannon diversity')
    plt.show()
    print(colony_df['Shannon diversity'].corr(colony_df[trait]))

## Species-centric

### Prevalence and abundances

In [None]:
# calculate mean and median abundance of each species
median_abund_df = brack_df.groupby('Taxon').agg({'Relative abundance' : ['median']})['Relative abundance']
mean_abund_df = brack_df.groupby('Taxon').agg({'Relative abundance' : ['mean']})['Relative abundance']
sp2median_abund = dict(zip(median_abund_df.index, median_abund_df['median']))
sp2mean_abund = dict(zip(mean_abund_df.index, mean_abund_df['mean']))

In [None]:
taxids, counts = np.unique(brack_df['TaxID'], return_counts=True)
species, counts = np.unique(brack_df['Taxon'], return_counts=True)

In [None]:
# I guess it handles duplicates...
sp_name2taxid = dict(zip(brack_df['Taxon'], brack_df['TaxID']))

In [None]:
print(len(beexact_sp_taxids.intersection(taxids)))
print(len(beexact_sp_names.intersection(species)))

In [None]:
species, counts = np.unique(brack_df['Taxon'], return_counts=True)
species_freqs = counts / len(bs_ids)
in_beexact = [x in beexact_sp_names for x in species]
core = [x >= 0.5 for x in species_freqs]
prev_df = pd.DataFrame({
    'Taxon' : list(species),
    'TaxID' : [sp_name2taxid[sp] for sp in species],
    'Prevalence' : list(species_freqs),
    'Median Relative Abundance': [sp2median_abund[sp] for sp in species],
    'Mean Relative Abundance': [sp2mean_abund[sp] for sp in species],
    'In BEExact' : in_beexact,
    'Core' : core
})

In [None]:
print(sns.color_palette('Set1').as_hex())

In [None]:
plt.figure(figsize=figsize)
sns.histplot(data=prev_df, x='Prevalence', hue='In BEExact', multiple='stack', palette='Set1')
plt.show()

In [None]:
from scipy import stats

In [None]:
t_stat, p_value = stats.ttest_ind(prev_df[prev_df['In BEExact'] == True]['Prevalence'].to_list(), prev_df[prev_df['In BEExact'] == False]['Prevalence'].to_list())
p_value

In [None]:
plt.figure(figsize=figsize)
sns.boxplot(data=prev_df, x='In BEExact', y='Prevalence',palette='Set1')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
fig = px.scatter(prev_df, x='Prevalence', y='Median Relative Abundance', color='In BEExact', hover_data=['Taxon'])
fig.update_layout(
    autosize=False,
    width=700,
    height=500,
    xaxis={'title': {'font': {'size': 18}}},
    yaxis={'title': {'font': {'size': 18}}},
    legend={'font': {'size': 16}}
)
fig.show()


In [None]:
fig = px.scatter(prev_df, x='Prevalence', y='Median Relative Abundance', color='In BEExact', hover_data=['Taxon'], log_y=True)
fig.update_layout(
    autosize=False,
    width=700,
    height=500
)
fig.show()


In [None]:
fig = px.scatter(prev_df, x='Prevalence', y='Mean Relative Abundance', color='In BEExact', hover_data=['Taxon'], log_y=True)
fig.update_layout(
    autosize=False,
    width=700,
    height=500
)
fig.show()


### (Core) microbiome distribution

In [None]:
# defining here the core bacteria...

min_prev = 0.0
# min_prev = 0.5

core_prev_df = prev_df[prev_df['Prevalence'] >= min_prev]
core_brack_df = brack_df[brack_df['Taxon'].isin(core_prev_df['Taxon'])]
assert len(set(core_brack_df['Taxon'])) == len(core_prev_df['Taxon'])

In [None]:
# select species I want to show

def get_parent_taxid_at_level(taxid, level):
    """Return the name of the ancestor of `taxid` that sits at rank `level`.

    Despite the function name, the returned value is the translated taxon name
    (via `ncbi.get_taxid_translator`), not a numeric taxid. A KeyError is raised
    when the lineage has no entry at the requested rank.
    """
    rank_by_taxid = ncbi.get_rank(ncbi.get_lineage(taxid))
    taxid_by_rank = {rank: tid for tid, rank in rank_by_taxid.items()}
    ancestor_taxid = taxid_by_rank[level]
    names = ncbi.get_taxid_translator([ancestor_taxid])
    return names[ancestor_taxid]
    
# Species whose median relative abundance is below this threshold are merged
# into a single 'Other bacteria' row.
plot_median_lod = 0.001

is_abundant = core_prev_df['Median Relative Abundance'] >= plot_median_lod
# reset_index gives a fresh 0..n-1 index, so the row appended below at label
# len(abund_df) can never overwrite an existing species. With the filtered
# frame's original labels it could. It also returns a new frame, so assigning
# 'Family' does not write into a slice of core_prev_df.
abund_df = core_prev_df.loc[is_abundant, ['Taxon', 'TaxID', 'Median Relative Abundance']].reset_index(drop=True)
abund_df['Family'] = [get_parent_taxid_at_level(taxid, 'family') for taxid in abund_df['TaxID']]
other_median_sum = core_prev_df.loc[core_prev_df['Median Relative Abundance'] < plot_median_lod, 'Median Relative Abundance'].sum()
abund_df.loc[len(abund_df)] = ['Other bacteria', np.nan, other_median_sum, 'Other bacteria']
abund_df = abund_df.sort_values(by=['Family', 'Median Relative Abundance'], ascending=[True, False])
abund_df

In [None]:
# get species colors as gradients of family colors
fam2color = {
'Bartonellaceae':'#0173b2',
'Bifidobacteriaceae' :'#de8f05',
'Enterobacteriaceae':'#029e73',
'Erwiniaceae':'#d55e00',
'Lactobacillaceae':'#cc78bc',
'Morganellaceae':'#ca9161',
'Neisseriaceae':'#56b4e9',
'Orbaceae':'#ece133',
'Other bacteria':'#949494'
}

x = abund_df.groupby('Family').agg({'Family' : 'count'})
fam2sp_nr = dict(zip(x.index, x['Family']))

sp2color = {}
fam2visit = {k: 0 for k in fam2sp_nr}
for i, sp in enumerate(abund_df['Taxon']):
    fam = abund_df['Family'].iloc[i]
    light_palette = sns.light_palette(fam2color[fam], n_colors=fam2sp_nr[fam] + 1).as_hex()[::-1]
    sp2color[sp] = light_palette[fam2visit[fam]]
    fam2visit[fam] += 1

sp2color

In [None]:
# merge all non select species into others
abund_brack_df = core_brack_df[core_brack_df['Taxon'].isin(sp2color)][['Colony', 'Taxon', 'Relative abundance']]
sp2fam = dict(zip(abund_df['Taxon'], abund_df['Family']))
abund_brack_df['Family'] = [sp2fam[sp] for sp in abund_brack_df['Taxon']]

x = core_brack_df[~core_brack_df['Taxon'].isin(sp2color)].groupby('Colony').agg({'Relative abundance' : 'sum'})
other_df = pd.DataFrame(data={'Colony': x.index, 'Taxon':  ['Other bacteria' for i in range(len(x))], 'Relative abundance': x['Relative abundance'], 'Family': ['Other bacteria' for i in range(len(x))]})

abund_brack_df = pd.concat([abund_brack_df, other_df]).sort_values('Colony')
abund_brack_df

In [None]:
fig = px.bar(abund_brack_df, x="Colony", y="Relative abundance", color="Taxon",
            hover_data=['Taxon'], barmode = 'stack', color_discrete_map=sp2color, category_orders={'Taxon': list(abund_df['Taxon'])})
fig.update_layout(
    autosize=False,
    width=1500,
    height=700,
    xaxis={'title': {'font': {'size': 18}}},
    yaxis={'title': {'font': {'size': 18}}},
    legend={'font': {'size': 16}}
)
fig.show()

## Beta diversity, Jaccard and Bray curtis


### all bacteria and Jaccard

In [None]:
jaccard_df = brack_df.pivot(index='Colony', columns='Taxon', values='Relative abundance').map(lambda x: 1 if x > 0 else x).fillna(0)

In [None]:
# add metadata
pheno_jaccard_df = pd.merge(pheno_df, jaccard_df, left_index=True, right_index=True)

color_dict = {}
discrete_vars = ['Apiary', 'Population type']
continous_vars =  [
       'Nb varroas/100 bees',
       'SMR - % infested cells recapped',
       'pin-test -cleared cells/hour']
for var in discrete_vars:
    values = pheno_jaccard_df[var]
    cmap = dict(zip(values.unique(), [x.upper() for x in list(sns.color_palette('husl', n_colors=len(values.unique())).as_hex())]))
    color_dict[var] =  values.map(cmap)

for var in continous_vars:
    values = pheno_jaccard_df[var]
    normalized_values = np.interp(values, (min(values), max(values)), (0, 1))
    cmap = plt.get_cmap('viridis')
    hex_colors = [mcolors.to_hex(cmap(val)) for val in normalized_values]
    color_dict[var] =  hex_colors
    
row_colors = pd.DataFrame(color_dict)

In [None]:
plt.figure(figsize=(20, 16))
cg=sns.clustermap(jaccard_df, metric='jaccard', row_colors=row_colors)
cg.cax.set_visible(False)
plt.show()

In [None]:
pca = PCA()
x_pca = pca.fit_transform(jaccard_df)

sns.barplot(pca.explained_variance_ratio_[:10])

pca_df = pd.DataFrame(data=x_pca, columns=['PC{}'.format(x) for x in range(1, x_pca.shape[1] + 1)], index=jaccard_df.index)

# Keep only the first five components for the downstream plots.
pc_cols = list(pca_df.columns[:5])
pheno_pca_df = pd.merge(pheno_df, pca_df[pc_cols], left_index=True, right_index=True)

# Long format: one row per (colony, PC). The id/value columns are named
# explicitly rather than split at a hard-coded column offset, so the melt stays
# correct if pheno_df gains or loses columns.
pheno_pca_melt_df = pd.melt(pheno_pca_df, id_vars=list(pheno_df.columns), value_vars=pc_cols,
        var_name = 'PC', value_name='PC_value')

In [None]:
variables = ['Apiary', 'Population type']
for v in variables:
    g = sns.FacetGrid(pheno_pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.boxplot,'PC_value',v)
    g.set_titles("{col_name}")

In [None]:
# Pearson correlation is only defined for numeric traits. `variables` still
# holds the categorical traits ('Apiary', 'Population type') from the boxplot
# cell, so the continuous traits are set explicitly here.
variables = [
       'Nb varroas/100 bees',
       'SMR - % infested cells recapped',
       'pin-test -cleared cells/hour']

for pc in ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']:
    df = pheno_pca_melt_df[(pheno_pca_melt_df['PC'] == pc)]
    for v in variables:
        print(pc, v, df[v].corr(df['PC_value']))

In [None]:
variables = [
       #'Number of workers',
       #'Total worker brood (dm2)',
       #'Total honey stores (dm2)',
       #'Surface of pollen (dm2)',
       'Nb varroas/100 bees',
       'SMR - % infested cells recapped',
       'pin-test -cleared cells/hour']

for v in variables:
    g = sns.FacetGrid(pheno_pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.regplot,'PC_value',v,scatter_kws={'s':4})
    g.set_titles("{col_name}")

In [None]:
variables = [
       #'Number of workers',
       #'Total worker brood (dm2)',
       #'Total honey stores (dm2)',
       #'Surface of pollen (dm2)',
       'Nb varroas/100 bees',
       'SMR - % infested cells recapped',
       'pin-test -cleared cells/hour']

for v in variables:
    g = sns.FacetGrid(pheno_pca_melt_df, col='PC', hue='Population type', col_wrap=5, height=3, aspect=1)
    g.map(sns.scatterplot,'PC_value',v)
    g.set_titles("{col_name}")

In [None]:
sns.set_style('whitegrid')
plt.figure(figsize=(8, 8))
x='PC1'
y='PC2'
sns.scatterplot(data=pheno_pca_df, x=x, y=y, hue='Apiary')
plt.xlabel(x, fontsize=14)
plt.ylabel(y, fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# A second plt.legend() call replaces the first legend (and drops its font
# size), so font size and placement are set in a single call.
plt.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
plt.figure(figsize=(8, 8))
x='PC1'
y='PC2'
hue = 'Population type'
sns.scatterplot(data=pheno_pca_df, x=x, y=y, hue=hue)
plt.xlabel(x, fontsize=14)
plt.ylabel(y, fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# A second plt.legend() call replaces the first legend (and drops its font
# size), so font size and placement are set in a single call.
plt.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pheno_pca_df, x='PC2', y='PC3', hue='Nb varroas/100 bees')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pheno_pca_df, x='PC1', y='PC4', hue='SMR - % infested cells recapped')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

### core and bray-curtis

In [None]:
# use only abundant bacteria > 0.001 (as for community distributions)
core_brack_df = brack_df[brack_df['Taxon'].isin(abund_df['Taxon'])]
braycurtis_df = core_brack_df.pivot(index='Colony', columns='Taxon', values='Relative abundance').fillna(0)
len(set(core_brack_df['Taxon'])) == abund_df.shape[0] - 1

In [None]:
cmap = plt.get_cmap("viridis")
cmap.set_bad(color='white')
mask = braycurtis_df == 0

In [None]:
cg = sns.clustermap(braycurtis_df, metric='braycurtis', row_colors=row_colors)#, cmap=cmap, mask=mask)
cg.cax.set_visible(False)

In [None]:
pca = PCA()
x_pca = pca.fit_transform(braycurtis_df)

sns.barplot(pca.explained_variance_ratio_[:10])

# Row labels come from the matrix that was actually decomposed (braycurtis_df),
# not from jaccard_df, which is built from a different set of taxa.
pca_df = pd.DataFrame(data=x_pca, columns=['PC{}'.format(x) for x in range(1, x_pca.shape[1] + 1)], index=braycurtis_df.index)

# Keep only the first three components for the downstream plots.
pc_cols = list(pca_df.columns[:3])
pheno_pca_df = pd.merge(pheno_df, pca_df[pc_cols], left_index=True, right_index=True)

# Long format: one row per (colony, PC). The id/value columns are named
# explicitly rather than split at a hard-coded column offset.
pheno_pca_melt_df = pd.melt(pheno_pca_df, id_vars=list(pheno_df.columns), value_vars=pc_cols,
        var_name = 'PC', value_name='PC_value')

In [None]:
variables = ['Apiary', 'Population type']
for v in variables:
    g = sns.FacetGrid(pheno_pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.boxplot,'PC_value',v)
    g.set_titles("{col_name}")

In [None]:
variables = [
       #'Number of workers',
       #'Total worker brood (dm2)',
       #'Total honey stores (dm2)',
       #'Surface of pollen (dm2)',
       'Nb varroas/100 bees',
       'SMR - % infested cells recapped',
       'pin-test -cleared cells/hour']

for pc in ['PC1', 'PC2', 'PC3']:
    for v in variables:
        df = pheno_pca_melt_df[(pheno_pca_melt_df['PC'] == pc)]
        print(pc, v, df[v].corr(df['PC_value']))

In [None]:
variables = [
       #'Number of workers',
       #'Total worker brood (dm2)',
       #'Total honey stores (dm2)',
       #'Surface of pollen (dm2)',
       'Nb varroas/100 bees',
       'SMR - % infested cells recapped',
       'pin-test -cleared cells/hour']

for v in variables:
    g = sns.FacetGrid(pheno_pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.regplot,'PC_value',v,scatter_kws={'s':4})
    g.set_titles("{col_name}")

In [None]:
variables = [
       #'Number of workers',
       #'Total worker brood (dm2)',
       #'Total honey stores (dm2)',
       #'Surface of pollen (dm2)',
       'Nb varroas/100 bees',
       'SMR - % infested cells recapped',
       'pin-test -cleared cells/hour']

for v in variables:
    g = sns.FacetGrid(pheno_pca_melt_df, col='PC', hue='Population type', col_wrap=5, height=3, aspect=1)
    g.map(sns.scatterplot,'PC_value',v)
    g.set_titles("{col_name}")

In [None]:
sns.set_style('whitegrid')
plt.figure(figsize=(8, 8))
x='PC1'
y='PC2'
sns.scatterplot(data=pheno_pca_df, x=x, y=y, hue='Apiary')
plt.xlabel(x, fontsize=14)
plt.ylabel(y, fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# A second plt.legend() call replaces the first legend (and drops its font
# size), so font size and placement are set in a single call.
plt.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
plt.figure(figsize=(8, 8))
x='PC1'
y='PC2'
hue = 'Population type'
sns.scatterplot(data=pheno_pca_df, x=x, y=y, hue=hue)
plt.xlabel(x, fontsize=14)
plt.ylabel(y, fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# A second plt.legend() call replaces the first legend (and drops its font
# size), so font size and placement are set in a single call.
plt.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pheno_pca_df, x='PC2', y='PC3', hue='Nb varroas/100 bees')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
# pheno_pca_df only carries PC1..PC3 in this section, so PC4 is taken from
# pca_df directly instead of failing on a missing column.
pc1_pc4_df = pd.merge(pheno_df, pca_df[['PC1', 'PC4']], left_index=True, right_index=True)
sns.scatterplot(data=pc1_pc4_df, x='PC1', y='PC4', hue='SMR - % infested cells recapped')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA()
x_pca = pca.fit_transform(braycurtis_df)

In [None]:
x_pca

In [None]:
sns.barplot(pca.explained_variance_ratio_[:10])

In [None]:
# x_pca was fitted on braycurtis_df, so its rows are labelled with that frame's
# index rather than jaccard_df's.
pca_df = pd.DataFrame(data=x_pca, columns=['PC{}'.format(x) for x in range(1, x_pca.shape[1] + 1)], index=braycurtis_df.index)

In [None]:
pheno_pca_df = pd.merge(pheno_df, pca_df.iloc[:, :3], left_index=True, right_index=True)

In [None]:
variables = ['Apiary', 'Population type']
for v in variables:
    g = sns.FacetGrid(pheno_pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.boxplot,'PC_value',v)
    g.set_titles("{col_name}")

In [None]:
variables = [
       'Number of workers',
       'Total worker brood (dm2)',
       'Total honey stores (dm2)',
       'Surface of pollen (dm2)',
       'Nb varroas/100 bees',
       'SMR - % infested cells recapped',
       'pin-test -cleared cells/hour']

for v in variables:
    g = sns.FacetGrid(pheno_pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.regplot,'PC_value',v,scatter_kws={'s':4})
    g.set_titles("{col_name}")

In [None]:
plt.figure(figsize=(8, 8))
x='PC1'
y='PC2'
hue='pin-test -cleared cells/hour'
sns.scatterplot(data=pheno_pca_df, x=x, y=y, hue=hue)
plt.xlabel(x, fontsize=14)
plt.ylabel(y, fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# A second plt.legend() call replaces the first legend (and drops its font
# size), so font size and placement are set in a single call.
plt.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pheno_pca_df, x='PC1', y='PC2', hue='Population type')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pheno_pca_df, x='PC1', y='PC2', hue='Apiary')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

normalization before bray-curtis? Z-scale stuff?

log ratio? 

do I want all my signal to be driven by some very abundant species?


# Whole BeeStrong

In [None]:
import statsmodels.api as sm
import statsmodels.stats.multitest as smm
from scipy import stats

from scipy.stats import pearsonr, mannwhitneyu,  spearmanr
from matplotlib_venn import venn3, venn2

from skbio.stats.ordination import pcoa
from skbio import DistanceMatrix
from scipy.spatial.distance import pdist, squareform
from skbio.diversity import beta_diversity

from skbio.stats.ordination import pcoa
from skbio import DistanceMatrix
from scipy.spatial.distance import pdist, squareform
from skbio.diversity import beta_diversity

figsize=(10, 8)
sns.set_style('whitegrid')

## lib

In [None]:
def parse_kreport(file_path, bs_id, krakdb, readpool, mhg, cs, sf, r):
    """Read two read counts from the top of a Kraken2 report.

    The report file name is assembled from the run parameters; the dot of the
    sampling fraction `sf` is removed (0.5 -> "05").

    Returns
    -------
    (int, int)
        The second whitespace-separated field of the first and of the second
        line of the report, in that order (named unclassified / classified
        sequence counts here).

    Raises
    ------
    FileNotFoundError
        If the report does not exist; the message names the missing file.
    """
    kreport_fn = '{}{}_{}_{}_mhg{}_cs{}_sf{}_rep{}.k2report'.format(file_path, bs_id, krakdb, readpool, mhg, cs, str(sf).replace('.', ''), r)
    # Fail right away with the file name instead of printing a warning and
    # then crashing inside open().
    if not os.path.exists(kreport_fn):
        raise FileNotFoundError('missing {}'.format(kreport_fn))
    with open(kreport_fn, 'r') as inf:
        ucseqs_nr = int(inf.readline().split()[1])
        cseqs_nr = int(inf.readline().split()[1])
    return ucseqs_nr, cseqs_nr

def parse_breport(file_path, bs_id, krakdb, readpool, mhg, cs, sf, r, level, taxa):
    """Get read numbers for the requested taxa from a Bracken report.

    The report is a headerless tab-separated table. Taxon names are read from
    column 5 (leading whitespace stripped) and read numbers from column 1.

    Parameters
    ----------
    taxa : collection of str
        Taxon names to keep; names absent from the report are simply missing
        from the result.

    Returns
    -------
    dict
        Mapping taxon name -> value of column 1.

    Raises
    ------
    FileNotFoundError
        If the report does not exist; the message names the missing file.
    """
    breport_fn = '{}{}_{}_{}_mhg{}_cs{}_sf{}_rep{}_{}.breport'.format(file_path, bs_id, krakdb, readpool, mhg, cs, str(sf).replace('.', ''), r, level)
    # Fail right away with the file name instead of printing a warning and
    # then crashing inside read_csv().
    if not os.path.exists(breport_fn):
        raise FileNotFoundError('missing {}'.format(breport_fn))
    breport_df = pd.read_csv(breport_fn, header=None, sep='\t')
    # Names are indented in the report to show the taxonomy depth.
    breport_df[5] = breport_df[5].str.lstrip()
    return breport_df[breport_df[5].isin(taxa)].set_index(5)[1].to_dict()

def get_total_read_nr(bowtie2_path, bs_id):
    """Return the total read number reported in '<bowtie2_path><bs_id>_bee.out'.

    The number is the first token of the line ending in 'reads; of these:'.
    If several lines match, the last one wins; if none does, None is returned.
    """
    log_fn = '{}{}_bee.out'.format(bowtie2_path, bs_id)
    total_read_nr = None
    with open(log_fn, 'r') as inf:
        for line in inf:
            if not line.endswith('reads; of these:\n'):
                continue
            total_read_nr = int(line.split()[0])
    return total_read_nr

def get_classified_read_nr(bowtie2_path, kraken_path, bracken_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level):
    '''
    Number of reads that end up classified by Bracken.

    Computed as sf * (total reads from the bowtie2 log) minus the reads Kraken2
    left unclassified, minus the reads Kraken2 classified but Bracken did not
    assign under 'root'. Scaling the total by the sampling fraction `sf`
    mimics subsampling the reads before mapping with bowtie2.
    '''
    unclassified_by_kraken, classified_by_kraken = parse_kreport(kraken_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r)
    bracken_counts = parse_breport(bracken_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level, ['root'])
    # Reads Kraken2 classified that Bracken did not keep under 'root'.
    dropped_by_bracken = classified_by_kraken - bracken_counts['root']
    sampled_total = sf * get_total_read_nr(bowtie2_path, bs_id_str)
    return sampled_total - unclassified_by_kraken - dropped_by_bracken

def get_beexact_species_name2taxid():
    """Download the BEExact taxonomy and translate its species names to NCBI taxids.

    Each non-empty line of the taxonomy file is expected to hold a tab-separated
    identifier and a ';'-separated lineage of '<level>__<taxon>' entries, with
    levels d, p, c, o, f, g, s. Underscores in taxon names are replaced by spaces.

    Returns
    -------
    The result of `ncbi.get_name_translator` applied to the set of species
    ('s') names.

    Raises
    ------
    requests.HTTPError
        If the download does not return a success status.
    """
    url = "https://raw.githubusercontent.com/bdaisley/BEExact/master/full_database/BEEx_v2023.01.30___FL-refs_taxonomy.txt"
    # NOTE(review): verify=False disables TLS certificate checking, so the
    # downloaded content is not authenticated. Kept as in the original -- confirm
    # whether it is still required in this environment.
    # The timeout prevents the notebook from hanging forever on a stalled connection.
    response = requests.get(url, verify=False, timeout=60)
    # Stop on an HTTP error instead of trying to parse an error page as taxonomy.
    response.raise_for_status()
    file_content = response.text

    level2beexact_taxa = {level: set() for level in ['d', 'p', 'c', 'o', 'f', 'g', 's']}

    for line in file_content.split('\n'):
        if not line: continue
        for s in line.rstrip(';').split('\t')[1].split(';'):
            # Split on the first '__' only, so a taxon name that itself
            # contains '__' does not break the unpacking.
            level, taxon = s.split('__', 1)
            assert level in level2beexact_taxa, 'missing {}'.format(level)
            level2beexact_taxa[level].add(taxon.replace('_', ' '))

    return ncbi.get_name_translator(level2beexact_taxa['s'])
    
def get_desc_taxa(node, cand_taxa):
    """
    Collect the topmost candidate taxa found in the subtree rooted at `node`.

    The walk is depth-first and stops descending as soon as a node whose
    `sci_name` is in `cand_taxa` is met, so candidates nested below another
    candidate are not reported. Returns a list of scientific names.
    """
    if node.sci_name in cand_taxa:
        return [node.sci_name]
    return [name for child in node.children for name in get_desc_taxa(child, cand_taxa)]

def get_parent_taxon_at_level(taxid, level):
    """
    Name of the ancestor of `taxid` that has rank `level` (e.g. 'family').

    Looks the rank up in the NCBI lineage of `taxid`; returns the ancestor's
    name, or NaN when the lineage has no entry with that rank.
    """
    rank_by_taxid = ncbi.get_rank(ncbi.get_lineage(taxid))
    taxid_by_rank = {rank: tid for tid, rank in rank_by_taxid.items()}
    ancestor_taxid = taxid_by_rank.get(level)
    if not ancestor_taxid:
        return np.nan
    return ncbi.get_taxid_translator([ancestor_taxid])[ancestor_taxid]

def get_class_df(bs_ids, level, group_name2taxid, kraken2_path, bracken_path, bowtie2_path):
    """
    Long-format table of read numbers per colony and high-level category.

    For every colony in `bs_ids` the table holds
      * an 'Unclassified' row (reads Kraken2 left unclassified),
      * a 'Kraken2 only' row (Kraken2-classified reads minus the Bracken 'root' count),
      * one row per entry of the module-level `taxonomic_groups`. Its read number is
        the Bracken count of the group minus the counts of the other groups found
        below it in the NCBI topology; its relative abundance is that number divided
        by the colony's classified read number.
    The first two rows are appended without a relative abundance value.

    Relies on the module-level names `taxonomic_groups` and `group2name`.
    """
    # fixed classification settings used to locate the report files
    krakdb, readpool, mhg, cs, sf, r = 'corent', 'nonbee', 2, '005', 1, 1

    tree = ncbi.get_topology(list(group_name2taxid.values()))

    records = []
    for bs_id in tqdm(bs_ids):
        ucseqs_nr, cseqs_nr = parse_kreport(kraken2_path, bs_id, krakdb, readpool, mhg, cs, sf, r)
        taxa2seqnr = parse_breport(bracken_path, bs_id, krakdb, readpool, mhg, cs, sf, r, level, taxonomic_groups)
        classified_read_nr = get_classified_read_nr(bowtie2_path, kraken2_path, bracken_path, bs_id, krakdb, readpool, mhg, cs, sf, r, level)

        records.append((bs_id, 'Unclassified', ucseqs_nr))
        records.append((bs_id, 'Kraken2 only', cseqs_nr - taxa2seqnr['root']))

        for taxon in taxonomic_groups:
            name = group2name.get(taxon, taxon)
            node = tree&group_name2taxid[name]
            # reads already attributed to other groups nested inside this one
            other_groups = taxonomic_groups.difference({node.sci_name})
            nested_read_nr = sum([taxa2seqnr.get(x, 0) for x in get_desc_taxa(node, other_groups)])
            read_nr = taxa2seqnr.get(taxon, 0) - nested_read_nr
            records.append((bs_id, name, read_nr, read_nr / classified_read_nr))

    return pd.DataFrame(data=records, columns=['Colony', 'Category', 'Read number', 'Relative abundance'])
def get_bracken_df(bowtie2_path, kraken2_path, bracken_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level, taxon_filter):
    '''
    Parse one Bracken output file and recompute abundances relative to all classified reads.

    Parameters
    ----------
    bowtie2_path, kraken2_path, bracken_path : directory prefixes (used as-is, so
        they must end with a path separator).
    bs_id_str, krakdb, readpool, mhg, cs, sf, r : components of the file name.
    level : 'S' (honeybee taxon 'Apis mellifera') or 'F' (honeybee taxon 'Apidae').
    taxon_filter : collection of taxonomy ids to keep; an empty/falsy value keeps all rows.

    Returns
    -------
    DataFrame with the Bracken columns, where 'fraction_total_reads' is replaced by
    new_est_reads / classified reads, plus 'ratio_honeybee' and 'log_ratio_honeybee'
    (reads of the taxon relative to the honeybee read number).

    Raises
    ------
    ValueError : if `level` is neither 'S' nor 'F'.
    FileNotFoundError : if the Bracken file does not exist.
    '''
    # validate the level first: it also determines the file name
    honeybee_taxon_by_level = {'S': 'Apis mellifera', 'F': 'Apidae'}
    if level not in honeybee_taxon_by_level:
        raise ValueError("wrong honeybee taxonomic level: {!r} (expected 'S' or 'F')".format(level))
    honeybee_taxon = honeybee_taxon_by_level[level]

    # parse bracken output
    bracken_fn = '{}{}_{}_{}_mhg{}_cs{}_sf{}_rep{}_{}.bracken'.format(bracken_path, bs_id_str, krakdb, readpool, mhg, cs, str(sf).replace('.', ''), r, level)
    if not os.path.exists(bracken_fn):
        raise FileNotFoundError('missing {}'.format(bracken_fn))
    bracken_df = pd.read_csv(bracken_fn, sep='\t')

    ## calculate original read numbers
    # number of reads classified by kraken2 + bracken AND by bowtie2 --> to calculate relative abundance
    classified_read_nr = get_classified_read_nr(bowtie2_path, kraken2_path, bracken_path, bs_id_str, krakdb, readpool, mhg, cs, sf, r, level)

    # honeybee reads = classified reads - reads of every non-honeybee taxon
    non_bee = bracken_df['name'] != honeybee_taxon
    honeybee_read_nr = classified_read_nr - bracken_df.loc[non_bee, 'new_est_reads'].sum()

    est_reads = bracken_df['new_est_reads'].to_numpy()

    # recompute relative abundance
    bracken_df['fraction_total_reads'] = est_reads / classified_read_nr

    # calculate Aitchison log ratio
    bracken_df['ratio_honeybee'] = est_reads / honeybee_read_nr
    bracken_df['log_ratio_honeybee'] = np.log(est_reads / honeybee_read_nr)

    # log ratio of absolute abundances == log ratio of relative abundances
    # (tolerance-based check: comparing rounded values can fail spuriously when
    # two nearly identical floats fall on either side of a rounding boundary)
    log_ratio_from_fractions = np.log(bracken_df['fraction_total_reads'].to_numpy() / (honeybee_read_nr / classified_read_nr))
    assert np.allclose(bracken_df['log_ratio_honeybee'].to_numpy(), log_ratio_from_fractions, rtol=0, atol=1e-9)

    if taxon_filter:
        return bracken_df[bracken_df['taxonomy_id'].isin(taxon_filter)]
    else:
        return bracken_df

def get_brack_df(bs_ids, level, bowtie2_path, kraken2_path, bracken_path, group_taxid2name):
    """
    Stack the Bracken tables of all colonies and annotate each taxon with its group.

    Returns a tuple (taxid2parent_group, brack_df):
      * taxid2parent_group maps every TaxID of the table to the name of the last
        entry of its NCBI lineage that is a key of `group_taxid2name`;
      * brack_df has one row per (colony, taxon) with the columns Colony, Group,
        Taxon, TaxID, Relative abundance, Read number, Log Ratio Honeybee and
        Ratio Honeybee.
    """
    # fixed classification settings used to locate the Bracken files
    krakdb, readpool, mhg, cs, sf, r = 'corent', 'nonbee', 2, '005', 1, 1

    kept_columns = ['name', 'taxonomy_id', 'fraction_total_reads', 'new_est_reads', 'log_ratio_honeybee', 'ratio_honeybee']
    display_names = {
        'name': 'Taxon',
        'taxonomy_id': 'TaxID',
        'fraction_total_reads': 'Relative abundance',
        'new_est_reads': 'Read number',
        'ratio_honeybee': 'Ratio Honeybee',
        'log_ratio_honeybee': 'Log Ratio Honeybee',
    }

    colony_tables = []
    for bs_id in tqdm(bs_ids):
        colony_df = get_bracken_df(bowtie2_path, kraken2_path, bracken_path, bs_id, krakdb, readpool, mhg, cs, sf, r, level, {})
        colony_df = colony_df[kept_columns]
        colony_df.insert(0, 'Colony', ['{}'.format(bs_id)] * len(colony_df))
        colony_tables.append(colony_df)

    brack_df = pd.concat(colony_tables).rename(columns=display_names).reset_index(drop=True)

    # enhance brack df with key taxonomic groups (from previous section)
    taxid2parent_group = {}
    for tid in tqdm(set(brack_df['TaxID'])):
        matching_groups = [group_taxid2name[p] for p in ncbi.get_lineage(tid) if p in group_taxid2name]
        if matching_groups:
            # the last match in the lineage wins
            taxid2parent_group[tid] = matching_groups[-1]

    brack_df.insert(1, 'Group', [taxid2parent_group[x] for x in brack_df['TaxID']])
    return taxid2parent_group, brack_df

def get_prev_df(brack_df, taxid2parent_group, lod):
    """
    Per-taxon prevalence and abundance summary above a limit of detection.

    Only rows of `brack_df` with 'Relative abundance' >= `lod` are used. For each
    remaining taxon the result gives its group, TaxID, prevalence (number of
    retained rows divided by the length of the module-level `bs_ids`) and the
    median and mean of its retained relative abundances.
    """
    detected = brack_df[(brack_df['Relative abundance'] >= lod)]

    # median and mean abundance of each taxon
    abundance_by_taxon = detected.groupby('Taxon')['Relative abundance']
    sp2median_abund = abundance_by_taxon.median().to_dict()
    sp2mean_abund = abundance_by_taxon.mean().to_dict()

    name_taxid_pairs = detected[['Taxon', 'TaxID']].drop_duplicates()
    sp_name2taxid = dict(zip(name_taxid_pairs['Taxon'], name_taxid_pairs['TaxID']))

    species, counts = np.unique(detected['Taxon'], return_counts=True)
    species_freqs = counts / len(bs_ids)

    taxids = [sp_name2taxid[sp] for sp in species]
    return pd.DataFrame({
        'Taxon': list(species),
        'Group': [taxid2parent_group[tid] for tid in taxids],
        'TaxID': taxids,
        'Prevalence': list(species_freqs),
        'Median Relative Abundance': [sp2median_abund[sp] for sp in species],
        'Mean Relative Abundance': [sp2mean_abund[sp] for sp in species],
    })
    
def get_pv_con(X, y, method):
    """
    Correlation between two continuous variables.

    `method` selects the test:
      'pearson'      - Pearson correlation, two-sided
      'pearson_perm' - Pearson correlation with a permutation p-value (999 resamples)
      'spearman'     - Spearman rank correlation, two-sided
    Returns (statistic, p-value), or None for any other `method`.
    """
    if method == 'pearson_perm':
        outcome = pearsonr(X, y, method=stats.PermutationMethod(n_resamples=999), alternative='two-sided')
    elif method == 'pearson':
        outcome = pearsonr(X, y, alternative='two-sided')
    elif method == 'spearman':
        outcome = spearmanr(X, y, alternative='two-sided')
    else:
        return None

    corr, p_value = outcome
    return corr, p_value

def get_pv_cat(x, y, method):
    """
    Compare two independent samples.

    `method` selects the test:
      'tt'       - independent two-sample t-test (stats.ttest_ind)
      'wilcoxon' - two-sided Mann-Whitney U test (noted as pretty slow)
    Returns (statistic, p-value), or None for any other `method`.
    """
    if method == 'tt':
        outcome = stats.ttest_ind(x, y)
    elif method == 'wilcoxon':
        outcome = mannwhitneyu(x, y, alternative='two-sided')
    else:
        return None

    statistic, p_value = outcome
    return statistic, p_value

def get_pv_df(symbio_pheno_df, symbio_pivot_df, y_variables, min_colony_nr, con_methods, cat_methods, limit_of_detection):
    """
    Test every species column against every phenotype variable.

    The species columns are columns 1 .. len(symbio_pivot_df.columns)-1 of
    `symbio_pheno_df`. For each (species, y variable) pair:
      * continuous methods relate abundance to the phenotype over the colonies where
        the species is at or above `limit_of_detection` and the phenotype is known;
        run only if at least `min_colony_nr` such colonies exist;
      * categorical methods compare the phenotype of colonies with the species at or
        above the limit against colonies below it; run only if both groups hold at
        least `min_colony_nr` colonies.
    Returns a table with columns Species, Y variable, Method, Stat, P-value; rows
    containing NaN are dropped.
    """
    records = []
    species_columns = symbio_pheno_df.columns[1:len(symbio_pivot_df.columns)]
    for sp in tqdm(species_columns):
        for y_name in y_variables:
            has_pheno = ~(symbio_pheno_df[y_name].isna())
            present = (symbio_pheno_df[sp] >= limit_of_detection) & has_pheno
            absent = (symbio_pheno_df[sp] < limit_of_detection) & has_pheno

            # continuous methods: abundance vs phenotype where the species is detected
            if sum(present) >= min_colony_nr:
                abundances = symbio_pheno_df[sp][present]
                phenotypes = symbio_pheno_df[y_name][present]
                for method in con_methods:
                    stat, pvalue = get_pv_con(abundances, phenotypes, method)
                    records.append((sp, y_name, method, stat, pvalue))

            # categorical methods: phenotype with vs without the species
            pheno_present = symbio_pheno_df.loc[present, y_name].to_numpy()
            pheno_absent = symbio_pheno_df.loc[absent, y_name].to_numpy()
            if len(pheno_present) >= min_colony_nr and len(pheno_absent) >= min_colony_nr:
                for method in cat_methods:
                    stat, pvalue = get_pv_cat(pheno_present, pheno_absent, method)
                    records.append((sp, y_name, method, stat, pvalue))

    result = pd.DataFrame(records, columns=['Species', 'Y variable', 'Method', 'Stat', 'P-value'])
    return result.dropna().reset_index(drop=True)
    
def get_fdr_df(pv_df, y_variables, methods):
    """
    Add Benjamini-Hochberg adjusted p-values to a table of test results.

    Keeps the rows of `pv_df` whose 'Y variable' is in `y_variables` and whose
    'Method' is in `methods`, and adds an 'FDR' column. The correction is applied
    separately within each (y variable, method) combination. Combinations without
    any row are skipped.

    The 'FDR' column is addressed by label, so the result does not depend on the
    number or order of the columns of `pv_df`.
    """
    selected = pv_df['Y variable'].isin(y_variables) & pv_df['Method'].isin(methods)
    fdr_df = pv_df[selected].copy()
    fdr_df['FDR'] = np.nan

    for (y_name, method) in product(y_variables, methods):
        # positional boolean mask: independent of the (possibly non-unique) index labels
        in_family = ((fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == method)).to_numpy()
        if not in_family.any():
            continue
        pvalues = fdr_df.loc[in_family, 'P-value'].to_numpy()
        fdr_df.loc[in_family, 'FDR'] = smm.multipletests(pvalues, alpha=0.05, method='fdr_bh')[1]

    return fdr_df

def get_row_colors(pheno_pivot_df, discrete_vars, continous_vars, var2cmap):
    """
    Build a table of hex colours (one column per variable) to annotate heatmap rows.

    Discrete variables are mapped through `var2cmap[var]` when available, otherwise
    through a 'colorblind' palette with one colour per distinct value. Continuous
    variables are rescaled to [0, 1] between their smallest and largest non-missing
    value and mapped through the 'viridis' colormap.

    Every column is a Series indexed like `pheno_pivot_df`, so the result keeps the
    row labels even when only continuous variables are requested.
    """
    color_dict = {}
    for var in discrete_vars:
        values = pheno_pivot_df[var]
        if var in var2cmap:
            cmap = var2cmap[var]
        else:
            levels = values.unique()
            palette = sns.color_palette('colorblind', n_colors=len(levels)).as_hex()
            cmap = dict(zip(levels, [x.upper() for x in palette]))
        color_dict[var] = values.map(cmap)

    for var in continous_vars:
        values = pheno_pivot_df[var]
        # Series.min/max skip NaN; the builtin min()/max() give order-dependent
        # results as soon as the column contains a missing value
        lowest, highest = values.min(), values.max()
        normalized_values = np.interp(values, (lowest, highest), (0, 1))
        cmap = plt.get_cmap('viridis')
        hex_colors = [mcolors.to_hex(cmap(val)) for val in normalized_values]
        color_dict[var] = pd.Series(hex_colors, index=values.index)

    return pd.DataFrame(color_dict)

def print_group_richness(df, lod, categories):
    """
    Number of distinct taxa per group, without and with a limit of detection.

    Returns a table indexed by group, in the order of `categories`, with the
    columns 'No LoD' (all taxa) and 'LoD <lod>' (taxa with at least one relative
    abundance >= `lod`). Groups missing from either count are left out. Despite
    its name the function returns the table instead of printing it.
    """
    lod_column = 'LoD {}'.format(lod)
    above_lod = df[df['Relative abundance'] >= lod]

    richness_all = df.groupby('Group')['Taxon'].nunique().to_frame('No LoD')
    richness_lod = above_lod.groupby('Group')['Taxon'].nunique().to_frame(lod_column)

    richness = richness_all.merge(richness_lod, on='Group').reindex(categories)
    return richness.dropna().astype(int)

def plot_pca(pc_x, pc_y, pca_df, hue, cmap_hue, pca_explained_variance):
    """
    Scatter plot of two principal components coloured by `hue`.

    pc_x, pc_y : 1-based component numbers; the columns 'PC<pc_x>' and 'PC<pc_y>'
        of `pca_df` are plotted.
    hue, cmap_hue : column used for colouring and its palette.
    pca_explained_variance : explained variance ratio per component (0-based),
        shown as a percentage in the axis labels.
    """
    sns.set_style('whitegrid')
    plt.figure(figsize=(8, 8))
    x='PC{}'.format(pc_x)
    y='PC{}'.format(pc_y)
    sns.scatterplot(data=pca_df, x=x, y=y, hue=hue, palette=cmap_hue)
    plt.xlabel('{} ({}% variance)'.format(x, round(100 * pca_explained_variance[pc_x -1], 0)), fontsize=14)
    plt.ylabel('{} ({}% variance)'.format(y, round(100 * pca_explained_variance[pc_y -1], 0)), fontsize=14)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    # single legend call: a second plt.legend() would replace the first one and
    # silently drop its font size
    plt.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.show()

def get_fam_enrich_df(df, fdr_threshold=0.1):
    """
    Observed vs expected number of significant taxa per family.

    df : table with the columns 'FDR' and 'Family' (rows without a family are ignored).
    fdr_threshold : rows with FDR strictly below this value form the foreground
        (default 0.1).

    The expected count of a family is its share of the background (all rows)
    multiplied by the foreground size. Returns one row per background family with
    the observed count, the expected count and their ratio, sorted by decreasing
    ratio. The ratio is NaN where the expected count is zero (empty foreground).
    """
    # count foreground families (significant correlation with varroa)
    fg = df[df['FDR'] < fdr_threshold]['Family'].dropna()
    fg_fam2counts = fg.value_counts().to_dict()

    # and background families
    bg = df['Family'].dropna()
    bg_families, bg_counts = np.unique(bg, return_counts=True)

    # calculate expected count for each family based on background
    exp_counts = bg_counts / bg_counts.sum() * fg.size
    obs_counts = np.array([fg_fam2counts.get(f, 0) for f in bg_families])

    # explicit NaN instead of a 0/0 RuntimeWarning when nothing is expected
    ratios = np.divide(obs_counts, exp_counts, out=np.full(len(bg_families), np.nan), where=exp_counts > 0)

    return pd.DataFrame({
    'Family': bg_families,
    'Observed count': obs_counts,
    'Expected count': exp_counts,
    'Observed/expected ratio': ratios
    }).sort_values('Observed/expected ratio', ascending=False)

# Shorten long tick labels
def truncate(label, max_len=25):
    """
    Return str(label), cut to its first `max_len` characters and followed by
    "..." when it is longer than `max_len`; shorter labels are returned unchanged.
    """
    text = str(label)
    if len(text) > max_len:
        return text[:max_len] + "..."
    return text

def plot_clustermap(taxon_df, pheno_df, continous_vars, var2cmap, distance, figsize=(12, 8)):
    """
    Clustered heatmap of a colony x taxon table with phenotype colour annotations.

    taxon_df : table with a 'Colony' column plus one column per taxon.
    pheno_df : phenotype table with a 'Colony' column and at least 'month' and 'region'.
    continous_vars : continuous phenotype columns to add as row colours.
    var2cmap : colour maps per variable; must contain 'region', which drives both
        the row colours and the legend.
    distance : metric passed to seaborn.clustermap.

    Shows the figure and returns the seaborn ClusterGrid.
    """
    discrete_vars = ['month', 'region']

    # add metadata and reset indexes
    pheno_taxon_df = pd.merge(pheno_df, taxon_df, on='Colony')
    taxon_df = taxon_df.set_index('Colony')
    pheno_taxon_df = pheno_taxon_df.set_index('Colony')

    row_colors = get_row_colors(pheno_taxon_df, discrete_vars, continous_vars, var2cmap)

    # label roughly 75 columns at most
    cg=sns.clustermap(taxon_df, metric=distance, row_colors=row_colors, figsize=figsize, xticklabels=math.ceil(taxon_df.shape[1] / 75))

    # Apply truncation to tick labels
    cg.ax_heatmap.set_xticklabels([truncate(label.get_text()) for label in cg.ax_heatmap.get_xticklabels()])
    cg.ax_heatmap.set_yticklabels([truncate(label.get_text()) for label in cg.ax_heatmap.get_yticklabels()])

    # legend built from the same colour map that coloured the rows, not from a
    # module-level dictionary, so patches and labels cannot get out of sync
    region_cmap = var2cmap['region']
    region_patches = [Patch(facecolor=region_cmap[name]) for name in region_cmap]
    cg.ax_row_colors.legend(region_patches, list(region_cmap), title='Region',bbox_to_anchor=(1, 1), bbox_transform=plt.gcf().transFigure, loc='upper left')
    plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), fontsize=8)
    # cg.ax_row_colors.legend([Patch(facecolor=cmap_months[name]) for name in cmap_months], cmap_months, title='Month',bbox_to_anchor=(1, 0), bbox_transform=plt.gcf().transFigure, loc='lower left')
    cg.cax.set_visible(False)
    plt.show()
    return cg

def filter_group_pivot_df(pivot_df, group, taxon2group):
    """
    Keep the 'Colony' column and the taxon columns whose group (looked up in
    `taxon2group`) is contained in `group`.
    """
    kept_columns = []
    for column in pivot_df.columns:
        if column == 'Colony' or taxon2group.get(column, None) in group:
            kept_columns.append(column)
    return pivot_df[kept_columns]

## set env

In [None]:
# Directory layout for the BeeStrong analysis. Every directory string ends with
# '/', because file names are appended to it directly.
# Note: `working_path` and `pheno_df` re-bind names already set at the top of the notebook.
shared_data_path = '/cluster/raid/home/f80878961/data/'
working_path = '/cluster/raid/home/f80878961/beestrong/'
data_path = working_path + 'data/'
tlf_path = working_path + 'tlf/'

# raw data and per-step output directories
sample_SRA_file = data_path + 'sample_SRA.csv'
sra_path = data_path + 'BeeStrongSRA/'
fastq_path = data_path + 'BeeStrongFastQ/'
bowtie2_path = data_path + 'bowtie2/'
kraken2_path = data_path + 'kraken2/'
bracken_path = data_path + 'bracken/'

# shared reference data: bowtie2 index and Kraken2 database
idx_path = shared_data_path + 'bee_bt2idx/'
idx_name = 'bee'
kdb_path = shared_data_path + 'krakdb/'
kdb_name = 'corent'

# bs_id to SRA ids; the two ids in skip_bs_ids are left out of bs_ids
bs_sra_df = pd.read_table(sample_SRA_file)
skip_bs_ids = {'BS17_0674', 'BS17_0721'}
bs_ids = sorted(x for x in set(bs_sra_df['num_bs'].to_list()) if x not in skip_bs_ids)

# load cleaned phenotype dataset
pheno_df = pd.read_csv(data_path + 'whole_beestrong_data_cleaned.csv')
pheno_df

In [None]:
## which high level taxa we care about?
taxonomic_groups = set(('root', 'Viruses','Bacteria', 'Archaea', 'Eukaryota', 'Embryophyta', 'Fungi', 'Metazoa', 'Apidae','Varroidae'))
# display names for the groups that only hold the reads not assigned to a nested group
group2name = {'root': 'Other Root', 'Eukaryota': 'Other Eukaryota', 'Metazoa': 'Other Metazoa'}
# NOTE(review): 'Metazoa' is listed here although group2name renames that group to
# 'Other Metazoa', and 'Apidae' is absent - confirm this is intended.
categories=['Other Root', 'Viruses','Bacteria', 'Archaea','Other Eukaryota','Embryophyta','Fungi','Metazoa', 'Varroidae']

# display name <-> NCBI taxid (first taxid returned for each name)
group_name2taxid = {group2name.get(k, k): v[0] for k, v in ncbi.get_name_translator(taxonomic_groups).items()}
group_taxid2name = {v:k for k, v in group_name2taxid.items()}
# groups kept for the analyses (an earlier assignment derived from taxonomic_groups
# was immediately overwritten by this list and has been dropped)
groups = ['Viruses', 'Bacteria', 'Archaea', 'Other Eukaryota',  'Embryophyta', 'Fungi']

# colour per read category
cmap = {
    'Unclassified': '#262626',
    'Kraken2 only': '#575757',
    'Other Root': '#949494',
    'Viruses': '#0173b2',
    'Bacteria': '#de8f05',
    'Archaea': '#d55e00',
    'Other Eukaryota': '#029e73',
    'Embryophyta':'#12634c',
    'Apidae': '#75c8b0',
    'Fungi': '#cc78bc',
    'Other Metazoa': '#ca9161',
    'Varroidae': '#ece133'
}

# colour per region (some regions deliberately share a colour)
cmap_region = {
    'sweden':'#ece133',
    'netherlands':'#de8f05',
    'luxembourg': '#de8f05',
    'grand_est': '#d55e00',
    'bourgogne_franche_comte': '#d55e00',
    'suisse': '#da291c',
    'auvergne_rhone_alpes': '#56b4e9',
    'provence_alpes_cote_azur': '#56b4e9',
    'corse': '#0173b2',
    'new_zealand': '#3b3b3b',
    'bretagne': '#ca9161',
    'pays_de_la_loire': '#029e73',
    'centre_val_de_loire': '#029e73',
    'nouvelle_aquitaine': '#cc78bc',
    'occitanie': '#fbafe4',
    'unknown' : '#949494'
}
# legend labels with colony counts; a region without any colony gets n=0 instead of a KeyError
region_counts = pheno_df['region'].value_counts()
region_labels=['{} n={}'.format(x.replace('_', ' ').capitalize(), region_counts.get(x, 0))  for x in cmap_region.keys()]

months = sorted(set(pheno_df['month'].dropna()))
cmap_months = dict(zip(months, [x.upper() for x in list(sns.color_palette("light:b", n_colors=len(months)).as_hex())]))

genetic_groups = sorted(set(pheno_df['group'].dropna()))
# one colour per genetic group (sizing the palette by the number of months would
# leave groups without a colour whenever there are more groups than months)
cmap_group = dict(zip(genetic_groups, [x.upper() for x in list(sns.color_palette("colorblind", n_colors=len(genetic_groups)).as_hex())]))

var2cmap = {'region': cmap_region,
           'month': cmap_months}

In [None]:
# 9-step seaborn light palette built on '#000000'; the last expression displays its hex codes
light_palette = sns.light_palette('#000000', n_colors=9)
light_palette.as_hex()

In [None]:
# same hex codes as plain text (light_palette comes from the previous cell)
print(light_palette.as_hex())

## Reads to Species abundances

In [None]:
# list of beestrong sample ids for array job
bs_ids_file = '{}bs_ids.txt'.format(data_path)
# with open(bs_ids_file, 'w') as outf:
#     outf.write('\n'.join(sorted(set(bs_sra_df['num_bs']))))

# get mapping between array ids and bs ids
# (array id == 1-based line number in bs_ids_file)
with open(bs_ids_file, 'r') as inf:
    lines = inf.read().splitlines()

# Blank lines (e.g. from a trailing newline) are skipped without renumbering.
# An empty sample id must never enter the mapping: the clean-up cells build glob
# patterns such as '<directory><bs_id>*', which would match every file of the
# directory for an empty id.
array_id2bs_id = {array_id: bs_id for array_id, bs_id in enumerate(lines, start=1) if bs_id.strip()}

bs_id2array_id ={v:k for k, v in array_id2bs_id.items()}

### Check downloaded SRA files

In [None]:
# compare the downloaded SRA files with the runs listed in the sample table
sra_ids = [path.rsplit('/', 1)[-1] for path in glob.glob('{}BeeStrongSRA/*'.format(data_path))]
expected_runs = set(bs_sra_df['Run'])

# listed runs (total, distinct), then downloaded files (total, distinct)
print(len(bs_sra_df['Run'].to_list()))
print(len(expected_runs))
print(len(sra_ids))
print(len(set(sra_ids)))
# listed but not downloaded, then downloaded but not listed
print(expected_runs.difference(sra_ids))
print(set(sra_ids).difference(bs_sra_df['Run']))

The download is complete, although two more files were downloaded than are listed in the correspondence table...

In [None]:
# number of distinct SRA runs, then number of distinct colony ids (num_bs)
print(len(set(bs_sra_df['Run'])))
print(len(set(bs_sra_df['num_bs'])))

1513 is the number of colonies also reported in the paper, but it seems that some colonies have more than one sample.

In [None]:
# number of SRA runs listed per colony id
bs_id2sra_count = collections.Counter(bs_sra_df['num_bs'].to_list())
bs_id2sra_count

### Preprocessing and QC

per beestrong id:
1. convert SRA files to fastq (careful of disk space usage)
2. merge fastq files when >1 and rename
3. trim poly-G tails (cutadapt)
4. FastQC
5. gzip

In [None]:
# Sort the preprocessing ('pp') array jobs into running, pending, completed and to-run.
pp_running_jobs = get_status_job_array_ids(status='RUNNING', job_name='pp')
pp_pending_jobs = get_status_job_array_ids(status='PENDING', job_name='pp')

pp_completed_jobs = []
for array_id, bs_id in tqdm(array_id2bs_id.items()):

    # exactly one log file is expected per array id; anything else is reported and skipped
    x = glob.glob('{}/pp_*_{}.out'.format(fastq_path, array_id))
    if len(x) != 1:
        print(x)
        continue
    out_file = x[0]
    with open(out_file, 'r') as inf:
        lines = inf.readlines()
    # the log must contain no error line and must end with DONE
    no_error = not any(line.startswith(('ERROR', 'No reads processed!')) for line in lines)
    done = lines and lines[-1].startswith('DONE')

    # expected files
    gz_1_fn = '{}{}_1.fastq.gz'.format(fastq_path, bs_id)
    gz_2_fn = '{}{}_2.fastq.gz'.format(fastq_path, bs_id)
    fqc_1_fn = '{}{}_1_fastqc.html'.format(fastq_path, bs_id)
    fqc_2_fn = '{}{}_2_fastqc.html'.format(fastq_path, bs_id)

    # condition for completion: every expected file exists and is larger than 100 bytes
    gz_1, gz_2, fqc_1, fqc_2 = [
        os.path.exists(fn) and os.path.getsize(fn) > 100
        for fn in (gz_1_fn, gz_2_fn, fqc_1_fn, fqc_2_fn)
    ]

    if all((no_error, done, gz_1, gz_2, fqc_1, fqc_2)):
        pp_completed_jobs.append(array_id)

pp_jobs_to_run = sorted(set(array_id2bs_id).difference(pp_completed_jobs + pp_running_jobs + pp_pending_jobs))
print(len(pp_completed_jobs))
print(len(pp_jobs_to_run))


In [None]:
# # remove old logs when more than one
# for array_id, bs_id in tqdm(array_id2bs_id.items()):
# 
#     # ensure latest log file is there and finishes by DONE
#     x = glob.glob('{}/pp_*_{}.out'.format(fastq_path, array_id))
#     if len(x) != 1: 
#         for f in sorted(x)[:-1]:
#             os.remove(f)

# # remove older logs
# for array_id in tqdm(array_id2bs_id):
#     bs_id = array_id2bs_id[array_id]
#     
#     # ensure latest log file is there and finishes by DONE
#     x = glob.glob('{}/pp_*_{}.err'.format(fastq_path, array_id))
#     if len(x) > 1:
#         for f in sorted(x)[:-1]:
#             os.remove(f)

In [None]:
# remove temporary SRR fastq
for fn in glob.glob('{}SRR*.fastq'.format(fastq_path)):
    os.remove(fn)

# reset every job that has to be (re)run
for array_id in tqdm(pp_jobs_to_run):
    bs_id = array_id2bs_id[array_id]
    if not bs_id:
        # an empty id would turn the pattern below into '<fastq_path>*' and delete
        # every file of the directory
        print('skipping array id {}: empty sample id'.format(array_id))
        continue
    # remove logs
    for fn in glob.glob('{}pp_*_{}.*'.format(fastq_path, array_id)):
        os.remove(fn)
    # and uncompleted files
    for fn in glob.glob('{}{}*'.format(fastq_path, bs_id)):
        os.remove(fn)

In [None]:
# Split the jobs to run into consecutive chunks, one per node, and submit one
# SLURM array job per chunk.
nodes = ['node01', 'node02', 'node03', 'node04', 'node05', 'node07', 'node08']
parallel_job_nr = 1
group_size = math.ceil(len(pp_jobs_to_run) / (len(nodes)))
pp_script = '{}pp.run'.format(script_path)

i = 0
for node_str in nodes:
    j = min(i + group_size, len(pp_jobs_to_run))
    if i >= j:
        # no job left for this node or any later one: do not submit empty arrays
        break
    print(node_str, i, j)
    array_str = write_array_str(pp_jobs_to_run[i: j], parallel_job_nr)
    print(array_str)
    i = j

    # write the SLURM script (the same file is rewritten for each node)
    write_preprocessing_script(pp_script, array_str, node_str)

    # submit from fastq_path
    command = ['sbatch', pp_script, bs_ids_file, sra_path, fastq_path, sample_SRA_file]
    result = subprocess.run(command, cwd=fastq_path, capture_output=False)
    if result.returncode != 0:
        print('sbatch failed for {} (exit code {})'.format(node_str, result.returncode))

In [None]:
# List the SRA files of all colonies whose preprocessing completed, one path per line.
# TO DO: update to check if existing
sra_to_delete = []
for array_id in pp_completed_jobs:
    bs_id = array_id2bs_id[array_id]
    runs_of_colony = bs_sra_df.loc[bs_sra_df['num_bs'] == bs_id, 'Run']
    sra_to_delete += runs_of_colony.to_list()

with open(data_path + 'SRA_to_delete.txt', 'w') as outf:
    outf.write('\n'.join('{}{}'.format(sra_path, run) for run in sra_to_delete))

### Bowtie2

In [None]:
# Sort the bowtie2 array jobs into running, pending, completed and to-run.
# Only colonies whose preprocessing completed are considered.
bowtie2_running_jobs = get_status_job_array_ids(status='RUNNING', job_name='bowtie2')
bowtie2_pending_jobs = get_status_job_array_ids(status='PENDING', job_name='bowtie2')
bowtie2_completed_jobs = []

for array_id in tqdm(pp_completed_jobs):
    bs_id = array_id2bs_id[array_id]

    # exactly one log file is expected per array id; anything else is reported and skipped
    x = glob.glob('{}/bowtie2_*_{}.out'.format(bowtie2_path, array_id))
    if len(x) != 1:
        print(x)
        continue

    out_file = x[0]
    with open(out_file, 'r') as inf:
        lines = inf.readlines()
    # the log must contain no '(ERR)' line and must end with DONE
    no_error = not any(line.startswith('(ERR)') for line in lines)
    done = lines and lines[-1].startswith('DONE')

    # expected files (the mapped BAM file is not checked)
    out_fn = '{}{}_bee.out'.format(bowtie2_path, bs_id)
    fqc_1_fn = '{}{}_bee_unmapped.1.fastq.gz'.format(bowtie2_path, bs_id)
    fqc_2_fn = '{}{}_bee_unmapped.2.fastq.gz'.format(bowtie2_path, bs_id)

    # condition for completion: every expected file exists and is larger than 100 bytes
    out, fqc_1, fqc_2 = [
        os.path.exists(fn) and os.path.getsize(fn) > 100
        for fn in (out_fn, fqc_1_fn, fqc_2_fn)
    ]

    if all((done, no_error, out, fqc_1, fqc_2)):
        bowtie2_completed_jobs.append(array_id)

bowtie2_jobs_to_run = list(set(pp_completed_jobs).difference(bowtie2_completed_jobs + bowtie2_running_jobs + bowtie2_pending_jobs))
print(len(bowtie2_completed_jobs))
print(len(bowtie2_jobs_to_run))

In [None]:
# # remove older logs
# for array_id in tqdm(pp_completed_jobs):
#     bs_id = array_id2bs_id[array_id]
#     
#     # ensure latest log file is there and finishes by DONE
#     x = glob.glob('{}/bowtie2_*_{}.out'.format(bowtie2_path, array_id))
#     if len(x) > 1:
#         for f in sorted(x)[:-1]:
#             os.remove(f)
# # remove older logs
# for array_id in tqdm(pp_completed_jobs):
#     bs_id = array_id2bs_id[array_id]
#     
#     # ensure latest log file is there and finishes by DONE
#     x = glob.glob('{}/bowtie2_*_{}.err'.format(bowtie2_path, array_id))
#     if len(x) > 1:
#         for f in sorted(x)[:-1]:
#             os.remove(f)

In [None]:
# reset every bowtie2 job that has to be (re)run
for array_id in tqdm(bowtie2_jobs_to_run):
    bs_id = array_id2bs_id[array_id]
    if not bs_id:
        # an empty id would turn the pattern below into '<bowtie2_path>*' and delete
        # every file of the directory
        print('skipping array id {}: empty sample id'.format(array_id))
        continue
    # remove logs
    for fn in glob.glob('{}bowtie2_*_{}.*'.format(bowtie2_path, array_id)):
        os.remove(fn)
    # and uncompleted files
    for fn in glob.glob('{}{}*'.format(bowtie2_path, bs_id)):
        os.remove(fn)

In [None]:
# Write the bowtie2 job script; running things in home turned out to be most efficient.
# The second argument of write_array_str is the same slot that receives
# parallel_job_nr in the preprocessing cell.
bowtie2_script_home = script_path + 'bowtie2_home.run'
array_str = write_array_str(bowtie2_jobs_to_run, 8)
write_bowtie2_script_home(bowtie2_script_home, array_str)

In [None]:
%%bash -s "$bowtie2_script_home" "$idx_path" "$idx_name" "$fastq_path" "$bowtie2_path" "$bs_ids_file"
# $1 job script, $2 index directory, $3 index name, $4 fastq directory,
# $5 bowtie2 output directory, $6 sample id list
# Submit from the output directory (the job-status cell looks for the logs there);
# stop if it cannot be entered instead of submitting from the wrong place.
cd "$5" || exit 1
sbatch "$1" "$2" "$3" "$4" "$5" "$6"

In [None]:
# List the mapped-read BAM file of every completed bowtie2 job, one path per line.
with open(data_path + 'bam_to_copy.txt', 'w') as outf:
    bam_paths = ['{}{}_bee_mapped.bam'.format(bowtie2_path, array_id2bs_id[array_id]) for array_id in bowtie2_completed_jobs]
    outf.write('\n'.join(bam_paths))

In [None]:
# array ids whose bowtie2 job passed the completion checks
bowtie2_completed_jobs

### Kraken2

- It would be nice if multiple jobs could read from the same database loaded in memory...

In [None]:
# Kraken2 settings used by the cells below. They are strings; cs_str is the
# confidence score without the dot, as used in the output file names.
read_pool, mhg, cs = 'nonbee', '2', '0.05'
cs_str = cs.replace('.', '')

In [None]:
# Sort the Kraken2 array jobs into running, pending, completed and to-run.
# Only colonies whose bowtie2 job completed are considered.
kraken2_running_jobs = get_status_job_array_ids(status='RUNNING', job_name='kraken2')
kraken2_pending_jobs = get_status_job_array_ids(status='PENDING', job_name='kraken2')
kraken2_completed_jobs = []
for array_id in tqdm(bowtie2_completed_jobs):
    bs_id = array_id2bs_id[array_id]

    # exactly one log file is expected per array id; several logs are reported,
    # a missing log is skipped silently
    x = glob.glob('{}/kraken2_*_{}.out'.format(kraken2_path, array_id))
    if len(x) > 1:
        print(x)
    if len(x) != 1:
        continue

    out_file = x[0]
    with open(out_file, 'r') as inf:
        lines = inf.readlines()
    # the log must end with DONE
    done = lines and lines[-1].startswith('DONE')

    # expected files
    out = '{}{}_{}_{}_mhg{}_cs{}_sf1_rep1'.format(kraken2_path, bs_id, kdb_name, read_pool, mhg, cs_str)
    kout_fn = '{}.kraken2'.format(out)
    krp_fn = '{}.k2report'.format(out)
    fqc_1_fn = '{}_cseqs_1.fq.gz'.format(out)
    fqc_2_fn = '{}_cseqs_2.fq.gz'.format(out)

    # condition for completion: every expected file exists and is larger than 100 bytes
    kout, krp, fqc_1, fqc_2 = [
        os.path.exists(fn) and os.path.getsize(fn) > 100
        for fn in (kout_fn, krp_fn, fqc_1_fn, fqc_2_fn)
    ]
    if all((done, kout, krp, fqc_1, fqc_2)):
        kraken2_completed_jobs.append(array_id)

kraken_jobs_to_run = list(set(bowtie2_completed_jobs).difference(kraken2_completed_jobs + kraken2_running_jobs + kraken2_pending_jobs))
print(len(kraken2_completed_jobs))
print(len(kraken_jobs_to_run))

In [None]:
# # remove older logs
# for array_id in tqdm(bowtie2_completed_jobs):
#     bs_id = array_id2bs_id[array_id]
#     
#     # ensure latest log file is there and finishes by DONE
#     x = glob.glob('{}/kraken2_*_{}.out'.format(kraken2_path, array_id))
#     if len(x) > 1:
#         for f in sorted(x)[:-1]:
#             os.remove(f)
# 
# # remove older logs
# for array_id in tqdm(pp_completed_jobs):
#     bs_id = array_id2bs_id[array_id]
#     
#     # ensure latest log file is there and finishes by DONE
#     x = glob.glob('{}//kraken2_*_{}.err'.format(kraken2_path, array_id))
#     if len(x) > 1:
#         for f in sorted(x)[:-1]:
#             os.remove(f)

In [None]:
# remove logs and uncompleted files of every Kraken2 job that has to be (re)run
for array_id in tqdm(kraken_jobs_to_run):
    bs_id = array_id2bs_id[array_id]
    if not bs_id:
        # an empty id would turn the pattern below into '<kraken2_path>*' and delete
        # every file of the directory
        print('skipping array id {}: empty sample id'.format(array_id))
        continue
    # remove logs
    for fn in glob.glob('{}kraken2_*_{}.*'.format(kraken2_path, array_id)):
        os.remove(fn)
    # and uncompleted files
    for fn in glob.glob('{}{}*'.format(kraken2_path, bs_id)):
        os.remove(fn)

In [None]:
# Write the Kraken2 job script; running things in home turned out to be most efficient.
# The second argument of write_array_str is the same slot that receives
# parallel_job_nr in the preprocessing cell.
kraken2_script = script_path + 'kraken2.run'
array_str = write_array_str(kraken_jobs_to_run, 8)
write_kraken2_script(kraken2_script, array_str, read_pool, mhg, cs)

In [None]:
%%bash -s "$kraken2_script" "$kdb_path" "$kdb_name" "$bowtie2_path" "$kraken2_path" "$bs_ids_file"
# $1 job script, $2 Kraken2 database directory, $3 database name,
# $4 bowtie2 output directory, $5 Kraken2 output directory, $6 sample id list
# Submit from the output directory (the job-status cell looks for the logs there);
# stop if it cannot be entered instead of submitting from the wrong place.
cd "$5" || exit 1
sbatch "$1" "$2" "$3" "$4" "$5" "$6"

### Bracken

In [None]:
def write_bracken_script(kraken2_script_fn, array_str, read_pool, min_hit_grps, confidence, level):
    """
    Write the SLURM array script that runs Bracken on the Kraken2 reports.

    kraken2_script_fn : path of the script file to write (despite its name, this
        is the Bracken script).
    array_str : value of the '#SBATCH --array' directive.
    read_pool, min_hit_grps, confidence : components of the Kraken2 report file
        name; `confidence` is used with its dot removed (0.05 -> '005').
    level : taxonomic level given to 'bracken -l' and appended to the output names.

    All values are converted with str(), so numbers are accepted as well as strings.
    The generated script expects five arguments: Kraken2 database directory,
    database name, Kraken2 output directory, Bracken output directory and the
    sample id file, from which it reads line number SLURM_ARRAY_TASK_ID.
    """
    conf_str = str(confidence).replace('.', '')
    template = """#!/bin/bash -l
#SBATCH --array=ARRAY_STR
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --mem=20g
#SBATCH --time=01:00:00
#SBATCH --job-name=bracken
#SBATCH --output=%x_%A_%a.out
#SBATCH --error=%x_%A_%a.err

modulesld
ebld
module use /software/anaconda3/envs/eb/easybuild/modules/all
conda activate planb
which bracken

kdb_path=$1
kdb_name=$2
kraken2_path=$3
bracken_path=$4
bs_ids_file=$5

# getting bs_id
bs_id=$(sed -n ${SLURM_ARRAY_TASK_ID}'{p;q}' ${bs_ids_file})
file_id=${bs_id}_${kdb_name}_READPOOL_mhgMINHITGROUP_csCONFIDENCESTRING_sf1_rep1
k2report_fn=${kraken2_path}${file_id}.k2report
bracken_fn=${bracken_path}${file_id}_LEVEL.bracken
breport_fn=${bracken_path}${file_id}_LEVEL.breport

ls -lh $k2report_fn
echo $bracken_fn
echo $breport_fn

echo '1. bracken'
bracken -d ${kdb_path}${kdb_name} -i $k2report_fn -r 150 -l LEVEL -t 10 -o $bracken_fn -w $breport_fn

echo DONE"""
    # placeholders are filled in this order with plain text replacement
    # (str.format is not usable here: the script itself contains braces)
    substitutions = [
        ('ARRAY_STR', str(array_str)),
        ('READPOOL', str(read_pool)),
        ('MINHITGROUP', str(min_hit_grps)),
        ('CONFIDENCESTRING', conf_str),
        ('LEVEL', str(level)),
    ]
    runstr = template
    for placeholder, value in substitutions:
        runstr = runstr.replace(placeholder, value)

    with open(kraken2_script_fn, 'w') as outf:
        outf.write(runstr)

In [None]:
# capture completed, running, pending, and failed/to_run jobs
bracken_running_jobs = get_status_job_array_ids(status='RUNNING', job_name='bracken')
bracken_pending_jobs = get_status_job_array_ids(status='PENDING', job_name='bracken')
bracken_completed_jobs = []
for array_id in tqdm(kraken2_completed_jobs):
    bs_id = array_id2bs_id[array_id]

    # exactly one .out log must exist for this array task; several logs are
    # printed so the stale ones can be cleaned up, and the task is not counted
    log_files = glob.glob('{}/bracken_*_{}.out'.format(bracken_path, array_id))
    if len(log_files) != 1:
        if len(log_files) > 1:
            print(log_files)
        continue

    # the job script echoes DONE as its last line; a line starting with ERROR marks a failure
    with open(log_files[0], 'r') as inf:
        lines = inf.readlines()
    no_error = not any(l.startswith('ERROR') for l in lines)
    done = bool(lines) and lines[-1].startswith('DONE')

    # expected files: the job script names its outputs <file_id>_<level>.bracken/.breport,
    # so the suffix must follow `level` (it was hard-coded to 'S' before)
    # NOTE(review): requires `level` to be set before this cell runs - confirm cell order
    out = '{}{}_{}_{}_mhg{}_cs{}_sf1_rep1_{}'.format(bracken_path, bs_id, kdb_name, read_pool, mhg, cs_str, level)
    bout_fn = '{}.bracken'.format(out)
    brp_fn = '{}.breport'.format(out)

    # condition for completion: both outputs exist and are larger than 100 bytes
    bout_ok = os.path.exists(bout_fn) and os.path.getsize(bout_fn) > 100
    brp_ok = os.path.exists(brp_fn) and os.path.getsize(brp_fn) > 100
    if done and no_error and bout_ok and brp_ok:
        bracken_completed_jobs.append(array_id)

bracken_jobs_to_run = list(set(kraken2_completed_jobs).difference(bracken_completed_jobs + bracken_running_jobs + bracken_pending_jobs))
print(len(bracken_completed_jobs))
print(len(bracken_jobs_to_run))

In [None]:
# Remove the SLURM logs and partial outputs of every bracken job that has to be
# (re-)run, so that a later completion check only sees files from the new attempt.
for array_id in tqdm(bracken_jobs_to_run):
    # SLURM logs of this array task: bracken_<something>_<array_id>.<ext>
    for log_fn in glob.glob('{}bracken_*_{}.*'.format(bracken_path, array_id)):
        os.remove(log_fn)
    # uncompleted output files of the colony behind this array task
    # NOTE(review): prefix match - also hits any other colony id starting with this one
    for partial_fn in glob.glob('{}{}*'.format(bracken_path, array_id2bs_id[array_id])):
        os.remove(partial_fn)

In [None]:
read_pool = 'nonbee'
mhg = '2'
cs = '0.05'
cs_str = str(cs).replace('.', '')
level = 'F'

In [None]:
# array_str = write_array_str(bracken_jobs_to_run, 8)
array_str = '2-1513'

In [None]:
bracken_script = '{}bracken.run'.format(script_path)
write_bracken_script(bracken_script, array_str, read_pool, mhg, cs, level)

In [None]:
bs_ids_file

In [None]:
%%bash -s "$bracken_script" "$kdb_path" "$kdb_name" "$kraken2_path" "$bracken_path" "$bs_ids_file"
# $1 is the sbatch script; $2..$6 become its arguments
# (kdb_path, kdb_name, kraken2_path, bracken_path, bs_ids_file).
# Submit from the bracken directory ($5) and stop if it cannot be entered.
# Arguments are quoted so that an empty or space-containing value cannot shift positions.
cd "$5" || exit 1
sbatch "$1" "$2" "$3" "$4" "$5" "$6"

To do:
- Triple-check that everything has been processed (MultiQC, cross-checking some numbers, comparing with the reported BeeStrong and phenotype data).
- Clean up disk space by copying data to scratch / fola (SRA files should probably go to scratch).

Open question: why are nbr_pho_varroa_100bee and v_pho different?

## Clean phenotype dataset (to do once)

In [None]:
## phenotype dataset
# Read the raw BeeStrong table, drop the excluded colonies and key rows by 'Colony'.
pheno_df = pd.read_table('{}whole_beestrong_data.csv'.format(data_path))
kept_rows = ~pheno_df['num_bs'].isin(skip_bs_ids)
pheno_df = pheno_df.loc[kept_rows].rename(columns={'num_bs':'Colony'}).reset_index(drop=True)
# pheno_df.index=pheno_df['Colony']

# add total read number (one lookup per colony)
pheno_df['Total read nr'] = [get_total_read_nr(bowtie2_path, colony) for colony in pheno_df['Colony']]


def _date_field(date_value, position):
    """Return the integer at `position` of a '/'-separated date, or None if the date is missing."""
    if pd.isna(date_value):
        return None
    return int(str(date_value).split('/')[position])


# and split date into year (third field) and month (second field)
pheno_df['year'] = pheno_df['date'].map(lambda d: _date_field(d, 2))
pheno_df['month'] = pheno_df['date'].map(lambda d: _date_field(d, 1))

# add locations (left join keeps colonies without a location record)
loc_df = pd.read_table('{}Data_loc.csv'.format(data_path), sep=';')
loc_by_colony_df = loc_df.rename(columns={'num_ruche_BS':'Colony'})
pheno_df = pd.merge(pheno_df, loc_by_colony_df, on='Colony', how='left')


In [None]:
# Varroidae abundance for Varroa abundance
level = 'F'
taxid2parent_group, brack_df = get_brack_df(bs_ids, level, bowtie2_path, kraken2_path, bracken_path, group_taxid2name)

In [None]:
# Attach the per-colony Varroidae rows of the bracken table to the phenotype table
# (left join: colonies without a Varroidae row get NaN in the two new columns).
varroa_cols = {'Relative abundance': 'Varroa relative abundance', 'Log Ratio Honeybee': 'Varroa Log Ratio Honeybee'}
is_varroidae = brack_df['Taxon'] == 'Varroidae'
varroa_df = brack_df.loc[is_varroidae, ['Colony'] + list(varroa_cols)].rename(columns=varroa_cols)
pheno_df = pd.merge(pheno_df, varroa_df, on='Colony', how='left')

In [None]:
# Choice: a missing Varroa abundance is treated as "no Varroa detected".
# Abundance scale -> 0; log-ratio scale -> log(1e-07), a pseudo count just below the
# minimum observed relative abundance (2.637216e-07).
varroa_na_fill = {
    'Varroa relative abundance': 0,
    'Varroa Log Ratio Honeybee': np.log(1e-07),
}
for varroa_col, fill_value in varroa_na_fill.items():
    pheno_df[varroa_col] = pheno_df[varroa_col].fillna(fill_value)

In [None]:
pheno_df.to_csv('{}whole_beestrong_data_cleaned.csv'.format(data_path), index=False)
pd.read_csv('{}whole_beestrong_data_cleaned.csv'.format(data_path))

In [None]:
# sns.scatterplot(data=pheno_df, x='Varroa relative abundance', y='Varroidae relative abundance')

## Covariables

In [None]:
plt.figure(figsize=(8, 8))
sns.boxplot(pheno_df, x='region', y='Varroa Log Ratio Honeybee', hue='region', palette=cmap_region, order=cmap_region)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Region', fontsize=12)
plt.ylabel('Varroa Honeybee Log Ratio', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(8, 8))
sns.boxplot(pheno_df, x='month', y='Varroa Log Ratio Honeybee', hue='month', palette=cmap_months, order=cmap_months)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Varroa Honeybee Log Ratio', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(12, 8))
sns.boxplot(pheno_df, x='region', y='Varroa Log Ratio Honeybee', hue='month', palette=cmap_months, order=cmap_region)
plt.xticks(ticks=range(len(region_labels)), labels=region_labels, rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('', fontsize=12)
plt.ylabel('Varroa Honeybee Log Ratio', fontsize=12)
plt.legend(bbox_to_anchor=(1, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.boxplot(pheno_df, x='month', y='Varroa Log Ratio Honeybee', hue='region', palette=cmap_region, order=cmap_months)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('', fontsize=12)
plt.ylabel('Varroa Honeybee Log Ratio', fontsize=12)
plt.legend(bbox_to_anchor=(1, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(8, 8))
sns.boxplot(pheno_df, x='year', y='Varroa Log Ratio Honeybee', hue='year')
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Varroa Honeybee Log Ratio', fontsize=12)
plt.legend().set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Varroa log ratio per colony group (stray 'S' after plt.yticks removed - it was a SyntaxError)
plt.figure(figsize=(8, 8))
sns.boxplot(pheno_df, x='group', y='Varroa Log Ratio Honeybee', hue='group', palette=cmap_group)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('', fontsize=12)
plt.ylabel('Varroa Honeybee Log Ratio', fontsize=12)
# plt.legend().set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(12, 8))
sns.boxplot(pheno_df, x='region', y='Varroa Log Ratio Honeybee', hue='group', palette=cmap_group, order=cmap_region)
plt.xticks(ticks=range(len(region_labels)), labels=region_labels, rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('', fontsize=12)
plt.ylabel('Varroa Honeybee Log Ratio', fontsize=12)
plt.legend(bbox_to_anchor=(1, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
sns.boxplot(pheno_df, x='departement', y='Varroa Log Ratio Honeybee', hue='departement')
plt.xticks(rotation=90)
plt.tight_layout()

In [None]:
plt.figure(figsize=(8, 6))
sns.boxplot(pheno_df, x='group', y='Varroa Log Ratio Honeybee', hue='group')
plt.xticks(rotation=90)
plt.tight_layout()

In [None]:
pheno_df

In [None]:
plt.figure(figsize=(8, 6))
sns.boxplot(pheno_df, x='region', y='percent_Ligustica_Carnica', hue='region', palette=cmap_region, order=cmap_region)
plt.xticks(rotation=90)
plt.tight_layout()

In [None]:
plt.figure(figsize=(8, 6))
sns.boxplot(pheno_df, x='region', y='month', hue='region', palette=cmap_region, order=cmap_region)
plt.xticks(rotation=90)
plt.tight_layout()

## Varroa log ratio vs. relative abundance

In [None]:
sns.histplot(data=pheno_df, x='Varroa Log Ratio Honeybee')

In [None]:
sns.histplot(data=pheno_df, x='Varroa relative abundance', log_scale=False)

In [None]:
pheno_df[pheno_df['Varroa relative abundance'] != 0].describe()

In [None]:
c  # NOTE(review): looks like a stray keystroke; `c` is not defined anywhere in this section - confirm and delete the cell

In [None]:
sns.scatterplot(data=pheno_df, x='Varroa Log Ratio Honeybee', y='Varroa relative abundance')

# Bracken Family-level

In [None]:
level = 'F'
level_name = 'Family'
lod = 5e-06

## Classification of key taxonomic levels

In [None]:
class_df = get_class_df(bs_ids, level, group_name2taxid, kraken2_path, bracken_path, bowtie2_path)

In [None]:
fig = px.bar(class_df[class_df['Category'].isin(categories)], x="Colony", y="Relative abundance", color="Category",
            hover_data=['Category'], barmode = 'stack', color_discrete_map=cmap, category_orders={"Category": categories})
fig.update_layout(
    autosize=False,
    width=1000,
    height=700,
    xaxis={'title': {'font': {'size': 18}}},
    yaxis={'title': {'font': {'size': 18}}},
    legend={'font': {'size': 16}}
)
fig.show()

In [None]:
plt.figure(figsize=(8, 6))
ax = sns.boxplot(data=class_df[class_df['Category'].isin(categories)], x='Category', y='Relative abundance', palette=cmap, 
            showmeans=True, meanprops={"marker":"o", "markerfacecolor":"red", "markeredgecolor":"black", "markersize":"5"}, order=categories)

ax.set_yscale("log")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

## Symbiosphere 

In [None]:
taxid2parent_group, brack_df = get_brack_df(bs_ids, level, bowtie2_path, kraken2_path, bracken_path, group_taxid2name)

In [None]:
print_group_richness(brack_df, lod, categories)

In [None]:
# Symbiosphere = taxa of the groups listed below (basically everything but metazoa)
# whose relative abundance reaches the limit of detection.
symbio_groups = {'Viruses', 'Bacteria', 'Archaea', 'Fungi', 'Other Eukaryota', 'Embryophyta'}
in_symbio_group = brack_df['Group'].isin(symbio_groups)
above_lod = brack_df['Relative abundance'] >= lod
symbio_df = brack_df[in_symbio_group & above_lod]

# taxon name -> group lookup, used to annotate results later on
taxon2group = {taxon: grp for taxon, grp in zip(symbio_df['Taxon'], symbio_df['Group'])}
symbio_df

In [None]:
# Core symbiosphere: keep only the taxa whose prevalence reaches `min_prev`.
min_prev = 0.2
prev_df = get_prev_df(symbio_df, taxid2parent_group, lod)
core_taxa = prev_df.loc[prev_df['Prevalence'] >= min_prev, 'Taxon']
is_core_taxon = symbio_df['Taxon'].isin(core_taxa)
core_symbio_df = symbio_df[is_core_taxon]
core_symbio_df

In [None]:
print_group_richness(core_symbio_df, lod, categories)

### Richness vs. library size

In [None]:
lod = 5e-06
# lod = 1e-05

# Richness = number of taxa rows per colony, once without any abundance cut-off
# and once keeping only taxa at or above the limit of detection; each table is
# joined with the colony's total read number.
read_nr_df = pheno_df[['Colony', 'Total read nr']]
richness_df, richness_df_lod = [
    brack_df[brack_df['Relative abundance'] >= min_abundance]
    .groupby('Colony')['Taxon'].count()
    .rename('Richness')
    .reset_index()
    .merge(read_nr_df, how='inner')
    for min_abundance in (0, lod)
]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7.5))

# left panel: no cut-off
corr_all = round(richness_df['Richness'].corr(richness_df['Total read nr']),ndigits=3)
sns.regplot(data=richness_df, x='Total read nr', y='Richness', ax=ax1)
ax1.set_title('no LoD, corr={}'.format(corr_all))

# right panel: LoD applied
corr_lod = round(richness_df_lod['Richness'].corr(richness_df_lod['Total read nr']),ndigits=3)
sns.regplot(data=richness_df_lod, x='Total read nr', y='Richness', ax=ax2)
ax2.set_title('LoD {}, corr={}'.format(lod, corr_lod))

plt.tight_layout()
plt.show()

In [None]:
# Same richness-vs-library-size comparison as for all taxa, one figure per group.
for group in categories:
    group_df = brack_df[brack_df['Group'] == group]

    # richness without cut-off and with the limit of detection applied
    richness_df, richness_df_lod = [
        group_df[group_df['Relative abundance'] >= min_abundance]
        .groupby('Colony')['Taxon'].count()
        .rename('Richness')
        .reset_index()
        .merge(pheno_df[['Colony', 'Total read nr']], how='inner')
        for min_abundance in (0, lod)
    ]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7.5))

    corr_all = round(richness_df['Richness'].corr(richness_df['Total read nr']),ndigits=3)
    sns.regplot(data=richness_df, x='Total read nr', y='Richness', ax=ax1)
    ax1.set_title('{} no LoD, corr={}'.format(group, corr_all))

    corr_lod = round(richness_df_lod['Richness'].corr(richness_df_lod['Total read nr']),ndigits=3)
    sns.regplot(data=richness_df_lod, x='Total read nr', y='Richness', ax=ax2)
    ax2.set_title('{} LoD {}, corr={}'.format(group, lod, corr_lod))

    plt.tight_layout()
    plt.show()

### Prevalence vs. abundance of families

In [None]:
fig = px.scatter(prev_df, x='Prevalence', y='Median Relative Abundance', color='Group', hover_data=['Taxon'])
fig.update_layout(
    autosize=False,
    width=1400,
    height=1000,
    xaxis={'title': {'font': {'size': 18}}},
    yaxis={'title': {'font': {'size': 18}}},
    legend={'font': {'size': 16}}
)
fig.update_yaxes(tickformat='.5f')
fig.show()

In [None]:
fig = px.scatter(prev_df[prev_df['Group'].isin(categories)], x='Prevalence', y='Median Relative Abundance', color='Group', hover_data=['Taxon'], log_y=True, color_discrete_map=cmap)

fig.update_traces(marker=dict(
    size=10,
    opacity=0.8,
    #line=dict(width=2, color='DarkSlateGrey')
))

fig.update_layout(
    autosize=False,
    width=1200,
    height=800,
    xaxis={'title': {'font': {'size': 20}}},
    yaxis={'title': {'font': {'size': 20}}},
    legend={'font': {'size': 20}}
)
fig.update_yaxes(tickformat='.6f')
fig.show()

### Relative abundance vs. Ratio

In [None]:
sns.scatterplot(symbio_df, x='Relative abundance', y='Ratio Honeybee', hue='Group')

## Core taxa & Relative abundance

In [None]:
core_relabund_df = core_symbio_df.pivot(index='Colony', columns='Taxon', values='Relative abundance').fillna(0).reset_index()
print(core_relabund_df.shape)
core_relabund_df

In [None]:
# core_relabund_df.to_csv('{}{}_RelativeAbundance_Core_Families.csv'.format(tlf_path, str(datetime.now().date())), index=False)
# pd.read_csv('{}{}_RelativeAbundance_Core_Families.csv'.format(tlf_path, str(datetime.now().date())))

In [None]:
# add metadata and reset indexes
pheno_core_relabund_df = pd.merge(pheno_df, core_relabund_df, on='Colony')
core_relabund_df = core_relabund_df.set_index('Colony')
pheno_core_relabund_df = pheno_core_relabund_df.set_index('Colony')

In [None]:
color_dict = {}
discrete_vars = ['group', 'year', 'month', 'region']
continous_vars =  ['Varroa relative abundance', 'logit_recap_inf', 'raw_mnr', 'Total read nr']

row_colors = get_row_colors(pheno_core_relabund_df, discrete_vars, continous_vars)

In [None]:
# sns.clustermap creates its own figure, so a preceding plt.figure(figsize=...) has
# no effect on it (it only leaves an empty figure behind); pass the size directly.
cg = sns.clustermap(core_relabund_df, metric='braycurtis', row_colors=row_colors, xticklabels=1, figsize=(30, 24))
cg.cax.set_visible(True)
plt.show()

### PCoA

In [None]:
dist_matrix = beta_diversity('braycurtis', core_relabund_df)
pcoa_results = pcoa(dist_matrix)
sns.barplot(pcoa_results.proportion_explained[:10])

In [None]:
# First `pc_nr` PCoA axes per colony joined with the phenotypes (wide), plus a long
# version with one row per colony x axis. Column selection follows `pc_nr`
# instead of a hard-coded 5.
pc_nr = 5
pc_cols = ['PC{}'.format(i + 1) for i in range(pc_nr)]
pcoa_coords_df = pd.DataFrame(pcoa_results.samples.values[:, :pc_nr], columns=pc_cols, index=dist_matrix.ids)
pca_df = pd.merge(pheno_df, pcoa_coords_df.reset_index().rename(columns={'index':'Colony'}), on='Colony')
pca_melt_df = pd.melt(pca_df, id_vars=list(pca_df.columns[:-pc_nr]), value_vars=list(pca_df.columns[-pc_nr:]), var_name = 'PC', value_name='PC_value')

In [None]:
sns.set_style('whitegrid')
plt.figure(figsize=(8, 8))
x='PC1'
y='PC2'
sns.scatterplot(data=pca_df, x=x, y=y, hue='group')
plt.xlabel(x, fontsize=14)
plt.ylabel(y, fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# single legend call: a second plt.legend() would rebuild the legend and drop the font size
plt.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
variables = ['region', 'group', 'year', 'month']
for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    # FacetGrid.map calls boxplot once per facet; a shared explicit `order` keeps
    # every category under the same tick in all facets
    g.map(sns.boxplot, v, 'PC_value', order=sorted(pca_melt_df[v].dropna().unique()))
    g.set_titles("{col_name}")

In [None]:
variables = [
    # 'group',
    # 'year',
    # 'month', 
    'percent_Ligustica_Carnica',
    'percent_Mellifera',
    'percent_Caucasica',
    'v_pho',
    'v_mito',
    'pca1',
    'recap_inf',
    'logit_recap_inf',
    'percent_mnr',
    'raw_mnr',
    'eb_mmr',
    'Varroa relative abundance'
]

for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.regplot,'PC_value',v,scatter_kws={'s':4})
    g.set_titles("{col_name}")

In [None]:
pca_melt_df

In [None]:
sns.lmplot(data=pca_melt_df[pca_melt_df['PC']=='PC2'], x='PC_value', y='percent_Ligustica_Carnica', hue='region', height=8)

### Varroa-associated taxa

not much sense here given there are only 30 core families --> compute on all families

In [None]:
relabund_df = symbio_df.pivot(index='Colony', columns='Taxon', values='Relative abundance').fillna(0).reset_index()
print(relabund_df.shape)

In [None]:
# Relative-abundance table joined with the phenotypes, plus a shuffled copy of the
# Varroa relative abundance as a negative control for the correlation tests.
relabund_pheno_df = relabund_df.merge(pheno_df, on='Colony')
# dedicated seeded generator so the control (and everything derived from it) is reproducible
shuffle_rng = random.Random(42)
relabund_pheno_df['Shuffled Varroa relative abundance'] = shuffle_rng.sample(relabund_pheno_df['Varroa relative abundance'].to_list(), len(relabund_pheno_df))
relabund_pheno_df

In [None]:
y_variables = ['Varroa relative abundance', 'Shuffled Varroa relative abundance']
min_colony_nr = 5
con_methods = ['pearson']
cat_methods = []

pv_df = get_pv_df(relabund_pheno_df, relabund_df, y_variables, min_colony_nr, con_methods, cat_methods, lod)
pv_df

In [None]:
fdr_df = get_fdr_df(pv_df, y_variables, cat_methods + con_methods)
fdr_df

In [None]:
# One regression plot per family significantly correlated (FDR <= 0.1) with Varroa abundance.
y_name = 'Varroa relative abundance'
for sp in fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == 'pearson') & (fdr_df['FDR'] <= 0.1)].sort_values('FDR')['Species']:
    # the taxon columns hold untransformed relative abundances (0 where missing), so
    # detection is tested against `lod` itself; np.log(lod) is negative and kept every row
    f = relabund_pheno_df[sp] >= lod
    sns.regplot(data=relabund_pheno_df[f], x=sp, y=y_name)
    plt.show()

In [None]:
# Core families significantly associated (FDR < 0.1) with the observed Varroa relative
# abundance. The 'Y variable' filter keeps hits against the shuffled control out of the set.
sig_taxa = fdr_df[(fdr_df['Y variable'] == 'Varroa relative abundance') & (fdr_df['FDR'] < 0.1)]['Species']
relabund_sig_df = core_symbio_df[core_symbio_df['Taxon'].isin(sig_taxa)].pivot(index='Colony', columns='Taxon', values='Relative abundance').fillna(0).reset_index()
print(relabund_sig_df.shape)

In [None]:
# Build the dated file name once so the write and the read-back check cannot
# end up on different dates.
relabund_sig_fn = '{}{}_RelativeAbundance_Core_VarroaAssociated_Families.csv'.format(tlf_path, str(datetime.now().date()))
relabund_sig_df.to_csv(relabund_sig_fn, index=False)
pd.read_csv(relabund_sig_fn)

##  Core taxa & Bee Log ratio

In [None]:
aitchison_df = core_symbio_df.pivot(index='Colony', columns='Taxon', values='Log Ratio Honeybee').fillna(np.log(1e-06)).reset_index()
print(aitchison_df.shape)

In [None]:
# aitchison_df.to_csv('{}{}_BeeLogRatio_Core_Families.csv'.format(tlf_path, str(datetime.now().date())), index=False)
# pd.read_csv('{}{}_BeeLogRatio_Core_Families.csv'.format(tlf_path, str(datetime.now().date())))

In [None]:
continous_vars =  ['Varroa Log Ratio Honeybee', 'logit_recap_inf', 'raw_mnr']
distance = 'euclidean'
plot_clustermap(aitchison_df, pheno_df, continous_vars, var2cmap, distance, figsize=(12, 8))

### PCA

In [None]:
pivot_df = aitchison_df.set_index('Colony')
pca = PCA()
x_pca = pca.fit_transform(pivot_df)
sns.barplot(pca.explained_variance_ratio_[:10])

In [None]:
# First `pc_nr` principal components per colony joined with the phenotypes.
# The coordinates come from `x_pca` (the PCA fitted on `pivot_df` in this section),
# not from the `pcoa_results` left over from an earlier PCoA.
pc_nr = 5
pc_cols = ['PC{}'.format(i + 1) for i in range(pc_nr)]
pca_coords_df = pd.DataFrame(x_pca[:, :pc_nr], columns=pc_cols, index=pivot_df.index)
pca_df = pd.merge(pheno_df, pca_coords_df.reset_index().rename(columns={'index':'Colony'}), on='Colony')
pca_melt_df = pd.melt(pca_df, id_vars=list(pca_df.columns[:-pc_nr]), value_vars=list(pca_df.columns[-pc_nr:]), var_name = 'PC', value_name='PC_value')

In [None]:
plot_pca(1, 2, pca_df, 'region', cmap_region, pca.explained_variance_ratio_)

In [None]:
plot_pca(3, 4, pca_df, 'region', cmap_region, pca.explained_variance_ratio_)

In [None]:
variables = ['region', 'group', 'year', 'month']
for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    # FacetGrid.map calls boxplot once per facet; a shared explicit `order` keeps
    # every category under the same tick in all facets
    g.map(sns.boxplot, v, 'PC_value', order=sorted(pca_melt_df[v].dropna().unique()))
    g.set_titles("{col_name}")

In [None]:
variables = [
    # 'group',
    # 'year',
    # 'month', 
    'percent_Ligustica_Carnica',
    'percent_Mellifera',
    'percent_Caucasica',
    'v_pho',
    'v_mito',
    'pca1',
    'recap_inf',
    'logit_recap_inf',
    'percent_mnr',
    'raw_mnr',
    'eb_mmr',
    'Varroa relative abundance'
]

for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.regplot,'PC_value',v,scatter_kws={'s':4})
    g.set_titles("{col_name}")

### Varroa-associated taxa

not much sense here given there are only 30 core families

In [None]:
# Log-ratio table joined with the phenotypes, plus a shuffled copy of the Varroa
# log ratio as a negative control for the correlation tests.
aitchison_pheno_df = aitchison_df.merge(pheno_df, on='Colony')
# dedicated seeded generator so the control (and everything derived from it) is reproducible
shuffle_rng = random.Random(42)
aitchison_pheno_df['Shuffled Varroa Log Ratio Honeybee'] = shuffle_rng.sample(aitchison_pheno_df['Varroa Log Ratio Honeybee'].to_list(), len(aitchison_pheno_df))
aitchison_pheno_df

In [None]:
# Correlation tests of every family (log ratio to honeybee) against the observed and the shuffled Varroa log ratio.
y_variables = ['Varroa Log Ratio Honeybee', 'Shuffled Varroa Log Ratio Honeybee']
min_colony_nr = 5
con_methods = ['pearson']
cat_methods = []

pv_df = get_pv_df(aitchison_pheno_df, aitchison_df, y_variables, min_colony_nr, con_methods, cat_methods, np.log(lod)) # important: the taxa columns are log ratios here, so the lod is passed log-transformed too
pv_df

In [None]:
for method in con_methods:
    for y_name in y_variables:
        sns.histplot(data=pv_df[(pv_df['Y variable'] == y_name) & (pv_df['Method'] == method)], x='P-value', bins=100)
        plt.title('{} {}'.format(y_name, method))
        plt.show()

In [None]:
fdr_df = get_fdr_df(pv_df, y_variables, cat_methods + con_methods)
fdr_df

In [None]:
y_name = 'Varroa Log Ratio Honeybee'
for sp in fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == 'pearson') & (fdr_df['FDR'] <= 0.1)].sort_values('FDR')['Species']:
    f = aitchison_pheno_df[sp] > np.log(lod)
    sns.regplot(data=aitchison_pheno_df[f], x=sp, y=y_name)
    plt.show()

In [None]:
# Core families significantly associated (FDR < 0.1) with the observed Varroa log
# ratio. The 'Y variable' filter keeps hits against the shuffled control out of the set.
sig_taxa = fdr_df[(fdr_df['Y variable'] == 'Varroa Log Ratio Honeybee') & (fdr_df['FDR'] < 0.1)]['Species']
aitchison_sig_df = core_symbio_df[core_symbio_df['Taxon'].isin(sig_taxa)].pivot(index='Colony', columns='Taxon', values='Log Ratio Honeybee').fillna(np.log(1e-06)).reset_index()
print(aitchison_sig_df.shape)

In [None]:
aitchison_sig_df

In [None]:
# aitchison_sig_df.to_csv('{}{}_BeeLogRatio_Core_VarroaAssociated_Families.csv'.format(tlf_path, str(datetime.now().date())), index=False)
# pd.read_csv('{}{}_BeeLogRatio_Core_VarroaAssociated_Families.csv'.format(tlf_path, str(datetime.now().date())))

#### stats

In [None]:
for group in groups:
    print(group, filter_group_pivot_df(aitchison_sig_df, set([group]), taxon2group).shape[1] -1)

In [None]:
# Label the tested taxa as families and add their group. Guarded so the cell can be
# re-run: inserting 'Group' a second time would raise.
fdr_df = fdr_df.rename(columns={'Species': 'Family'})
if 'Group' not in fdr_df.columns:
    fdr_df.insert(1, 'Group', [taxon2group[x] for x in fdr_df['Family']])
fdr_df

In [None]:
y_name = 'Varroa Log Ratio Honeybee'
method = 'pearson'

a =fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == method) & (fdr_df['Stat'] > 0) & (fdr_df['FDR'] <= 0.1)]
b =fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == method) & (fdr_df['Stat'] < 0) & (fdr_df['FDR'] <= 0.1)]

In [None]:
print(len(a))
print(len(b))

In [None]:
x = a
print(x.shape)
x.sort_values('FDR', ascending=True)[['Family', 'Group', 'FDR']]

In [None]:
x = b
print(x.shape)
x.sort_values('FDR', ascending=True)[['Family', 'Group', 'FDR']]

#### heatmap

In [None]:
continous_vars =  ['Varroa Log Ratio Honeybee']
distance = 'euclidean'
plot_clustermap(aitchison_sig_df, pheno_df, continous_vars, var2cmap, distance, figsize=(10, 8))

In [None]:
# group passed as a one-element set, matching the call in the "stats" cell
# (a bare string would only work through substring matching, if at all)
plot_clustermap(filter_group_pivot_df(aitchison_sig_df, {'Bacteria'}, taxon2group), pheno_df, continous_vars, var2cmap, distance, figsize=(10, 8))

In [None]:
# group passed as a one-element set, matching the call in the "stats" cell
# (a bare string would only work through substring matching, if at all)
plot_clustermap(filter_group_pivot_df(aitchison_sig_df, {'Embryophyta'}, taxon2group), pheno_df, continous_vars, var2cmap, distance, figsize=(10, 8))

In [None]:
# cluster only based on these two families
families = ['Morganellaceae', 'Spiroplasmataceae']
cg = plot_clustermap(aitchison_sig_df[['Colony'] + families], pheno_df, continous_vars, var2cmap, distance, figsize=(10, 8))

#### PCA

In [None]:
pivot_df = aitchison_sig_df.set_index('Colony')

In [None]:
pca = PCA()
x_pca = pca.fit_transform(pivot_df)
sns.barplot(pca.explained_variance_ratio_[:10])

In [None]:
# First `pc_nr` principal components per colony joined with the phenotypes.
# The coordinates come from `x_pca` (the PCA fitted on `pivot_df` in this section),
# not from the `pcoa_results` left over from an earlier PCoA.
pc_nr = 5
pc_cols = ['PC{}'.format(i + 1) for i in range(pc_nr)]
pca_coords_df = pd.DataFrame(x_pca[:, :pc_nr], columns=pc_cols, index=pivot_df.index)
pca_df = pd.merge(pheno_df, pca_coords_df.reset_index().rename(columns={'index':'Colony'}), on='Colony')
pca_melt_df = pd.melt(pca_df, id_vars=list(pca_df.columns[:-pc_nr]), value_vars=list(pca_df.columns[-pc_nr:]), var_name = 'PC', value_name='PC_value')

In [None]:
plot_pca(1, 2, pca_df, 'month', cmap_months, pca.explained_variance_ratio_)

In [None]:
plot_pca(1, 3, pca_df, 'Varroa Log Ratio Honeybee', None, pca.explained_variance_ratio_)

In [None]:
plot_pca(2, 4, pca_df, 'Varroa Log Ratio Honeybee', None, pca.explained_variance_ratio_)

In [None]:
variables = [
    'v_pho',
    'v_mito',
    'pca1',
    'recap_inf',
    'logit_recap_inf',
    'percent_mnr',
    'raw_mnr',
    'eb_mmr',
    'Varroa relative abundance',
    'Varroa Log Ratio Honeybee'
]

for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.regplot,'PC_value',v,scatter_kws={'s':4})
    g.set_titles("{col_name}")

## All taxa & Prevalence analyses

In [None]:
# Presence/absence table: colonies x taxa, 1 where a positive relative abundance
# was recorded, 0 where the taxon has no entry for the colony.
abundance_pivot_df = symbio_df.pivot(index='Colony', columns='Taxon', values='Relative abundance')
presence_df = abundance_pivot_df.mask(abundance_pivot_df > 0, 1).fillna(0)
jaccard_df = presence_df.astype(int).reset_index()
print(jaccard_df.shape)

In [None]:
# jaccard_df.to_csv('{}{}_Prevalence_All_Families.csv'.format(tlf_path, str(datetime.now().date())), index=False)
# pd.read_csv('{}{}_Prevalence_All_Families.csv'.format(tlf_path, str(datetime.now().date())))

In [None]:
continous_vars =  ['Varroa Log Ratio Honeybee', 'logit_recap_inf', 'raw_mnr']
distance = 'jaccard'
plot_clustermap(jaccard_df, pheno_df, continous_vars, var2cmap, distance, figsize=(12, 8))

### PCoA

In [None]:
# The distance matrix must be computed on the 0/1 taxa columns only. 'Colony' is moved
# into the index so that it is not fed to pdist and so that the matrix ids are the colony
# names (needed for the later merge on 'Colony') rather than row positions.
jaccard_pivot_df = jaccard_df.set_index('Colony')
jaccard_dm = DistanceMatrix(squareform(pdist(jaccard_pivot_df.astype(bool), metric='jaccard')), ids=jaccard_pivot_df.index)

pcoa_results = pcoa(jaccard_dm)

sns.barplot(pcoa_results.proportion_explained[:10])

In [None]:
# jaccard_df.to_csv('{}{}_BeeStrong_species_presences_absences.csv'.format(tlf_path, str(datetime.now().date())), index=True)

# pd.read_csv('{}{}_BeeStrong_species_presences_absences.csv'.format(tlf_path, str(datetime.now().date())))

In [None]:
# First `pc_nr` PCoA axes per colony joined with the phenotypes (wide), plus a long
# version with one row per colony x axis. Column selection follows `pc_nr`
# instead of a hard-coded 5.
pc_nr = 5
pc_cols = ['PC{}'.format(i + 1) for i in range(pc_nr)]
pcoa_coords_df = pd.DataFrame(pcoa_results.samples.values[:, :pc_nr], columns=pc_cols, index=jaccard_dm.ids)
pca_df = pd.merge(pheno_df, pcoa_coords_df.reset_index().rename(columns={'index':'Colony'}), on='Colony')

pca_melt_df = pd.melt(pca_df, id_vars=list(pca_df.columns[:-pc_nr]), value_vars=list(pca_df.columns[-pc_nr:]), var_name = 'PC', value_name='PC_value')

In [None]:
sns.set_style('whitegrid')
plt.figure(figsize=(8, 8))
x='PC2'
y='PC5'
sns.scatterplot(data=pca_df, x=x, y=y, hue='logit_recap_inf')
plt.xlabel(x, fontsize=14)
plt.ylabel(y, fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# single legend call: a second plt.legend() would rebuild the legend and drop the font size
plt.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
variables = ['region', 'group', 'year', 'month']
for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    # FacetGrid.map calls boxplot once per facet; a shared explicit `order` keeps
    # every category under the same tick in all facets
    g.map(sns.boxplot, v, 'PC_value', order=sorted(pca_melt_df[v].dropna().unique()))
    g.set_titles("{col_name}")

In [None]:
variables = [
    # 'group',
    # 'year',
    # 'month', 
    'percent_Ligustica_Carnica',
    'percent_Mellifera',
    'percent_Caucasica',
    'v_pho',
    'v_mito',
    'pca1',
    'recap_inf',
    'logit_recap_inf',
    'percent_mnr',
    'raw_mnr',
    'eb_mmr',
    'Varroa relative abundance'
]

for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.regplot,'PC_value',v,scatter_kws={'s':4})
    g.set_titles("{col_name}")

### Varroa-associated taxa

In [None]:
# Presence/absence table joined with the phenotypes, plus a shuffled copy of the
# Varroa log ratio as a negative control for the association tests.
jaccard_pheno_df = jaccard_df.merge(pheno_df, on='Colony')
# dedicated seeded generator so the control (and everything derived from it) is reproducible
shuffle_rng = random.Random(42)
jaccard_pheno_df['Shuffled Varroa Log Ratio Honeybee'] = shuffle_rng.sample(jaccard_pheno_df['Varroa Log Ratio Honeybee'].to_list(), len(jaccard_pheno_df))
jaccard_pheno_df

In [None]:
y_variables = ['Varroa Log Ratio Honeybee', 'Shuffled Varroa Log Ratio Honeybee']
min_colony_nr = 5
con_methods = []
cat_methods = ['tt']

pv_df = get_pv_df(jaccard_pheno_df, jaccard_df, y_variables, min_colony_nr, con_methods, cat_methods, lod)
pv_df

In [None]:
for method in cat_methods:
    for y_name in y_variables:
        sns.histplot(data=pv_df[(pv_df['Y variable'] == y_name) & (pv_df['Method'] == method)], x='P-value', bins=100)
        plt.title('{} {}'.format(y_name, method))
        plt.show()

In [None]:
fdr_df = get_fdr_df(pv_df, y_variables, cat_methods + con_methods)
fdr_df

In [None]:
y_name = 'Varroa Log Ratio Honeybee'
for sp in fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == 'tt') & (fdr_df['FDR'] <= 0.1)].sort_values('FDR')['Species']:
    x = jaccard_pheno_df[jaccard_pheno_df[sp] >= lod][y_name].to_numpy()
    y = jaccard_pheno_df[jaccard_pheno_df[sp] < lod][y_name].to_numpy()
    plt.boxplot([x, y], tick_labels=['Presence (n={})'.format(len(x)), 'Absence (n={})'.format(len(y))])
    plt.ylabel(y_name)
    plt.title(sp)
    plt.show()

In [None]:
# Presence/absence table restricted to the families significantly associated (FDR < 0.1)
# with the observed Varroa log ratio. The 'Y variable' filter keeps hits against the
# shuffled control out of the set.
sig_taxa = fdr_df[(fdr_df['Y variable'] == 'Varroa Log Ratio Honeybee') & (fdr_df['FDR'] < 0.1)]['Species']
jaccard_sig_df = symbio_df[symbio_df['Taxon'].isin(sig_taxa)].pivot(index='Colony', columns='Taxon', values='Relative abundance').map(lambda x: 1 if x > 0 else x).fillna(0).reset_index()
jaccard_sig_df[jaccard_sig_df.columns[1:]] = jaccard_sig_df.iloc[:, 1:].astype(int)
print(jaccard_sig_df.shape)

In [None]:
jaccard_sig_df

#### stats

In [None]:
for group in groups:
    print(group, filter_group_pivot_df(jaccard_sig_df, set([group]), taxon2group).shape[1] -1)

In [None]:
# Label the tested taxa as families and add their group. Guarded so the cell can be
# re-run: inserting 'Group' a second time would raise.
fdr_df = fdr_df.rename(columns={'Species': 'Family'})
if 'Group' not in fdr_df.columns:
    fdr_df.insert(1, 'Group', [taxon2group[x] for x in fdr_df['Family']])
fdr_df

In [None]:
y_name = 'Varroa Log Ratio Honeybee'
method = 'tt'

a =fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == method) & (fdr_df['Stat'] > 0) & (fdr_df['FDR'] <= 0.1)]
b =fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == method) & (fdr_df['Stat'] < 0) & (fdr_df['FDR'] <= 0.1)]

In [None]:
print(len(a))
print(len(b))

In [None]:
x = a
print(x.shape)
x.sort_values('FDR', ascending=True)[['Family', 'Group', 'FDR']]

In [None]:
x = b
print(x.shape)
x.sort_values('FDR', ascending=True)[['Family', 'Group', 'FDR']]

#### heatmap

In [None]:
continous_vars =  ['Varroa Log Ratio Honeybee']
distance = 'jaccard'
plot_clustermap(jaccard_sig_df, pheno_df, continous_vars, var2cmap, distance, figsize=(12, 10))

In [None]:
# group passed as a one-element set, matching the call in the "stats" cell
# (a bare string would only work through substring matching, if at all)
plot_clustermap(filter_group_pivot_df(jaccard_sig_df, {'Bacteria'}, taxon2group), pheno_df, continous_vars, var2cmap, distance, figsize=(12, 8))

In [None]:
# group passed as a one-element set, matching the call in the "stats" cell
# (a bare string would only work through substring matching, if at all)
plot_clustermap(filter_group_pivot_df(jaccard_sig_df, {'Embryophyta'}, taxon2group), pheno_df, continous_vars, var2cmap, distance, figsize=(12, 8))

#### PCoA

In [None]:
# 'Colony' goes into the index so that pdist only sees the 0/1 taxa columns and the
# distance-matrix ids are colony names (needed for the later merge on 'Colony').
pivot_df = jaccard_sig_df.set_index('Colony')

distance_matrix = DistanceMatrix(squareform(pdist(pivot_df.astype(bool), metric='jaccard')), ids=pivot_df.index)
pcoa_results = pcoa(distance_matrix)
sns.barplot(pcoa_results.proportion_explained[:10])

In [None]:
pc_nr = 5
pc_cols = ['PC{}'.format(i + 1) for i in range(pc_nr)]
# first pc_nr PCoA axes, joined to the phenotype table by colony
pca_df = pd.merge(pheno_df, pd.DataFrame(pcoa_results.samples.values[:, :pc_nr], columns=pc_cols, index=distance_matrix.ids).reset_index().rename(columns={'index':'Colony'}), on='Colony')

# long format (one row per colony and PC); the PC columns are selected by name
# so that changing pc_nr does not silently melt the wrong columns
pca_melt_df = pd.melt(pca_df, id_vars=[c for c in pca_df.columns if c not in pc_cols], value_vars=pc_cols, var_name = 'PC', value_name='PC_value')

In [None]:
# PC1 vs PC2, coloured by the Varroa log ratio
sns.set_style('whitegrid')
plt.figure(figsize=(8, 8))
x='PC1'
y='PC2'
sns.scatterplot(data=pca_df, x=x, y=y, hue='Varroa Log Ratio Honeybee')
plt.xlabel(x, fontsize=14)
plt.ylabel(y, fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# a single legend call: a second plt.legend() would rebuild the legend and drop the font size
plt.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
variables = [
    'v_pho',
    'v_mito',
    'pca1',
    'recap_inf',
    'logit_recap_inf',
    'percent_mnr',
    'raw_mnr',
    'eb_mmr',
    'Varroa relative abundance',
    'Varroa Log Ratio Honeybee'
]
for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.regplot,'PC_value',v,scatter_kws={'s':4})
    g.set_titles("{col_name}")

## Varroa-associated families [TO MERGE WITH ABOVE SECTIONS]

In [None]:
# perform correlation tests on log transformed ratio to honeybees 
aitchison_df = symbio_df.pivot(index='Colony', columns='Taxon', values='Log Ratio Honeybee').fillna(np.log(1e-06)).reset_index()
print(aitchison_df.shape)

aitchison_pheno_df = aitchison_df.merge(pheno_df, on='Colony')
aitchison_pheno_df['Shuffled Varroa relative abundance'] = random.sample(aitchison_pheno_df['Varroa relative abundance'].to_list(), len(aitchison_pheno_df))
aitchison_pheno_df['Shuffled Varroa Log Ratio Honeybee'] = random.sample(aitchison_pheno_df['Varroa Log Ratio Honeybee'].to_list(), len(aitchison_pheno_df))
aitchison_pheno_df

### P-value distributions

In [None]:
y_variables = ['Varroa Log Ratio Honeybee', 'Shuffled Varroa Log Ratio Honeybee', 'Varroa relative abundance', 'Shuffled Varroa relative abundance', 'bee_weigth', 'nbr_open_brood']
min_colony_nr = 5
con_methods = ['pearson']
cat_methods = ['tt']
lod = 5e-06

pv_df = get_pv_df(aitchison_pheno_df, aitchison_df, y_variables, min_colony_nr, con_methods, cat_methods, np.log(lod))
pv_df

In [None]:
# one p-value histogram per (continuous method, response variable) pair
for method, y_name in product(con_methods, y_variables):
    selected = (pv_df['Y variable'] == y_name) & (pv_df['Method'] == method)
    sns.histplot(data=pv_df[selected], x='P-value', bins=100)
    plt.title('{} {}'.format(y_name, method))
    plt.show()

In [None]:
# one p-value histogram per (categorical method, response variable) pair
for method, y_name in product(cat_methods, y_variables):
    selected = (pv_df['Y variable'] == y_name) & (pv_df['Method'] == method)
    sns.histplot(data=pv_df[selected], x='P-value', bins=100)
    plt.title('{} {}'.format(y_name, method))
    plt.show()

In [None]:
pv_df[(pv_df['Y variable'] == 'bee_weigth') & (pv_df['Method'] == 'pearson')].sort_values('P-value')[:10]

In [None]:
pv_df[(pv_df['Y variable'] == 'nbr_open_brood') & (pv_df['Method'] == 'pearson')].sort_values('P-value')[:10]

### FDR

In [None]:
y_variables = ['Varroa Log Ratio Honeybee']
methods = ['pearson', 'tt']

fdr_df = get_fdr_df(pv_df, y_variables, methods)

In [None]:
# intersection methods
plt.figure(figsize=(6, 6))
set1 = set(fdr_df[(fdr_df['Y variable'] == 'Varroa Log Ratio Honeybee') & (fdr_df['Method'] == 'pearson') & (fdr_df['FDR'] <= 0.1)]['Species'])
set2 = set(fdr_df[(fdr_df['Y variable'] == 'Varroa Log Ratio Honeybee') & (fdr_df['Method'] == 'tt') & (fdr_df['FDR'] <= 0.1)]['Species'])
venn2([set1, set2], set_labels=('pearson', 'tt'))
plt.show()

In [None]:
y_name = 'Varroa Log Ratio Honeybee'
for sp in fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == 'pearson') & (fdr_df['FDR'] <= 0.1)].sort_values('FDR')['Species']:
    f = symbio_pheno_df[sp] > np.log(lod)
    sns.regplot(data=symbio_pheno_df[f], x=sp, y=y_name)
    plt.show()

In [None]:
y_name = 'Varroa Log Ratio Honeybee'
for sp in fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == 'tt') & (fdr_df['FDR'] <= 0.1)].sort_values('FDR')['Species']:
    x = symbio_pheno_df[symbio_pheno_df[sp] >= np.log(lod)][y_name].to_numpy()
    y = symbio_pheno_df[symbio_pheno_df[sp] < np.log(lod)][y_name].to_numpy()
    plt.boxplot([x, y], tick_labels=['Presence (n={})'.format(len(x)), 'Absence (n={})'.format(len(y))])
    plt.ylabel(y_name)
    plt.title(sp)
    plt.show()

### Abundance Aitchison Log Ratio Honeybee

In [None]:
aitchison_df = symbio_df[symbio_df['Taxon'].isin(set(fdr_df[(fdr_df['Y variable'] == 'Varroa Log Ratio Honeybee') & (fdr_df['Method'] == 'pearson') & (fdr_df['FDR'] < 0.1)]['Species']))].pivot(
    index='Colony', columns='Taxon', values='Log Ratio Honeybee').fillna(np.log(1e-06)).reset_index()
print(aitchison_df.shape)
aitchison_df

In [None]:
# add metadata and reset indexes
pheno_aitchison_df = pd.merge(pheno_df, aitchison_df, on='Colony')
aitchison_df = aitchison_df.set_index('Colony')
pheno_aitchison_df = pheno_aitchison_df.set_index('Colony')

In [None]:
pheno_aitchison_df.columns

In [None]:
# Build the row-colour annotation table for the clustermap below.
# The loop-local colour maps use their own names: the original reused the global
# name `cmap`, which is passed as color_discrete_map / palette by plotting cells
# further down and would have been left pointing at a matplotlib colormap.
color_dict = {}
discrete_vars = ['group', 'year', 'month', 'region']
continous_vars =  ['Varroa relative abundance', 'logit_recap_inf', 'raw_mnr', 'Total read nr', 'Varroa Log Ratio Honeybee']

# discrete variables: one 'husl' colour per distinct value
for var in discrete_vars:
    values = pheno_aitchison_df[var]
    levels = values.unique()
    level2hex = dict(zip(levels, [h.upper() for h in list(sns.color_palette('husl', n_colors=len(levels)).as_hex())]))
    color_dict[var] =  values.map(level2hex)

# continuous variables: min-max scale to [0, 1] and map through viridis
viridis = plt.get_cmap('viridis')
for var in continous_vars:
    values = pheno_aitchison_df[var]
    normalized_values = np.interp(values, (min(values), max(values)), (0, 1))
    color_dict[var] =  [mcolors.to_hex(viridis(val)) for val in normalized_values]

row_colors = pd.DataFrame(color_dict)

In [None]:
# sns.clustermap creates its own figure, so the size has to be passed to it;
# a preceding plt.figure(figsize=...) only produces an extra empty figure.
cg=sns.clustermap(aitchison_df, metric='euclidean', row_colors=row_colors, xticklabels=1, figsize=(30, 24))
cg.cax.set_visible(True)
plt.show()

### Abundance Bray-Curtis

In [None]:
braycurtis_df = symbio_df[symbio_df['Taxon'].isin(set(fdr_df[(fdr_df['Y variable'] == 'Varroa Log Ratio Honeybee') & (fdr_df['Method'] == 'pearson') & (fdr_df['FDR'] < 0.1)]['Species']))].pivot(
    index='Colony', columns='Taxon', values='Relative abundance').fillna(0).reset_index()
print(braycurtis_df.shape)
braycurtis_df

### Presence/absence Jaccard

For families whose presence correlates with change in varroa abundance

In [None]:
# Presence/absence matrix for the taxa significant in the 'tt' test:
# 1 when the log ratio is at or above log(lod), 0 otherwise. The original `else x`
# kept the raw (negative) log ratio for values below the threshold, so the
# matrix was not binary. Missing values compare False and therefore also become 0.
jaccard_df = symbio_df[symbio_df['Taxon'].isin(set(fdr_df[(fdr_df['Y variable'] == 'Varroa Log Ratio Honeybee') & (fdr_df['Method'] == 'tt') & (fdr_df['FDR'] < 0.1)]['Species']))].pivot(
    index='Colony', columns='Taxon', values='Log Ratio Honeybee').map(lambda x: 1 if x >= np.log(lod) else 0).fillna(0).reset_index()
print(jaccard_df.shape)
jaccard_df

In [None]:
# add metadata and reset indexes
pheno_jaccard_df = pd.merge(pheno_df, jaccard_df, on='Colony')
jaccard_df = jaccard_df.set_index('Colony')
pheno_jaccard_df = pheno_jaccard_df.set_index('Colony')

In [None]:
# Build the row-colour annotation table for the clustermap below.
# The loop-local colour maps use their own names: the original reused the global
# name `cmap`, which is passed as color_discrete_map / palette by plotting cells
# further down and would have been left pointing at a matplotlib colormap.
color_dict = {}
discrete_vars = ['group', 'year', 'month', 'region']
continous_vars =  ['Varroa relative abundance', 'logit_recap_inf', 'raw_mnr', 'Total read nr']

# discrete variables: one 'husl' colour per distinct value
for var in discrete_vars:
    values = pheno_jaccard_df[var]
    levels = values.unique()
    level2hex = dict(zip(levels, [h.upper() for h in list(sns.color_palette('husl', n_colors=len(levels)).as_hex())]))
    color_dict[var] =  values.map(level2hex)

# continuous variables: min-max scale to [0, 1] and map through viridis
viridis = plt.get_cmap('viridis')
for var in continous_vars:
    values = pheno_jaccard_df[var]
    normalized_values = np.interp(values, (min(values), max(values)), (0, 1))
    color_dict[var] =  [mcolors.to_hex(viridis(val)) for val in normalized_values]

row_colors = pd.DataFrame(color_dict)

In [None]:
# sns.clustermap creates its own figure, so the size has to be passed to it;
# a preceding plt.figure(figsize=...) only produces an extra empty figure.
# NOTE(review): this section is titled "Presence/absence Jaccard" but the metric is
# 'euclidean' — confirm whether 'jaccard' was intended.
cg=sns.clustermap(jaccard_df, metric='euclidean', row_colors=row_colors, xticklabels=1, figsize=(30, 24))
cg.cax.set_visible(True)  # was `set_visi ble`, a syntax error
plt.show()

# Bracken Species-level

In [None]:
level = 'S'
level_name = 'Species'
lod = 5e-06

## Classification of key taxonomic levels

In [None]:
class_df = get_class_df(bs_ids, level, group_name2taxid, kraken2_path, bracken_path, bowtie2_path)

In [None]:
fig = px.bar(class_df[class_df['Category'].isin(categories)], x="Colony", y="Relative abundance", color="Category",
            hover_data=['Category'], barmode = 'stack', color_discrete_map=cmap, category_orders={"Category": categories})
fig.update_layout(
    autosize=False,
    width=1000,
    height=700,
    xaxis={'title': {'font': {'size': 18}}},
    yaxis={'title': {'font': {'size': 18}}},
    legend={'font': {'size': 16}}
)
fig.show()

In [None]:
plt.figure(figsize=(8, 6))
ax = sns.boxplot(data=class_df[class_df['Category'].isin(categories)], x='Category', y='Relative abundance', palette=cmap, 
            showmeans=True, meanprops={"marker":"o", "markerfacecolor":"red", "markeredgecolor":"black", "markersize":"5"}, order=categories)

ax.set_yscale("log")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

## Symbiosphere 

In [None]:
taxid2parent_group, brack_df = get_brack_df(bs_ids, level, bowtie2_path, kraken2_path, bracken_path, group_taxid2name)
brack_df

In [None]:
# current definition of symbiosphere constituents (everything but metazoa basically)
lod = 5e-06
f = brack_df['Group'].isin({'Viruses', 'Bacteria', 'Archaea', 'Fungi', 'Other Eukaryota', 'Embryophyta'})
symbio_df = brack_df[f & (brack_df['Relative abundance'] >= lod)]

In [None]:
# get core symbiosphere
min_prev = 0.2
prev_df = get_prev_df(symbio_df, taxid2parent_group, lod)
core_symbio_df = symbio_df[symbio_df['Taxon'].isin(prev_df[prev_df['Prevalence'] >= min_prev]['Taxon'])]
core_symbio_df

In [None]:
print_group_richness(brack_df, lod, categories)

In [None]:
print_group_richness(symbio_df, lod, categories)

In [None]:
print_group_richness(core_symbio_df, lod, categories)

### Richness vs. library size

In [None]:
lod = 5e-06
# lod = 1e-05

read_nr_df = pheno_df[['Colony', 'Total read nr']]

# richness per colony = number of taxon rows, without any abundance cut-off ...
richness_df = (
    brack_df[brack_df['Relative abundance'] >= 0]
    .groupby('Colony')['Taxon'].count()
    .rename('Richness')
    .reset_index()
    .merge(read_nr_df, how='inner')
)

# ... and counting only taxa at or above the limit of detection
richness_df_lod = (
    brack_df[brack_df['Relative abundance'] >= lod]
    .groupby('Colony')['Taxon'].count()
    .rename('Richness')
    .reset_index()
    .merge(read_nr_df, how='inner')
)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7.5))

# left panel: no LoD
corr_all = richness_df['Richness'].corr(richness_df['Total read nr'])
sns.regplot(data=richness_df, x='Total read nr', y='Richness', ax=ax1)
ax1.set_title('no LoD, corr={}'.format(round(corr_all, ndigits=3)))

# right panel: LoD applied
corr_lod = richness_df_lod['Richness'].corr(richness_df_lod['Total read nr'])
sns.regplot(data=richness_df_lod, x='Total read nr', y='Richness', ax=ax2)
ax2.set_title('LoD {}, corr={}'.format(lod, round(corr_lod, ndigits=3)))

plt.tight_layout()
plt.show()

In [None]:
# same richness vs. library size comparison as above, one figure per group
for group in categories:
    group_df = brack_df[brack_df['Group'] == group]
    read_nr_df = pheno_df[['Colony', 'Total read nr']]

    # richness without abundance cut-off
    richness_df = (
        group_df[group_df['Relative abundance'] >= 0]
        .groupby('Colony')['Taxon'].count()
        .rename('Richness')
        .reset_index()
        .merge(read_nr_df, how='inner')
    )

    # richness counting only taxa at or above the limit of detection
    richness_df_lod = (
        group_df[group_df['Relative abundance'] >= lod]
        .groupby('Colony')['Taxon'].count()
        .rename('Richness')
        .reset_index()
        .merge(read_nr_df, how='inner')
    )

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7.5))

    corr_all = richness_df['Richness'].corr(richness_df['Total read nr'])
    sns.regplot(data=richness_df, x='Total read nr', y='Richness', ax=ax1)
    ax1.set_title('{} no LoD, corr={}'.format(group, round(corr_all, ndigits=3)))

    corr_lod = richness_df_lod['Richness'].corr(richness_df_lod['Total read nr'])
    sns.regplot(data=richness_df_lod, x='Total read nr', y='Richness', ax=ax2)
    ax2.set_title('{} LoD {}, corr={}'.format(group, lod, round(corr_lod, ndigits=3)))

    plt.tight_layout()
    plt.show()

### Prevalence vs. abundance of families

In [None]:
fig = px.scatter(prev_df[prev_df['Group'].isin(categories)], x='Prevalence', y='Median Relative Abundance', color='Group', hover_data=['Taxon'])
fig.update_layout(
    autosize=False,
    width=1400,
    height=1000,
    xaxis={'title': {'font': {'size': 18}}},
    yaxis={'title': {'font': {'size': 18}}},
    legend={'font': {'size': 16}}
)
fig.update_yaxes(tickformat='.5f')
fig.show()

In [None]:
# Same prevalence vs. median abundance scatter, on a log-scaled y axis.
# The original call ended in a dangling `color_discrete_map=` (syntax error).
# NOTE(review): `cmap` is the mapping passed as color_discrete_map to px.bar earlier
# in this section — confirm it is the intended colour map here.
fig = px.scatter(prev_df[prev_df['Group'].isin(categories)], x='Prevalence', y='Median Relative Abundance', color='Group', hover_data=['Taxon'],  
                 log_y=True, color_discrete_map=cmap)
fig.update_layout(
    autosize=False,
    width=1200,
    height=800,
    xaxis={'title': {'font': {'size': 20}}},
    yaxis={'title': {'font': {'size': 20}}},
    legend={'font': {'size': 20}}
)
fig.update_yaxes(tickformat='.6f')
fig.show()

### Relative abundance vs. Ratio

In [None]:
symbio_df

In [None]:
sns.scatterplot(symbio_df, x='Relative abundance', y='Ratio Honeybee', hue='Group')

## Core taxa & Relative abundance

In [None]:
core_relabund_df = core_symbio_df.pivot(index='Colony', columns='Taxon', values='Relative abundance').fillna(0).reset_index()
print(core_relabund_df.shape)
core_relabund_df

In [None]:
# core_relabund_df.to_csv('{}{}_RelativeAbundance_Core_{}.csv'.format(tlf_path, str(datetime.now().date()), level_name), index=False)
# pd.read_csv('{}{}_RelativeAbundance_Core_{}.csv'.format(tlf_path, str(datetime.now().date()), level_name))

In [None]:
# add metadata and reset indexes
pheno_core_relabund_df = pd.merge(pheno_df, core_relabund_df, on='Colony')
core_relabund_df = core_relabund_df.set_index('Colony')
pheno_core_relabund_df = pheno_core_relabund_df.set_index('Colony')

In [None]:
discrete_vars = ['month', 'region']
continous_vars =  ['Varroa Log Ratio Honeybee']

row_colors = get_row_colors(pheno_core_relabund_df, discrete_vars, continous_vars, var2cmap)

In [None]:
cg=sns.clustermap(core_relabund_df, metric='braycurtis', row_colors=row_colors, xticklabels=1, figsize=(12, 10))
cg.ax_row_colors.legend([Patch(facecolor=cmap_region[name]) for name in cmap_region], cmap_region, title='Region',bbox_to_anchor=(1, 1), bbox_transform=plt.gcf().transFigure, loc='upper left')
plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), fontsize=8)
# cg.ax_row_colors.legend([Patch(facecolor=cmap_months[name]) for name in cmap_months], cmap_months, title='Month',bbox_to_anchor=(1, 0), bbox_transform=plt.gcf().transFigure, loc='lower left')
cg.cax.set_visible(False)

### PCoA

In [None]:
dist_matrix = beta_diversity('braycurtis', core_relabund_df)
pcoa_results = pcoa(dist_matrix)
sns.barplot(pcoa_results.proportion_explained[:10])

In [None]:
pc_nr = 5
pc_cols = ['PC{}'.format(i + 1) for i in range(pc_nr)]
# first pc_nr PCoA axes, joined to the phenotype table by colony
pca_df = pd.merge(pheno_df, pd.DataFrame(pcoa_results.samples.values[:, :pc_nr], columns=pc_cols, index=dist_matrix.ids).reset_index().rename(columns={'index':'Colony'}), on='Colony')
# long format (one row per colony and PC); the PC columns are selected by name
# so that changing pc_nr does not silently melt the wrong columns
pca_melt_df = pd.melt(pca_df, id_vars=[c for c in pca_df.columns if c not in pc_cols], value_vars=pc_cols, var_name = 'PC', value_name='PC_value')

In [None]:
# PC1 vs PC2, coloured by 'group'
sns.set_style('whitegrid')
plt.figure(figsize=(8, 8))
x='PC1'
y='PC2'
sns.scatterplot(data=pca_df, x=x, y=y, hue='group')
plt.xlabel(x, fontsize=14)
plt.ylabel(y, fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# a single legend call: a second plt.legend() would rebuild the legend and drop the font size
plt.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
variables = ['region', 'group', 'year', 'month']
for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.boxplot,v,'PC_value')
    g.set_titles("{col_name}")

In [None]:
variables = [
    # 'group',
    # 'year',
    # 'month', 
    'percent_Ligustica_Carnica',
    'percent_Mellifera',
    'percent_Caucasica',
    'v_pho',
    'v_mito',
    'pca1',
    'recap_inf',
    'logit_recap_inf',
    'percent_mnr',
    'raw_mnr',
    'eb_mmr',
    'Varroa relative abundance'
]

for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.regplot,'PC_value',v,scatter_kws={'s':4})
    g.set_titles("{col_name}")

In [None]:
sns.lmplot(data=pca_melt_df[pca_melt_df['PC']=='PC3'], x='PC_value', y='percent_Ligustica_Carnica', hue='region', height=8)

### Varroa-associated taxa

Restricting this to the core set does not make much sense, given that there are only 30 core families --> compute on all families.

In [None]:
relabund_df = symbio_df.pivot(index='Colony', columns='Taxon', values='Relative abundance').fillna(0).reset_index()
print(relabund_df.shape)

In [None]:
# join the relative-abundance pivot with the phenotype table and add a shuffled
# copy of 'Varroa relative abundance' (a permutation of the observed values)
# NOTE(review): random.sample is not seeded here, so the shuffled column differs between runs
relabund_pheno_df = relabund_df.merge(pheno_df, on='Colony')
relabund_pheno_df['Shuffled Varroa relative abundance'] = random.sample(relabund_pheno_df['Varroa relative abundance'].to_list(), len(relabund_pheno_df))
relabund_pheno_df

In [None]:
y_variables = ['Varroa relative abundance', 'Shuffled Varroa relative abundance']
min_colony_nr = 5
con_methods = ['pearson']
cat_methods = []

pv_df = get_pv_df(relabund_pheno_df, relabund_df, y_variables, min_colony_nr, con_methods, cat_methods, lod)
pv_df

In [None]:
fdr_df = get_fdr_df(pv_df, y_variables, cat_methods + con_methods)
fdr_df

In [None]:
y_name = 'Varroa relative abundance'
# one regression plot per significant taxon, most significant first
for sp in fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == 'pearson') & (fdr_df['FDR'] <= 0.1)].sort_values('FDR')['Species']:
    # the columns hold untransformed relative abundances here, so the detection
    # filter must use lod itself; np.log(lod) is negative and kept every colony
    f = relabund_pheno_df[sp] >= lod
    sns.regplot(data=relabund_pheno_df[f], x=sp, y=y_name)
    plt.show()

In [None]:
# Core taxa significantly associated with the real response variable only:
# fdr_df also contains the shuffled control variable, whose hits must not be
# selected here.
sig_taxa = fdr_df[(fdr_df['FDR'] < 0.1) & (fdr_df['Y variable'] == 'Varroa relative abundance')]['Species']
relabund_sig_df = core_symbio_df[core_symbio_df['Taxon'].isin(sig_taxa)].pivot(index='Colony', columns='Taxon', values='Relative abundance').fillna(0).reset_index()
print(relabund_sig_df.shape)

In [None]:
# relabund_sig_df.to_csv('{}{}_RelativeAbundance_Core_VarroaAssociated_{}.csv'.format(tlf_path, str(datetime.now().date()), level_name), index=False)
# pd.read_csv('{}{}_RelativeAbundance_Core_VarroaAssociated_{}.csv'.format(tlf_path, str(datetime.now().date()), level_name))

##  Core taxa & Bee Log ratio

In [None]:
aitchison_df = core_symbio_df.pivot(index='Colony', columns='Taxon', values='Log Ratio Honeybee').fillna(np.log(1e-06)).reset_index()
print(aitchison_df.shape)

In [None]:
# aitchison_df.to_csv('{}{}_BeeLogRatio_Core_{}.csv'.format(tlf_path, str(datetime.now().date()), level_name), index=False)
# pd.read_csv('{}{}_BeeLogRatio_Core_{}.csv'.format(tlf_path, str(datetime.now().date()), level_name))

In [None]:
continous_vars =  ['Varroa Log Ratio Honeybee', 'logit_recap_inf', 'raw_mnr']
distance = 'euclidean'
plot_clustermap(aitchison_df, pheno_df, continous_vars, var2cmap, distance, figsize=(12, 9))

In [None]:
# add metadata and reset indexes
pheno_aitchison_df = pd.merge(pheno_df, aitchison_df, on='Colony')
aitchison_df = aitchison_df.set_index('Colony')
pheno_aitchison_df = pheno_aitchison_df.set_index('Colony')

In [None]:
discrete_vars = ['month', 'region']
continous_vars =  ['Varroa Log Ratio Honeybee']

row_colors = get_row_colors(pheno_aitchison_df, discrete_vars, continous_vars, var2cmap)

In [None]:
cg=sns.clustermap(aitchison_df, metric='euclidean', row_colors=row_colors, xticklabels=1, figsize=(12, 10))
cg.ax_row_colors.legend([Patch(facecolor=cmap_region[name]) for name in cmap_region], cmap_region, title='Region',bbox_to_anchor=(1, 1), bbox_transform=plt.gcf().transFigure, loc='upper left')
plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), fontsize=8)
# cg.ax_row_colors.legend([Patch(facecolor=cmap_months[name]) for name in cmap_months], cmap_months, title='Month',bbox_to_anchor=(1, 0), bbox_transform=plt.gcf().transFigure, loc='lower left')
cg.cax.set_visible(False)

#### Mantel tests

In [None]:
from skbio.stats.distance import mantel

In [None]:
df1 = core_symbio_df.pivot(index='Colony', columns='Taxon', values='Log Ratio Honeybee').fillna(np.log(1e-06))
df2 = core_symbio_df.pivot(index='Colony', columns='Taxon', values='Log Ratio Honeybee').fillna(np.log(1e-07))

In [None]:
# NOTE: it is unclear whether Bray-Curtis is meaningful on negative (log-ratio) values
dm1_bc = DistanceMatrix(squareform(pdist(df1, metric='braycurtis')), ids=df1.index)
# ids come from df2 itself, as for the Euclidean matrices in the next cell
dm2_bc = DistanceMatrix(squareform(pdist(df2, metric='braycurtis')), ids=df2.index)

In [None]:
dm1_eu = DistanceMatrix(squareform(pdist(df1, metric='euclidean')), ids=df1.index)
dm2_eu = DistanceMatrix(squareform(pdist(df2, metric='euclidean')), ids=df2.index)

In [None]:
# Perform Mantel test
r, p_value, n = mantel(dm1_eu, dm2_eu, method='pearson', permutations=999)
print(f"Mantel test statistic: {r}")

In [None]:
# Perform Mantel test
r, p_value, n = mantel(dm1_bc, dm2_bc, method='pearson', permutations=999)
print(f"Mantel test statistic: {r}")

In [None]:
# Perform Mantel test
r, p_value, n = mantel(dm2_bc, dm2_eu, method='pearson', permutations=999)
print(f"Mantel test statistic: {r}")

In [None]:
# Perform Mantel test
r, p_value, n = mantel(dm1_bc, dm1_eu, method='pearson', permutations=999)
print(f"Mantel test statistic: {r}")

Compare Bray-Curtis distances on relative abundance with Euclidean distances on log ratios.

In [None]:
df1 = core_symbio_df.pivot(index='Colony', columns='Taxon', values='Log Ratio Honeybee').fillna(np.log(1e-06))
df2 = core_symbio_df.pivot(index='Colony', columns='Taxon', values='Relative abundance').fillna(0)

In [None]:
dm1 = DistanceMatrix(squareform(pdist(df1, metric='euclidean')), ids=df1.index)
dm2 = DistanceMatrix(squareform(pdist(df2, metric='braycurtis')), ids=df2.index)

In [None]:
# Perform Mantel test
r, p_value, n = mantel(dm1, dm2, method='pearson', permutations=999)
print(f"Mantel test statistic: {r}")

In [None]:
# very different....

### PCA

In [None]:
# aitchison_df has already been indexed by 'Colony' in the metadata cell above;
# calling set_index('Colony') again would raise a KeyError, so only do it when
# 'Colony' is still a column.
pivot_df = aitchison_df.set_index('Colony') if 'Colony' in aitchison_df.columns else aitchison_df
pca = PCA()
x_pca = pca.fit_transform(pivot_df)
# variance ratio explained by the first 10 components
sns.barplot(pca.explained_variance_ratio_[:10])

In [None]:
pc_nr = 5
pc_cols = ['PC{}'.format(i + 1) for i in range(pc_nr)]
# first pc_nr principal components, joined to the phenotype table by colony
pca_df = pd.merge(pheno_df, pd.DataFrame(x_pca[:, :pc_nr], columns=pc_cols, index=pivot_df.index).reset_index().rename(columns={'index':'Colony'}), on='Colony')
# long format (one row per colony and PC); the PC columns are selected by name
# so that changing pc_nr does not silently melt the wrong columns
pca_melt_df = pd.melt(pca_df, id_vars=[c for c in pca_df.columns if c not in pc_cols], value_vars=pc_cols, var_name = 'PC', value_name='PC_value')

In [None]:
plot_pca(1, 2, pca_df, 'region', cmap_region, pca.explained_variance_ratio_)

In [None]:
plot_pca(3, 4, pca_df, 'region', cmap_region, pca.explained_variance_ratio_)

In [None]:
variables = ['region', 'group', 'year', 'month']
for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.boxplot,v,'PC_value')
    g.set_titles("{col_name}")

In [None]:
variables = [
    # 'group',
    # 'year',
    # 'month', 
    'percent_Ligustica_Carnica',
    'percent_Mellifera',
    'percent_Caucasica',
    'v_pho',
    'v_mito',
    'pca1',
    'recap_inf',
    'logit_recap_inf',
    'percent_mnr',
    'raw_mnr',
    'eb_mmr',
    'Varroa relative abundance',
    'Varroa Log Ratio Honeybee'
]

for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.regplot,'PC_value',v,scatter_kws={'s':4})
    g.set_titles("{col_name}")

### Varroa-associated taxa

In [None]:
aitchison_pheno_df = aitchison_df.merge(pheno_df, on='Colony')
aitchison_pheno_df['Shuffled Varroa Log Ratio Honeybee'] = random.sample(aitchison_pheno_df['Varroa Log Ratio Honeybee'].to_list(), len(aitchison_pheno_df))
aitchison_pheno_df

In [None]:
# response variables to test: the observed Varroa log ratio and its shuffled control
y_variables = ['Varroa Log Ratio Honeybee', 'Shuffled Varroa Log Ratio Honeybee']
min_colony_nr = 5
con_methods = ['pearson']
cat_methods = []

pv_df = get_pv_df(aitchison_pheno_df, aitchison_df, y_variables, min_colony_nr, con_methods, cat_methods, np.log(lod)) # important: the lod must be log-transformed here as well
pv_df

In [None]:
for method in con_methods:
    for y_name in y_variables:
        sns.histplot(data=pv_df[(pv_df['Y variable'] == y_name) & (pv_df['Method'] == method)], x='P-value', bins=100)
        plt.title('{} {}'.format(y_name, method))
        plt.show()

In [None]:
fdr_df = get_fdr_df(pv_df, y_variables, cat_methods + con_methods)
fdr_df

In [None]:
y_name = 'Varroa Log Ratio Honeybee'
for sp in fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == 'pearson') & (fdr_df['FDR'] <= 0.1)].sort_values('FDR')['Species']:
    f = aitchison_pheno_df[sp] > np.log(lod)
    sns.regplot(data=aitchison_pheno_df[f], x=sp, y=y_name)
    plt.show()

In [None]:
# Core taxa significantly associated with the real response variable only:
# fdr_df also contains the shuffled control variable, whose hits must not be
# selected here.
sig_taxa = fdr_df[(fdr_df['FDR'] < 0.1) & (fdr_df['Y variable'] == 'Varroa Log Ratio Honeybee')]['Species']
aitchison_sig_df = core_symbio_df[core_symbio_df['Taxon'].isin(sig_taxa)].pivot(index='Colony', columns='Taxon', values='Log Ratio Honeybee').fillna(np.log(1e-06)).reset_index()
print(aitchison_sig_df.shape)

In [None]:
# Rebuild the taxon -> group lookup from the current (species-level) symbio_df:
# within this section it is otherwise only defined further down ("per group"),
# so on a top-to-bottom run this cell would see a stale mapping.
taxon2group = dict(zip(symbio_df['Taxon'], symbio_df['Group']))
# number of significant taxa per group (minus 1 for the 'Colony' column)
for group in groups:
    print(group, filter_group_pivot_df(aitchison_sig_df, set([group]), taxon2group).shape[1] -1)

In [None]:
# aitchison_sig_df.to_csv('{}{}_BeeLogRatio_Core_VarroaAssociated_{}.csv'.format(tlf_path, str(datetime.now().date()), level_name), index=False)
# pd.read_csv('{}{}_BeeLogRatio_Core_VarroaAssociated_{}.csv'.format(tlf_path, str(datetime.now().date()),level_name))

#### family-enrichment

In [None]:
# fdr_df = fdr_df.drop('Family', axis=1)

In [None]:
# map every tested species to its family via its taxid
x = symbio_df[['Taxon', 'TaxID']].drop_duplicates()
sp_name2taxid = dict(zip(x['Taxon'], x['TaxID']))
sp2fam = {sp: get_parent_taxon_at_level(sp_name2taxid[sp], 'family') for sp in set(fdr_df['Species'])}
# DataFrame.insert raises if the column already exists; drop it first so the
# cell can be re-run without manually executing a separate "drop" cell
if 'Family' in fdr_df.columns:
    fdr_df = fdr_df.drop(columns='Family')
fdr_df.insert(1, 'Family', [sp2fam[sp] for sp in fdr_df['Species']])

In [None]:
y_name = 'Varroa Log Ratio Honeybee'
method = 'pearson'

a =fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == method) & (fdr_df['Stat'] > 0) & (fdr_df['FDR'] <= 0.1)]
b =fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == method) & (fdr_df['Stat'] < 0) & (fdr_df['FDR'] <= 0.1)]

In [None]:
print(len(a))
print(len(b))

In [None]:
x = a
print(x.shape)
x.sort_values('FDR', ascending=True)[['Species', 'Family', 'FDR']][:50]

In [None]:
x = b
print(x.shape)
x.sort_values('FDR', ascending=True)[['Species', 'Family', 'FDR']][:50]

In [None]:
#pos_con_fam_df = get_fam_enrich_df(fdr_df[(fdr_df['Y variable'] == 'Varroa relative abundance') & (fdr_df['Method'] == 'pearson') & (fdr_df['Stat'] > 0)])
pos_cat_fam_df = get_fam_enrich_df(fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == method) & (fdr_df['Stat'] > 0)])
# neg_con_fam_df = get_fam_enrich_df(fdr_df[(fdr_df['Y variable'] == 'Varroa relative abundance') & (fdr_df['Method'] == 'pearson') & (fdr_df['Stat'] < 0)])
neg_cat_fam_df = get_fam_enrich_df(fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == method) & (fdr_df['Stat'] < 0)])

In [None]:
pos_cat_fam_df[:20]

In [None]:
pos_cat_fam_df[pos_cat_fam_df['Observed count']>1][:20]

In [None]:
neg_cat_fam_df[neg_cat_fam_df['Observed count']>1][:20]

In [None]:
# number of significant taxa per group (minus 1 for the 'Colony' column).
# This section analyses the log-ratio hits, so count aitchison_sig_df; the
# original referenced jaccard_sig_df, which at this point still holds the result
# of an earlier section.
for group in groups:
    print(group, filter_group_pivot_df(aitchison_sig_df, set([group]), taxon2group).shape[1] -1)

#### heatmap

In [None]:
continous_vars =  ['Varroa Log Ratio Honeybee']
distance = 'euclidean'
plot_clustermap(aitchison_sig_df, pheno_df, continous_vars, var2cmap, distance, figsize=(12, 10))

## All taxa & Prevalence analyses

In [None]:
jaccard_df = symbio_df.pivot(index='Colony', columns='Taxon', values='Relative abundance').map(lambda x: 1 if x > 0 else x).fillna(0).reset_index()
jaccard_df[jaccard_df.columns[1:]] = jaccard_df.iloc[:, 1:].astype(int)
print(jaccard_df.shape)

In [None]:
# jaccard_df.to_csv('{}{}_Prevalence_All_{}.csv'.format(tlf_path, str(datetime.now().date()), level_name), index=False)
# pd.read_csv('{}{}_Prevalence_All_{}.csv'.format(tlf_path, str(datetime.now().date()), level_name))

In [None]:
continous_vars =  ['Varroa Log Ratio Honeybee', 'logit_recap_inf', 'raw_mnr']
distance = 'jaccard'
plot_clustermap(jaccard_df, pheno_df, continous_vars, var2cmap, distance, figsize=(12, 10))

In [None]:
# add metadata and reset indexes
pheno_jaccard_df = pd.merge(pheno_df, jaccard_df, on='Colony')
jaccard_df = jaccard_df.set_index('Colony')
pheno_jaccard_df = pheno_jaccard_df.set_index('Colony')

In [None]:
discrete_vars = ['month', 'region']
continous_vars =  ['Varroa Log Ratio Honeybee']

row_colors = get_row_colors(pheno_jaccard_df, discrete_vars, continous_vars, var2cmap)

In [None]:
cg=sns.clustermap(jaccard_df, metric='jaccard', row_colors=row_colors, figsize=(12, 10))
cg.ax_row_colors.legend([Patch(facecolor=cmap_region[name]) for name in cmap_region], cmap_region, title='Region',bbox_to_anchor=(1, 1), bbox_transform=plt.gcf().transFigure, loc='upper left')
plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), fontsize=8)
# cg.ax_row_colors.legend([Patch(facecolor=cmap_months[name]) for name in cmap_months], cmap_months, title='Month',bbox_to_anchor=(1, 0), bbox_transform=plt.gcf().transFigure, loc='lower left')
cg.cax.set_visible(False)

#### PCoA

In [None]:
jaccard_dm = DistanceMatrix(squareform(pdist(jaccard_df, metric='jaccard')), ids=jaccard_df.index)

pcoa_results = pcoa(jaccard_dm)

sns.barplot(pcoa_results.proportion_explained[:10])

In [None]:
pc_nr = 5
pc_cols = ['PC{}'.format(i + 1) for i in range(pc_nr)]
# first pc_nr PCoA axes, joined to the phenotype table by colony
pca_df = pd.merge(pheno_df, pd.DataFrame(pcoa_results.samples.values[:, :pc_nr], columns=pc_cols, index=jaccard_dm.ids).reset_index().rename(columns={'index':'Colony'}), on='Colony')

# long format (one row per colony and PC); the PC columns are selected by name
# so that changing pc_nr does not silently melt the wrong columns
pca_melt_df = pd.melt(pca_df, id_vars=[c for c in pca_df.columns if c not in pc_cols], value_vars=pc_cols, var_name = 'PC', value_name='PC_value')

In [None]:
plot_pca(1, 2, pca_df, 'region', cmap_region, list(pcoa_results.proportion_explained))

In [None]:
plot_pca(3, 4, pca_df, 'region', cmap_region, list(pcoa_results.proportion_explained))

In [None]:
plot_pca(1, 2, pca_df, 'group', cmap_group, list(pcoa_results.proportion_explained))

In [None]:
variables = ['region', 'group', 'year', 'month']
for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.boxplot,v,'PC_value')
    g.set_titles("{col_name}")

In [None]:
variables = [
    # 'group',
    # 'year',
    # 'month', 
    'percent_Ligustica_Carnica',
    'percent_Mellifera',
    'percent_Caucasica',
    'v_pho',
    'v_mito',
    'pca1',
    'recap_inf',
    'logit_recap_inf',
    'percent_mnr',
    'raw_mnr',
    'eb_mmr',
    'Varroa relative abundance'
]

for v in variables:
    g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
    g.map(sns.regplot,'PC_value',v,scatter_kws={'s':4})
    g.set_titles("{col_name}")

### Varroa-associated taxa

In [None]:
jaccard_df = symbio_df.pivot(index='Colony', columns='Taxon', values='Relative abundance').map(lambda x: 1 if x > 0 else x).fillna(0).reset_index()
jaccard_df[jaccard_df.columns[1:]] = jaccard_df.iloc[:, 1:].astype(int)
print(jaccard_df.shape)

In [None]:
# join the presence/absence pivot with the phenotype table and add a shuffled
# copy of 'Varroa Log Ratio Honeybee' (a permutation of the observed values)
# NOTE(review): random.sample is not seeded here, so the shuffled column differs between runs
jaccard_pheno_df = jaccard_df.merge(pheno_df, on='Colony')
# jaccard_pheno_df['Shuffled Varroa relative abundance'] = random.sample(jaccard_pheno_df['Varroa relative abundance'].to_list(), len(jaccard_pheno_df))
jaccard_pheno_df['Shuffled Varroa Log Ratio Honeybee'] = random.sample(jaccard_pheno_df['Varroa Log Ratio Honeybee'].to_list(), len(jaccard_pheno_df))
jaccard_pheno_df

In [None]:
y_variables = ['Varroa Log Ratio Honeybee', 'Shuffled Varroa Log Ratio Honeybee'] # , 'Varroa relative abundance', 'Shuffled Varroa relative abundance']
min_colony_nr = 5
con_methods = []
cat_methods = ['tt']

pv_df = get_pv_df(jaccard_pheno_df, jaccard_df, y_variables, min_colony_nr, con_methods, cat_methods, lod)
pv_df

In [None]:
for method in cat_methods:
    for y_name in y_variables:
        sns.histplot(data=pv_df[(pv_df['Y variable'] == y_name) & (pv_df['Method'] == method)], x='P-value', bins=100)
        plt.title('{} {}'.format(y_name, method))
        plt.show()

In [None]:
fdr_df = get_fdr_df(pv_df, y_variables, cat_methods + con_methods)
fdr_df

In [None]:
y_name = 'Varroa Log Ratio Honeybee'
for sp in fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == 'tt') & (fdr_df['FDR'] <= 0.1)].sort_values('FDR')['Species']:
    x = jaccard_pheno_df[jaccard_pheno_df[sp] >= lod][y_name].to_numpy()
    y = jaccard_pheno_df[jaccard_pheno_df[sp] < lod][y_name].to_numpy()
    plt.boxplot([x, y], tick_labels=['Presence (n={})'.format(len(x)), 'Absence (n={})'.format(len(y))])
    plt.ylabel(y_name)
    plt.title(sp)
    plt.show()

In [None]:
y_name = 'Varroa Log Ratio Honeybee'
jaccard_sig_df = symbio_df[symbio_df['Taxon'].isin(fdr_df[(fdr_df['FDR'] < 0.1) & (fdr_df['Y variable']==y_name)]['Species'])].pivot(index='Colony', columns='Taxon', values='Relative abundance').map(lambda x: 1 if x > 0 else x).fillna(0).reset_index()
jaccard_sig_df[jaccard_sig_df.columns[1:]] = jaccard_sig_df.iloc[:, 1:].astype(int)
print(jaccard_sig_df.shape)

In [None]:
# jaccard_sig_df.to_csv('{}{}_Prevalence_All_VarroaAssociated_{}.csv'.format(tlf_path, str(datetime.now().date()), level_name), index=False)
# pd.read_csv('{}{}_Prevalence_All_VarroaAssociated_{}.csv'.format(tlf_path, str(datetime.now().date()), level_name))

#### family-enrichment

In [None]:
# fdr_df = fdr_df.drop('Family', axis=1)

In [None]:
# map every tested species to its family via its taxid
x = symbio_df[['Taxon', 'TaxID']].drop_duplicates()
sp_name2taxid = dict(zip(x['Taxon'], x['TaxID']))
sp2fam = {sp: get_parent_taxon_at_level(sp_name2taxid[sp], 'family') for sp in set(fdr_df['Species'])}
# DataFrame.insert raises if the column already exists; drop it first so the
# cell can be re-run without manually executing a separate "drop" cell
if 'Family' in fdr_df.columns:
    fdr_df = fdr_df.drop(columns='Family')
fdr_df.insert(1, 'Family', [sp2fam[sp] for sp in fdr_df['Species']])

In [None]:
y_name = 'Varroa Log Ratio Honeybee'

a =fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == 'tt') & (fdr_df['Stat'] > 0) & (fdr_df['FDR'] <= 0.1)]
b =fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == 'tt') & (fdr_df['Stat'] < 0) & (fdr_df['FDR'] <= 0.1)]

In [None]:
print(len(a))
print(len(b))

In [None]:
x = a
print(x.shape)
x.sort_values('FDR', ascending=True)[['Species', 'Family', 'FDR']][:50]

In [None]:
x = b
print(x.shape)
x.sort_values('FDR', ascending=True)[['Species', 'Family', 'FDR']][:50]

In [None]:
# Family enrichment among the t-test results for y_name, computed separately for species
# with a positive and with a negative test statistic.
#pos_con_fam_df = get_fam_enrich_df(fdr_df[(fdr_df['Y variable'] == 'Varroa relative abundance') & (fdr_df['Method'] == 'pearson') & (fdr_df['Stat'] > 0)])
# neg_con_fam_df = get_fam_enrich_df(fdr_df[(fdr_df['Y variable'] == 'Varroa relative abundance') & (fdr_df['Method'] == 'pearson') & (fdr_df['Stat'] < 0)])
tt_results = fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == 'tt')]
pos_cat_fam_df = get_fam_enrich_df(tt_results[tt_results['Stat'] > 0])
neg_cat_fam_df = get_fam_enrich_df(tt_results[tt_results['Stat'] < 0])

In [None]:
pos_cat_fam_df[pos_cat_fam_df['Observed count']>1][:20]

In [None]:
neg_cat_fam_df[neg_cat_fam_df['Observed count']>1][:20]

#### heatmap

In [None]:
continous_vars =  ['Varroa Log Ratio Honeybee']
distance = 'jaccard'
plot_clustermap(jaccard_sig_df, pheno_df, continous_vars, var2cmap, distance, figsize=(12, 10))

#### per group

In [None]:
taxon2group = dict(zip(symbio_df['Taxon'], symbio_df['Group']))

In [None]:
groups = ['Viruses', 'Bacteria', 'Archaea', 'Other Eukaryota',  'Embryophyta', 'Fungi']

In [None]:
for group in groups:
    print(group, filter_group_pivot_df(jaccard_sig_df, set([group]), taxon2group).shape[1] -1)

In [None]:
# One presence/absence clustermap per taxonomic group that has at least two significant taxa.
continous_vars =  ['Varroa Log Ratio Honeybee']
group2figsize = {'Embryophyta': (12, 8)}  # groups that need a non-default figure size
distance = 'jaccard'
for group in groups:
    # Pass the group as a one-element collection, as the per-group count cell above does
    # (set([group])) and as the family-based calls do (a list); a bare string would only
    # work if the helper happens to tolerate substring/character membership tests.
    f_df = filter_group_pivot_df(jaccard_sig_df, {group}, taxon2group)
    # column 0 is 'Colony', so "> 2" means at least two taxa remain for clustering
    if f_df.shape[1] > 2:
        print(group)
        plot_clustermap(f_df, pheno_df, continous_vars, var2cmap, distance, group2figsize.get(group, (12, 10)))

In [None]:
# cluster only based on these two families
# (the selection is by family: filter_group_pivot_df receives the species -> family mapping sp2fam)
families = ['Morganellaceae', 'Spiroplasmataceae']
# keep the returned object in cg: the next cell reads cg.data2d and the dendrogram linkages from it
cg = plot_clustermap(filter_group_pivot_df(jaccard_sig_df, families, sp2fam), pheno_df, continous_vars, var2cmap, distance, figsize=(12, 10))

In [None]:
# keep original clustering but zoom on these two families
# cg is the object returned by plot_clustermap in the previous cell.
families = ['Morganellaceae', 'Spiroplasmataceae']
# NOTE(review): row_cluster/col_cluster are False, so seaborn presumably ignores the linkages
# passed here and simply draws the rows/columns in the order of the filtered cg.data2d — confirm.
c = sns.clustermap(filter_group_pivot_df(cg.data2d, families, sp2fam), row_cluster=False, col_cluster=False, row_linkage=cg.dendrogram_row.linkage, col_linkage=cg.dendrogram_col.linkage)
# shorten the tick labels with the notebook's truncate() helper
c.ax_heatmap.set_xticklabels([truncate(label.get_text()) for label in c.ax_heatmap.get_xticklabels()])
c.ax_heatmap.set_yticklabels([truncate(label.get_text()) for label in c.ax_heatmap.get_yticklabels()])

# Bracken species-level[OLD]

In [None]:
## which high level taxa we care about?
taxonomic_groups = set(('root', 'Viruses','Bacteria', 'Archaea', 'Eukaryota', 'Embryophyta', 'Fungi', 'Metazoa', 'Apis','Varroa'))
# display names for the catch-all groups
group2name = {'root': 'Other Root', 'Eukaryota': 'Other Eukaryota'}

# display name -> taxid (first taxid returned for each scientific name) and the reverse lookup
name2taxids = ncbi.get_name_translator(taxonomic_groups)
group_name2taxid = {
    group2name.get(sci_name, sci_name): taxids[0]
    for sci_name, taxids in name2taxids.items()
}
group_taxid2name = {taxid: name for name, taxid in group_name2taxid.items()}
groups = [group2name.get(sci_name, sci_name) for sci_name in taxonomic_groups]

## beexact
beexact_sp_name2taxid = get_beexact_species_name2taxid()
# the values are iterables of taxids; flatten them into one set
beexact_sp_taxids = set(chain.from_iterable(beexact_sp_name2taxid.values()))
beexact_sp_names = set(beexact_sp_name2taxid)


## Classification overview of key taxonomic levels 

In [None]:
# Colonies to process: every BeeStrong id of bs_sra_df except the two skipped ones.
skip_bs_ids = {'BS17_0674', 'BS17_0721'}
bs_ids = sorted(set(bs_sra_df['num_bs'].to_list()) - skip_bs_ids)
krakdb, readpool, mhg, cs, sf, r, level = ('corent', 'nonbee', 2, '005', 1, 1, 'S')

# topology restricted to the key groups; used to find which groups are nested in which
tree = ncbi.get_topology(list(group_name2taxid.values()))

columns = ['Colony', 'Category', 'Read number', 'Relative abundance']
rows = []
for bs_id in tqdm(bs_ids):
    ucseqs_nr, cseqs_nr = parse_kreport(kraken2_path, bs_id, krakdb, readpool, mhg, cs, sf, r)
    taxa2seqnr = parse_breport(bracken_path, bs_id, krakdb, readpool, mhg, cs, sf, r, level, taxonomic_groups)
    classified_read_nr = get_classified_read_nr(bowtie2_path, kraken2_path, bracken_path, bs_id, krakdb, readpool, mhg, cs, sf, r, level)

    # These two categories are appended as 3-tuples: no relative abundance is given for them.
    rows.extend([
        (bs_id, 'Unclassified', ucseqs_nr),
        (bs_id, 'Kraken2 only', cseqs_nr - taxa2seqnr['root']),
    ])

    for taxon in taxonomic_groups:
        name = group2name.get(taxon, taxon)
        node = tree&group_name2taxid[name]
        # reads of the other key groups returned by get_desc_taxa for this node are subtracted,
        # so every read is counted in exactly one category
        other_groups = taxonomic_groups.difference({node.sci_name})
        nested_read_nr = sum(taxa2seqnr.get(nested, 0) for nested in get_desc_taxa(node, other_groups))
        read_nr = taxa2seqnr.get(taxon, 0) - nested_read_nr
        rows.append((bs_id, name, read_nr, read_nr / classified_read_nr))

class_df = pd.DataFrame(data=rows, columns=columns)

In [None]:
class_df

In [None]:
cmap = {
    'Unclassified': '#262626',
    'Kraken2 only': '#575757',
    'Other Root': '#949494',
    'Viruses': '#0173b2',
    'Bacteria': '#de8f05',
    'Archaea': '#d55e00',
    'Other Eukaryota': '#029e73',
    'Embryophyta':'#12634c',
    'Apis': '#75c8b0',
    'Fungi': '#cc78bc',
    'Metazoa': '#ca9161',
    'Varroa': '#ece133'
}

categories=['Other Root', 'Viruses','Bacteria', 'Archaea','Other Eukaryota','Embryophyta','Fungi','Metazoa', 'Varroa']

In [None]:
# Stacked bars of absolute read numbers per colony, one segment per category
# (category order and colours both come from cmap).
fig = px.bar(
    class_df,
    x="Colony",
    y="Read number",
    color="Category",
    hover_data=['Category'],
    barmode='stack',
    color_discrete_map=cmap,
    category_orders={"Category": cmap.keys()},
)
fig.update_layout(autosize=False, width=1500, height=700)
fig.show()

In [None]:
# Stacked bars of relative abundance per colony, restricted to the categories listed in
# `categories` (i.e. without 'Unclassified', 'Kraken2 only' and 'Apis').
classified_df = class_df[class_df['Category'].isin(categories)]
fig = px.bar(
    classified_df,
    x="Colony",
    y="Relative abundance",
    color="Category",
    hover_data=['Category'],
    barmode='stack',
    color_discrete_map=cmap,
    category_orders={"Category": categories},
)
axis_title_style = {'title': {'font': {'size': 18}}}
fig.update_layout(
    autosize=False,
    width=1000,
    height=700,
    xaxis=axis_title_style,
    yaxis=axis_title_style,
    legend={'font': {'size': 16}},
)
fig.show()

In [None]:
# Distribution of read numbers per category across colonies (log scale, red dot = mean).
plt.figure(figsize=(8, 6))
# seaborn >= 0.13 deprecates `palette` without `hue`: map hue to the x variable and hide the
# redundant legend to keep the same per-category colours without the FutureWarning.
ax = sns.boxplot(data=class_df, x='Category', y='Read number', hue='Category', palette=cmap, legend=False,
            showmeans=True, meanprops={"marker":"o", "markerfacecolor":"red", "markeredgecolor":"black", "markersize":"5"},order=list(cmap.keys()))

ax.set_yscale("log")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of relative abundances per category across colonies (log scale, red dot = mean),
# restricted to the categories listed in `categories`.
plt.figure(figsize=(8, 6))
# seaborn >= 0.13 deprecates `palette` without `hue`: map hue to the x variable and hide the
# redundant legend to keep the same per-category colours without the FutureWarning.
ax = sns.boxplot(data=class_df[class_df['Category'].isin(categories)], x='Category', y='Relative abundance', hue='Category', palette=cmap, legend=False,
            showmeans=True, meanprops={"marker":"o", "markerfacecolor":"red", "markeredgecolor":"black", "markersize":"5"}, order=categories)

ax.set_yscale("log")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

## Species-level classification

In [None]:
# parse bracken tables
skip_bs_ids = {'BS17_0674', 'BS17_0721'}
bs_ids = sorted(set(bs_sra_df['num_bs'].to_list()) - skip_bs_ids)
krakdb, readpool, mhg, cs, sf, r, level = ('corent', 'nonbee', 2, '005', 1, 1, 'S')

# columns kept from each per-colony table, and their names in the combined table
bracken_columns = ['name', 'taxonomy_id', 'fraction_total_reads', 'new_est_reads', 'log_ratio_honeybee', 'ratio_honeybee']
column_renames = {
    'name': 'Taxon',
    'taxonomy_id': 'TaxID',
    'fraction_total_reads': 'Relative abundance',
    'new_est_reads': 'Read number',
    'ratio_honeybee': 'Ratio Honeybee',
    'log_ratio_honeybee': 'Log Ratio Honeybee',
}

df_list = []
for bs_id in tqdm(bs_ids):
    df = get_bracken_df(bowtie2_path, kraken2_path, bracken_path, bs_id, krakdb, readpool, mhg, cs, sf, r, level, {})
    df = df[bracken_columns]
    # tag every row with its colony id (as a string)
    df.insert(0, 'Colony', ['{}'.format(bs_id)] * len(df))
    df_list.append(df)

# long table: one row per (colony, taxon), with a fresh 0..n-1 index
brack_df = pd.concat(df_list).rename(columns=column_renames).reset_index().drop(columns=['index'])

# enhance brack df with key taxonomic groups (from previous section)
taxid2parent_group = {}
for tid in tqdm(set(brack_df['TaxID'])):
    # when several key groups occur in the lineage, the last one in lineage order is kept
    groups_in_lineage = [group_taxid2name[p] for p in ncbi.get_lineage(tid) if p in group_taxid2name]
    if groups_in_lineage:
        taxid2parent_group[tid] = groups_in_lineage[-1]

brack_df.insert(1, 'Group', [taxid2parent_group[x] for x in brack_df['TaxID']])
brack_df

In [None]:
lod = 5e-06
categories=['Other Root', 'Viruses','Bacteria', 'Archaea','Other Eukaryota','Embryophyta','Fungi','Metazoa', 'Varroa']

# Number of distinct taxa per group, without and with the limit of detection (LoD) filter.
taxa_per_group = brack_df.groupby('Group')['Taxon'].nunique()
taxa_per_group_lod = brack_df[brack_df['Relative abundance'] >= lod].groupby('Group')['Taxon'].nunique()
df1 = taxa_per_group.to_frame('No LoD')
df2 = taxa_per_group_lod.to_frame('LoD {}'.format(lod))

# side-by-side table, rows ordered as in `categories`
df = df1.merge(df2, on='Group').reindex(categories)
df

In [None]:
# temporary definition of symbiosphere constituents:
# viruses, bacteria and fungi detected at or above the limit of detection
lod = 5e-06
#f = (brack_df['Group'].isin({'Viruses', 'Bacteria', 'Fungi'})) | (brack_df['Taxon'].isin({'Lotmaria passim'}))
symbio_groups = {'Viruses', 'Bacteria', 'Fungi'}
f = brack_df['Group'].isin(symbio_groups)
above_lod = brack_df['Relative abundance'] >= lod
symbio_df = brack_df[f & above_lod]

In [None]:
# Attach each colony's Varroa destructor relative abundance to the phenotype table.
varroa_abund_df = brack_df[brack_df['Taxon']=='Varroa destructor'][['Colony', 'Relative abundance']].rename(columns={'Relative abundance': 'Varroa relative abundance'})
# Drop the column first if it is already there (cell re-run): otherwise merge creates
# '_x'/'_y' suffixed duplicates and the fillna below fails with a KeyError.
pheno_df = pheno_df.drop(columns=['Varroa relative abundance'], errors='ignore').merge(varroa_abund_df,
              on='Colony', how='left')
# choice of treating NA varroa abundance as 0 values
pheno_df['Varroa relative abundance'] = pheno_df['Varroa relative abundance'].fillna(0)

In [None]:
pheno_df

In [None]:
# for Kevin: export the symbiosphere abundances and the phenotypes as date-stamped CSV files
export_date = str(datetime.now().date())
symbio_df.to_csv('{}{}_symbiosphere_abundances_BeeStrong.csv'.format(tlf_path, export_date), index=False)
pheno_df.to_csv('{}{}_phenotypes_BeeStrong.csv'.format(tlf_path, export_date), index=False)

### Richness vs. Library size

Does richness depend on library size? Does it still depend on it after applying the LoD?

Does the dependence vary between clades?

In [None]:
lod = 5e-06
# lod = 1e-05

# library size of every colony, joined to both richness tables
library_size_df = pheno_df[['Colony', 'Total read nr']]

# Richness = number of taxon rows per colony, first without an abundance cut-off ...
taxa_per_colony = brack_df[(brack_df['Relative abundance'] >= 0)].groupby('Colony')['Taxon'].count()
richness_df = taxa_per_colony.rename('Richness').reset_index().merge(library_size_df, how='inner')

# ... then counting only taxa at or above the limit of detection.
taxa_per_colony_lod = brack_df[(brack_df['Relative abundance'] >= lod)].groupby('Colony')['Taxon'].count()
richness_df_lod = taxa_per_colony_lod.rename('Richness').reset_index().merge(library_size_df, how='inner')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7.5))

# left panel: no LoD; right panel: with LoD; the correlation is reported in each title
corr_no_lod = round(richness_df['Richness'].corr(richness_df['Total read nr']),ndigits=3)
sns.regplot(data=richness_df, x='Total read nr', y='Richness', ax=ax1)
ax1.set_title('no LoD, corr={}'.format(corr_no_lod))

corr_lod = round(richness_df_lod['Richness'].corr(richness_df_lod['Total read nr']),ndigits=3)
sns.regplot(data=richness_df_lod, x='Total read nr', y='Richness', ax=ax2)
ax2.set_title('LoD {}, corr={}'.format(lod, corr_lod))

plt.tight_layout()
plt.show()

In [None]:
richness_df

In [None]:
# Same richness vs. library size comparison, one figure per taxonomic group.
# The per-group tables get their own names so that the whole-dataset richness_df /
# richness_df_lod built in the previous cell are not overwritten: later diagnostic cells
# still read them and would otherwise silently see the last group's values.
for group in categories:
    group_df = brack_df[brack_df['Group'] == group]

    # richness without abundance cut-off
    x = group_df[group_df['Relative abundance'] >= 0].groupby('Colony').agg({'Taxon' : ['count']})['Taxon']
    group_richness_df = pd.DataFrame({
        'Colony' : x.index,
        'Richness' : x['count']
    }).reset_index(drop=True).merge(pheno_df[['Colony', 'Total read nr']], how='inner')

    # richness counting only taxa at or above the limit of detection
    x = group_df[group_df['Relative abundance'] >= lod].groupby('Colony').agg({'Taxon' : ['count']})['Taxon']
    group_richness_df_lod = pd.DataFrame({
        'Colony' : x.index,
        'Richness' : x['count']
    }).reset_index(drop=True).merge(pheno_df[['Colony', 'Total read nr']], how='inner')

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7.5))

    sns.regplot(data=group_richness_df, x='Total read nr', y='Richness', ax=ax1)
    ax1.set_title('{} no LoD, corr={}'.format(group, round(group_richness_df['Richness'].corr(group_richness_df['Total read nr']),ndigits=3)))

    sns.regplot(data=group_richness_df_lod, x='Total read nr', y='Richness', ax=ax2)
    ax2.set_title('{} LoD {}, corr={}'.format(group, lod, round(group_richness_df_lod['Richness'].corr(group_richness_df_lod['Total read nr']),ndigits=3)))

    plt.tight_layout()
    plt.show()

## Varroa infestation

In [None]:
pheno_df

In [None]:
# Per-colony Varroa destructor measures taken from the Bracken table.
varroa_measures_df = brack_df[brack_df['Taxon']=='Varroa destructor'][['Colony', 'Relative abundance', 'Ratio Honeybee']].rename(columns={
    'Relative abundance': 'Varroa relative abundance', 'Ratio Honeybee': 'Varroa Ratio Honeybee'})
# pheno_df may already carry a zero-filled 'Varroa relative abundance' column (added in the
# species-level section). Drop it before merging: otherwise the merge produces '_x'/'_y'
# suffixed columns and every later varroa_df['Varroa relative abundance'] lookup fails.
# Dropping it also keeps NA (not 0) for colonies without Varroa reads, which the NA filter
# of the comparison cells below relies on.
varroa_df = pheno_df.drop(columns=['Varroa relative abundance'], errors='ignore').reset_index(drop=True).merge(varroa_measures_df, on='Colony', how='left')

In [None]:
varroa_df

In [None]:
varroa_df['Varroa relative abundance']


In [None]:
varroa_df['Varroa relative abundance']

In [None]:
# # my NA means 0 but I don't know what sonia's NA means
# # varroa_df = varroa_df.fillna({'Relative abundance': 0, 'Log Ratio Honeybee': np.log(1e-07)})
# 
# x = varroa_df['Nb varroas/100 bees'].to_numpy()
# y = np.full(len(varroa_df), np.nan)
# y[x > 0] = np.log(x[x > 0])
# varroa_df['Log Nb varroas/100 bees'] = y

### compare measures

Compare my measures (Log Ratio Honeybee and Relative abundance) with Sonia's measure (VarroaMitoRatio).

For a fair comparison, filter out rows with NA or 0 values in any of these measures.

In [None]:
# compare measure when they are not NA
sum(varroa_df['Varroa relative abundance'].isna() | varroa_df['v_mito'].isna())

In [None]:
varroa_df_f = varroa_df[~(varroa_df['Varroa relative abundance'].isna() | varroa_df['v_mito'].isna())]

In [None]:
varroa_df_f

In [None]:
# correlation (pandas default: Pearson) and scatter of v_pho vs. Varroa relative abundance
x_col, y_col = 'v_pho', 'Varroa relative abundance'
print(varroa_df_f[x_col].corr(varroa_df_f[y_col]))

fig, ax = plt.subplots(figsize=figsize)
sns.scatterplot(data=varroa_df_f, x=x_col, y=y_col, ax=ax)
plt.show()

In [None]:
# correlation (pandas default: Pearson) and scatter of v_pho vs. Varroa Ratio Honeybee
x_col, y_col = 'v_pho', 'Varroa Ratio Honeybee'
print(varroa_df_f[x_col].corr(varroa_df_f[y_col]))
fig, ax = plt.subplots(figsize=figsize)
sns.scatterplot(data=varroa_df_f, x=x_col, y=y_col, ax=ax)
plt.show()

In [None]:
# correlation (pandas default: Pearson) and scatter of v_pho vs. v_mito
x_col, y_col = 'v_pho', 'v_mito'
print(varroa_df_f[x_col].corr(varroa_df_f[y_col]))
fig, ax = plt.subplots(figsize=figsize)
sns.scatterplot(data=varroa_df_f, x=x_col, y=y_col, ax=ax)
plt.show()

very similar results, but relative abundance is actually better than ratio

In [None]:
# correlation (pandas default: Pearson) and scatter of Varroa relative abundance vs. v_mito
x_col, y_col = 'Varroa relative abundance', 'v_mito'
print(varroa_df_f[x_col].corr(varroa_df_f[y_col]))
fig, ax = plt.subplots(figsize=figsize)
sns.scatterplot(data=varroa_df_f, x=x_col, y=y_col, ax=ax)
plt.show()

There are a few outliers where my method gives a lower estimate of the varroa load than Sonia's.

In [None]:
# correlation (pandas default: Pearson) and scatter of Varroa Ratio Honeybee vs. v_mito
x_col, y_col = 'Varroa Ratio Honeybee', 'v_mito'
print(varroa_df_f[x_col].corr(varroa_df_f[y_col]))
fig, ax = plt.subplots(figsize=figsize)
sns.scatterplot(data=varroa_df_f, x=x_col, y=y_col, ax=ax)
plt.show()

In [None]:
sns.scatterplot(data=varroa_df_f, x='Varroa Ratio Honeybee', y='Varroa relative abundance')
print(varroa_df_f['Varroa Ratio Honeybee'].corr(varroa_df_f['Varroa relative abundance']))

My two measures are extremely similar to each other, more so than either is to Sonia's measure, which makes sense.

### swiss colonies

In [None]:
varroa_df_f_ch = varroa_df_f[varroa_df_f['Colony'].isin({'BS18_%04d' % i for i in range(1, 186)})]

In [None]:
# BS18_0001..BS18_0185 subset: correlation and scatter of v_pho vs. Varroa relative abundance
x_col, y_col = 'v_pho', 'Varroa relative abundance'
print(varroa_df_f_ch[x_col].corr(varroa_df_f_ch[y_col]))

fig, ax = plt.subplots(figsize=figsize)
sns.scatterplot(data=varroa_df_f_ch, x=x_col, y=y_col, ax=ax)
plt.show()

In [None]:
# BS18_0001..BS18_0185 subset: correlation and scatter of v_pho vs. v_mito
x_col, y_col = 'v_pho', 'v_mito'
print(varroa_df_f_ch[x_col].corr(varroa_df_f_ch[y_col]))
fig, ax = plt.subplots(figsize=figsize)
sns.scatterplot(data=varroa_df_f_ch, x=x_col, y=y_col, ax=ax)
plt.show()

### include all samples

In [None]:
varroa_df.shape

In [None]:
sns.scatterplot(data=varroa_df, x='v_pho', y='Varroa relative abundance')
print(varroa_df['v_pho'].corr(varroa_df['Varroa relative abundance']))

In [None]:
sns.scatterplot(data=varroa_df, x='v_pho', y='v_mito')
print(varroa_df['v_pho'].corr(varroa_df['v_mito']))

### other variables

In [None]:
varroa_df

In [None]:
# pca1 I think is a combination of the 4 varroa load measures
sns.scatterplot(data=varroa_df, x='pca1', y='Varroa relative abundance')
print(varroa_df['pca1'].corr(varroa_df['Varroa relative abundance']))

In [None]:
# not sure but maybe it is the brood infestation
sns.scatterplot(data=varroa_df, x='nbr_inf', y='Varroa relative abundance')
print(varroa_df['nbr_inf'].corr(varroa_df['Varroa relative abundance']))

In [None]:
# should be combination of brood and phoretic varroas
sns.scatterplot(data=varroa_df, x='v_load', y='Varroa relative abundance')
print(varroa_df['v_load'].corr(varroa_df['Varroa relative abundance']))

In [None]:
# just v_pho rounded I guess
sns.scatterplot(data=varroa_df, x='nbr_pho_varroa_100bee', y='Varroa relative abundance')
print(varroa_df['nbr_pho_varroa_100bee'].corr(varroa_df['Varroa relative abundance']))

## Species composition

In [None]:
lod = 5e-06
# lod = 1e-05

brack_df_lod = brack_df[(brack_df['Relative abundance'] >= lod)]

In [None]:
# calculate mean and median abundance of each species
# (computed over the colonies where the species is at or above the LoD)
abund_by_taxon = brack_df_lod.groupby('Taxon')['Relative abundance']
median_abund_df = abund_by_taxon.agg(['median'])
mean_abund_df = abund_by_taxon.agg(['mean'])

# taxon name -> summary value lookups
sp2median_abund = median_abund_df['median'].to_dict()
sp2mean_abund = mean_abund_df['mean'].to_dict()

In [None]:
x = brack_df_lod[['Taxon', 'TaxID']].drop_duplicates()
sp_name2taxid = dict(zip(x['Taxon'], x['TaxID']))

In [None]:
species, counts = np.unique(brack_df_lod['Taxon'], return_counts=True)
species_freqs = counts / len(bs_ids)
in_beexact = [x in beexact_sp_names for x in species]

In [None]:
x = brack_df_lod[['Taxon', 'TaxID']].drop_duplicates()
sp_name2taxid = dict(zip(x['Taxon'], x['TaxID']))


In [None]:
#core = [x >= 0.5 for x in species_freqs]
# One row per taxon detected at or above the LoD: taxonomy, prevalence across colonies,
# abundance summaries and BEExact membership.
species_taxids = [sp_name2taxid[sp] for sp in species]
prev_df = pd.DataFrame({
    'Taxon' : list(species),
    'Family': [get_parent_taxon_at_level(taxid, 'family') for taxid in species_taxids],
    'Group': [taxid2parent_group[taxid] for taxid in species_taxids],
    'TaxID' : species_taxids,
    'Prevalence' : list(species_freqs),
    'Median Relative Abundance': [sp2median_abund[sp] for sp in species],
    'Mean Relative Abundance': [sp2mean_abund[sp] for sp in species],
    'In BEExact' : in_beexact,
    # 'Core' : core
})

In [None]:
prev_df

In [None]:
# Prevalence vs. median relative abundance, one interactive scatter per taxonomic group,
# coloured by BEExact membership.
axis_title_style = {'title': {'font': {'size': 18}}}
scatter_layout = dict(
    autosize=False,
    width=700,
    height=500,
    xaxis=axis_title_style,
    yaxis=axis_title_style,
    legend={'font': {'size': 16}},
)
for group in categories:
    group_prev_df = prev_df[prev_df['Group'] == group]
    fig = px.scatter(group_prev_df, x='Prevalence', y='Median Relative Abundance', color='In BEExact', hover_data=['Taxon'], title=group)
    fig.update_layout(**scatter_layout)
    # fixed-point tick labels with five decimals
    fig.update_yaxes(tickformat='.5f')
    fig.show()

In [None]:
prev_df[prev_df['Taxon'] == 'Acidovorax sp. JMULE5']

In [None]:
0.000005

In [None]:
# log version: same prevalence vs. median abundance scatter with a logarithmic y axis,
# for a subset of the groups
axis_title_style = {'title': {'font': {'size': 18}}}
log_scatter_layout = dict(
    autosize=False,
    width=700,
    height=500,
    xaxis=axis_title_style,
    yaxis=axis_title_style,
    legend={'font': {'size': 16}},
)
for group in ['Other Eukaryota','Embryophyta','Fungi','Viruses','Bacteria']:
    group_prev_df = prev_df[prev_df['Group'] == group]
    fig = px.scatter(group_prev_df, x='Prevalence', y='Median Relative Abundance', color='In BEExact', hover_data=['Taxon'], title=group, log_y=True)
    fig.update_layout(**log_scatter_layout)
    fig.show()

## 1-1 associations

In [None]:
import statsmodels.api as sm
import statsmodels.stats.multitest as smm
from scipy import stats

from scipy.stats import pearsonr, mannwhitneyu,  spearmanr
from matplotlib_venn import venn3, venn2

from skbio.stats.ordination import pcoa
from skbio import DistanceMatrix
from scipy.spatial.distance import pdist, squareform
from skbio.diversity import beta_diversity


In [None]:
symbio_pivot_df = symbio_df.pivot(index='Colony', columns='Taxon', values='Relative abundance').reset_index()
# symbio_pivot_df = pd.merge(pheno_df[['v_pho']].dropna(), symbio_pivot_df, left_index=True, right_index=True)
symbio_pivot_df

In [None]:
# Colony-level table: species abundances (wide format) next to the phenotypes.
symbio_pheno_df = symbio_pivot_df.merge(pheno_df, on='Colony')

# Negative controls: permute each phenotype across colonies to break any real association.
# A dedicated, seeded generator makes the shuffles (and every p-value computed from them)
# reproducible across kernel restarts; the original used the unseeded global generator.
shuffle_rng = random.Random(42)
for pheno_col in ['Varroa relative abundance', 'v_pho', 'v_mito']:
    pheno_values = symbio_pheno_df[pheno_col].to_list()
    symbio_pheno_df['Shuffled {}'.format(pheno_col)] = shuffle_rng.sample(pheno_values, len(pheno_values))

symbio_pheno_df

### compute p-values

In [None]:
# minimum number of colonies required in a group before a test is run
min_colony_nr = 5
# y_variables = ['v_pho', 'Varroa relative abundance', 'v_mito', 'recap_inf', 'raw_mnr', 'bee_weigth', 'nbr_open', 'random']
con_methods = ['pearson', 'pearson_perm']
#con_methods = ['lr', 'pearson', 'spearman']

cat_methods = ['tt', 'wilcoxon' ]

y_variables = ['v_pho', 'Shuffled v_pho', 'v_mito', 'Shuffled v_mito', 'Varroa relative abundance', 'Shuffled Varroa relative abundance', 'bee_weigth', 'nbr_open_brood']

columns = ['Species', 'Y variable', 'Method', 'Stat', 'P-value']
rows = []
# the species columns are the pivot columns, i.e. everything after 'Colony' and before the phenotypes
species_columns = symbio_pheno_df.columns[1:len(symbio_pivot_df.columns)]
for sp in tqdm(species_columns):
    sp_present = symbio_pheno_df[sp] > 0
    sp_absent = symbio_pheno_df[sp].isna()
    for y_name in y_variables:
        y_known = ~(symbio_pheno_df[y_name].isna())

        # run continuous method only where species and y variable is available for at least 5 colonies
        f = sp_present & y_known
        if f.sum() >= min_colony_nr:
            X = symbio_pheno_df.loc[f, sp]
            y = symbio_pheno_df.loc[f, y_name]
            for method in con_methods:
                tstat, pvalue = get_pv_con(X, y, method)
                rows.append((sp, y_name, method, tstat, pvalue))

        # run categorical method only where species and y variable is available for at least 5 colonies
        # x: phenotype where the species is present, y: phenotype where it is absent (NaN in the pivot)
        x = symbio_pheno_df.loc[f, y_name].to_numpy()
        y = symbio_pheno_df.loc[sp_absent & y_known, y_name].to_numpy()
        if (len(x) >= min_colony_nr) and (len(y) >= min_colony_nr):
            for method in cat_methods:
                tstat, pvalue = get_pv_cat(x, y, method)
                rows.append((sp, y_name, method, tstat, pvalue))

# rows containing a missing statistic or p-value are discarded
pv_df = pd.DataFrame(rows, columns=columns).dropna().reset_index(drop=True)
pv_df

In [None]:
# Export of the freshly computed p-values (disabled):
# pv_df.to_csv('{}{}_pvalues.csv'.format(tlf_path, str(datetime.now().date())))
# Load the p-value table saved on 2025-05-13; this replaces any pv_df computed in the cell above.
pv_df = pd.read_csv('{}2025-05-13_pvalues.csv'.format(tlf_path), index_col=0)
pv_df

In [None]:
# p-value histogram for every (continuous method, y variable) pair
for method, y_name in product(con_methods, y_variables):
    subset_df = pv_df[(pv_df['Y variable'] == y_name) & (pv_df['Method'] == method)]
    sns.histplot(data=subset_df, x='P-value', bins=100)
    plt.title('{} {}'.format(y_name, method))
    plt.show()

In [None]:
# p-value histogram for every (categorical method, y variable) pair
for method, y_name in product(cat_methods, y_variables):
    subset_df = pv_df[(pv_df['Y variable'] == y_name) & (pv_df['Method'] == method)]
    sns.histplot(data=subset_df, x='P-value', bins=100)
    plt.title('{} {}'.format(y_name, method))
    plt.show()

- (the two non-parametric approaches could also be run, but they are quite slow)
- there seems to be an enrichment of small p-values compared to the shuffled controls
- but this enrichment also exists for other variables where I would not expect any (bee weight, number of open brood, etc.)

#### compare methods and variables

In [None]:
y_name = 'Varroa relative abundance'
method_df = pv_df[(pv_df['Y variable'] == y_name)].pivot(index=['Species'], columns='Method', values='P-value').reset_index()
method_df

In [None]:
x = 'pearson'
y = 'pearson_perm'
sns.scatterplot(data=method_df, x=x, y=y)
method_df[x].corr(method_df[y])

In [None]:
x = 'tt'
y = 'wilcoxon'
sns.scatterplot(data=method_df, x=x, y=y)
method_df[x].corr(method_df[y])

- Pearson and LM p-values are identical
- rank-based p-values are very different
- the two Pearson variants give very similar p-values, but tt and wilcoxon give very different p-values, with more significant p-values for wilcoxon

#### compare variables

In [None]:
method_name = 'pearson'
y_df = pv_df[(pv_df['Method'] == method_name)].pivot(index=['Species'], columns='Y variable', values='P-value').reset_index()
y_df

In [None]:
x = 'Varroa relative abundance'
y = 'v_mito'
sns.scatterplot(data=y_df, x=x, y=y)
y_df[x].corr(y_df[y])

In [None]:
x = 'v_pho'
y = 'v_mito'
sns.scatterplot(data=y_df, x=x, y=y)
y_df[x].corr(y_df[y])

In [None]:
x = 'v_pho'
y = 'Varroa relative abundance'
sns.scatterplot(data=y_df, x=x, y=y)
y_df[x].corr(y_df[y])

In [None]:
method_name = 'tt'
y_df = pv_df[(pv_df['Method'] == method_name)].pivot(index=['Species'], columns='Y variable', values='P-value').reset_index()
y_df

In [None]:
x = 'Varroa relative abundance'
y = 'v_mito'
sns.scatterplot(data=y_df, x=x, y=y)
y_df[x].corr(y_df[y])

In [None]:
x = 'v_pho'
y = 'v_mito'
sns.scatterplot(data=y_df, x=x, y=y)
y_df[x].corr(y_df[y])

In [None]:
x = 'v_pho'
y = 'Varroa relative abundance'
sns.scatterplot(data=y_df, x=x, y=y)
y_df[x].corr(y_df[y])

The variables yield fairly different p-values, but the two genomics-based approaches yield more similar p-values to each other.

--> let's pick the parametric tests then

#### other variables

In [None]:
pv_df[(pv_df['Y variable'] == 'bee_weigth') & (pv_df['Method'] == 'pearson') & (pv_df['Stat'] > 0)].sort_values('P-value')[:20]

In [None]:
pv_df[(pv_df['Y variable'] == 'nbr_open_brood') & (pv_df['Method'] == 'pearson') & (pv_df['Stat'] < 0)].sort_values('P-value')[:20]

### FDR

In [None]:
# Restrict the p-value table to the variable/methods carried forward and add an empty
# 'FDR' column that the next cell fills in.
y_variables = ['Varroa relative abundance']
methods = ['pearson', 'tt']
selected = pv_df['Y variable'].isin(y_variables) & pv_df['Method'].isin(methods)
fdr_df = pv_df[selected].copy()
fdr_df['FDR'] = np.nan

In [None]:
# compute FDR
# Benjamini-Hochberg correction, applied separately to each (y variable, method) family of tests.
for (y_name, method) in product(y_variables, methods):
    f = (fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == method)
    if not f.any():
        continue  # no test was run for this combination
    # Label-based assignment: the previous positional `.iloc[..., 5]` silently targets a
    # different column as soon as another column (e.g. 'Family') is inserted into fdr_df.
    fdr_df.loc[f, 'FDR'] = smm.multipletests(fdr_df.loc[f, 'P-value'].to_numpy(), alpha=0.05, method='fdr_bh')[1]

In [None]:
fdr_df

In [None]:
for (y_name, method) in product(y_variables, methods):
    f = (fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == method)
    print((y_name, method), len(fdr_df[f & (fdr_df['FDR'] < 0.1)]))

In [None]:
# Overlap between the species significant (FDR <= 0.1) with pearson and with the t-test.
plt.figure(figsize=(6, 6))
varroa_hits = fdr_df[(fdr_df['Y variable'] == 'Varroa relative abundance') & (fdr_df['FDR'] <= 0.1)]
set1 = set(varroa_hits[varroa_hits['Method'] == 'pearson']['Species'])
set2 = set(varroa_hits[varroa_hits['Method'] == 'tt']['Species'])
venn2([set1, set2], set_labels=('pearson', 'tt'))
plt.show()

In [None]:
fdr_df[(fdr_df['Y variable'] == 'Varroa relative abundance') & (fdr_df['Method'] == 'tt') & (fdr_df['FDR'] <= 0.1)].sort_values('FDR')['Species']

#### some diagnostics on significant species

In [None]:
# Prevalence of the positively associated t-test hits (FDR <= 0.1) compared with all species.
positive_tt_hits = fdr_df[
    (fdr_df['Y variable'] == 'Varroa relative abundance')
    & (fdr_df['Stat'] > 0)
    & (fdr_df['Method'] == 'tt')
    & (fdr_df['FDR'] <= 0.1)
]['Species']
x = prev_df[prev_df['Taxon'].isin(positive_tt_hits)]['Prevalence']
y = prev_df['Prevalence']
d = {'Varroa-associated species': x, 'All species' : y}
sns.boxplot(d)
plt.yscale('log')

In [None]:
# Names of the ten most significant t-test hits (FDR <= 0.1) for Varroa relative abundance.
# fdr_df stores taxon names in 'Species' (it has no 'Taxon' column), and the set has to be
# built from the names themselves: a Series object is unhashable and cannot be a set element.
d = set(fdr_df[(fdr_df['Y variable'] == 'Varroa relative abundance') & (fdr_df['Method'] == 'tt') & (fdr_df['FDR'] <= 0.1)].sort_values('FDR')[:10]['Species'])

In [None]:
fdr_df[(fdr_df['Y variable'] == 'Varroa relative abundance') & (fdr_df['Method'] == 'tt') & (fdr_df['FDR'] <= 0.1)].sort_values('FDR')

In [None]:
# Prevalence vs. median relative abundance for one group, coloured by family.
group = 'Bacteria'
group_prev_df = prev_df[prev_df['Group'] == group]
fig = px.scatter(
    group_prev_df,
    x='Prevalence',
    y='Median Relative Abundance',
    color='Family',
    hover_data=['Taxon', 'Family'],
    title=group,
)
axis_title_style = {'title': {'font': {'size': 18}}}
fig.update_layout(
    autosize=False,
    width=1400,
    height=1000,
    xaxis=axis_title_style,
    yaxis=axis_title_style,
    legend={'font': {'size': 16}},
)
# fixed-point tick labels with five decimals
fig.update_yaxes(tickformat='.5f')
fig.show()

In [None]:
fdr_df[(fdr_df['Y variable'] == 'Varroa relative abundance') & (fdr_df['Method'] == 'tt') & (fdr_df['FDR'] <= 0.1)].sort_values('FDR')[:20]

In [None]:
sp = 'Vibrio parahaemolyticus'
# sp = 'Providencia sp. PROV236'


In [None]:
symbio_df[symbio_df['Taxon'] == sp]

In [None]:
brack_df[brack_df['Taxon'] == sp]

In [None]:
prev_df[prev_df['Taxon'] == sp]

In [None]:
a, b = np.unique(pheno_df[pheno_df['Colony'].isin(symbio_df[symbio_df['Taxon'] == sp]['Colony'])]['region'], return_counts=True)
sp_region2prop = dict(zip(a, b / sum(b)))

In [None]:
a, b = np.unique(pheno_df['region'], return_counts=True)
region2prop = dict(zip(a, b / sum(b)))

In [None]:
d = {k: (sp_region2prop.get(k, 0.00000000000000001) / v) for k, v in region2prop.items()}
{k: v if v > 1 else None for k, v in d.items()}

In [None]:
pheno_df[pheno_df['Colony'].isin(symbio_df[symbio_df['Taxon'] == sp]['Colony'])].describe()

In [None]:
pheno_df[~pheno_df['Colony'].isin(symbio_df[symbio_df['Taxon'] == sp]['Colony'])].describe()

In [None]:
# LoD-filtered richness of the colonies where `sp` was detected.
# The row mask must come from richness_df_lod itself: a mask built from richness_df belongs
# to a different frame and does not line up with richness_df_lod's rows.
richness_df_lod[richness_df_lod['Colony'].isin(symbio_df[symbio_df['Taxon'] == sp]['Colony'])].describe()

In [None]:
# LoD-filtered richness of the colonies where `sp` was NOT detected.
# The row mask must come from richness_df_lod itself: a mask built from richness_df belongs
# to a different frame and does not line up with richness_df_lod's rows.
richness_df_lod[~richness_df_lod['Colony'].isin(symbio_df[symbio_df['Taxon'] == sp]['Colony'])].describe()

In [None]:
symbio_df[symbio_df['Colony'].isin(symbio_df[symbio_df['Taxon'] == sp]['Colony'])]

In [None]:
# Varroa relative abundance in the colonies where `sp` was detected vs. all other colonies.
has_sp = pheno_df['Colony'].isin(symbio_df[symbio_df['Taxon'] == sp]['Colony'])
x = pheno_df[has_sp]['Varroa relative abundance']
y = pheno_df[~has_sp]['Varroa relative abundance']
d = {'Marbach apiary': x, 'Other apiary' : y}
sns.boxplot(d)
plt.ylabel('Varroa relative abundance')
#plt.yscale('log')

In [None]:
# Varroa relative abundance in region 'provence_alpes_cote_azur' vs. all other regions (log scale)
in_paca = pheno_df['region'] == 'provence_alpes_cote_azur'
x = pheno_df[in_paca]['Varroa relative abundance']
y = pheno_df[~in_paca]['Varroa relative abundance']
d = {'provence_alpes_cote_azur': x, 'Other regions' : y}
sns.boxplot(d)
plt.ylabel('Varroa relative abundance')
plt.yscale('log')

In [None]:
# set1 = set(lm_df[(lm_df['Y variable'] == 'v_pho') & (lm_df['FDR'] <= 0.1)]['Species'])
# set2 = set(lm_df[(lm_df['Y variable'] == 'v_mito') & (lm_df['FDR'] <= 0.1)]['Species'])
# set3 = set(lm_df[(lm_df['Y variable'] == 'Varroa relative abundance') & (lm_df['FDR'] <= 0.1)]['Species'])
# 
# plt.figure(figsize=(6, 6))
# venn3([set1, set2, set3], set_labels=('v_pho', 'v_mito', 'Varroa relative abundance'))
# plt.show()

In [None]:
y_name = 'Varroa relative abundance'
# Regression plot for every pearson hit (FDR <= 0.1), most significant first, using only
# the colonies where the species has a positive abundance.
pearson_hits = fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == 'pearson') & (fdr_df['FDR'] <= 0.1)]
for sp in pearson_hits.sort_values('FDR')['Species']:
    f = symbio_pheno_df[sp] > 0
    detected_df = symbio_pheno_df[f]
    sns.regplot(data=detected_df, x=sp, y=y_name)
    plt.show()

In [None]:
y_name = 'Varroa relative abundance'
# Boxplot for every t-test hit (FDR <= 0.1), most significant first: phenotype in colonies
# where the species is present (> 0) vs. colonies where it is absent (NaN in the pivot).
for sp in fdr_df[(fdr_df['Y variable'] == y_name) & (fdr_df['Method'] == 'tt') & (fdr_df['FDR'] <= 0.1)].sort_values('FDR')['Species']:
    x = symbio_pheno_df[symbio_pheno_df[sp] > 0][y_name].to_numpy()
    y = symbio_pheno_df[symbio_pheno_df[sp].isna()][y_name].to_numpy()
    plt.boxplot([x, y], tick_labels=['Presence (n={})'.format(len(x)), 'Absence (n={})'.format(len(y))])
    plt.ylabel(y_name)
    plt.title(sp)
    # a stray trailing "v" after this call made the whole cell a SyntaxError
    plt.show()

- there are more cases of positive correlation between the presence of a species and varroa load (Spiroplasma genus, Lactobacillus)

- I should probably treat these differently in follow-up analyses (beneficial vs. pathogenic constituents) --> the correlation coefficients for Pearson should be stored...

- look at which bacteria are in the intersection

### Family enrichment

In [None]:
def get_fam_enrich_df(df, fdr_threshold=0.1):
    """Compare observed and expected numbers of significant species per family.

    The foreground is every row of ``df`` with ``FDR < fdr_threshold``; the
    background is every row of ``df``. Rows without a family are ignored in both.
    The expected count of a family is its share of the background multiplied by
    the foreground size.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Family' and 'FDR'.
    fdr_threshold : float, default 0.1
        Rows with an FDR strictly below this value form the foreground.

    Returns
    -------
    pandas.DataFrame
        One row per background family with the columns 'Family',
        'Observed count', 'Expected count' and 'Observed/expected ratio',
        sorted by decreasing ratio. The ratio is NaN wherever the expected
        count is 0 (e.g. when nothing passes the threshold).
    """
    # count foreground families (significant correlation with varroa)
    fg = df.loc[df['FDR'] < fdr_threshold, 'Family'].dropna()
    fg_families, fg_counts = np.unique(fg.to_numpy(), return_counts=True)
    fg_fam2counts = dict(zip(fg_families, fg_counts))

    # and background families
    bg = df['Family'].dropna()
    bg_families, bg_counts = np.unique(bg.to_numpy(), return_counts=True)

    # calculate expected count for each family based on background
    exp_counts = bg_counts / bg_counts.sum() * fg.size
    obs_counts = np.array([fg_fam2counts.get(fam, 0) for fam in bg_families], dtype=int)

    # 0/0 (empty foreground) stays NaN, but without a numpy RuntimeWarning
    ratios = np.divide(
        obs_counts,
        exp_counts,
        out=np.full(exp_counts.shape, np.nan),
        where=exp_counts > 0,
    )

    return pd.DataFrame({
        'Family': bg_families,
        'Observed count': obs_counts,
        'Expected count': exp_counts,
        'Observed/expected ratio': ratios,
    }).sort_values('Observed/expected ratio', ascending=False)

In [None]:
# fdr_df = fdr_df.drop('Family', axis=1)

In [None]:
# Annotate every tested species with its family (looked up through its TaxID).
x = symbio_df[['Taxon', 'TaxID']].drop_duplicates()
sp_name2taxid = dict(zip(x['Taxon'], x['TaxID']))
sp2fam = {sp: get_parent_taxon_at_level(sp_name2taxid[sp], 'family') for sp in set(fdr_df['Species'])}
# DataFrame.insert raises if the column already exists, so drop a previous
# 'Family' column first; this keeps the cell safe to re-run.
if 'Family' in fdr_df.columns:
    fdr_df = fdr_df.drop(columns='Family')
fdr_df.insert(1, 'Family', [sp2fam[sp] for sp in fdr_df['Species']])

In [None]:
# Significant (FDR <= 0.1) Varroa associations split by test and sign of the statistic:
#   a: pearson, Stat > 0    b: tt, Stat > 0    c: pearson, Stat < 0    d: tt, Stat < 0
varroa_hits = fdr_df[(fdr_df['Y variable'] == 'Varroa relative abundance') & (fdr_df['FDR'] <= 0.1)]
pearson_hits = varroa_hits[varroa_hits['Method'] == 'pearson']
tt_hits = varroa_hits[varroa_hits['Method'] == 'tt']
a = pearson_hits[pearson_hits['Stat'] > 0]
b = tt_hits[tt_hits['Stat'] > 0]
c = pearson_hits[pearson_hits['Stat'] < 0]
d = tt_hits[tt_hits['Stat'] < 0]

In [None]:
# Inspect one of the hit tables (a, b, c or d), most significant species first.
x = d

print(x.shape)
x[['Species', 'Family', 'FDR']].sort_values('FDR')


In [None]:
# Number of significant species in each of the four hit tables.
for hits in (a, b, c, d):
    print(len(hits))

In [None]:
# Family enrichment per test ('pearson' = continuous, 'tt' = categorical) and per sign of the statistic.
varroa_fdr_df = fdr_df[fdr_df['Y variable'] == 'Varroa relative abundance']
pearson_df = varroa_fdr_df[varroa_fdr_df['Method'] == 'pearson']
tt_df = varroa_fdr_df[varroa_fdr_df['Method'] == 'tt']

pos_con_fam_df = get_fam_enrich_df(pearson_df[pearson_df['Stat'] > 0])
pos_cat_fam_df = get_fam_enrich_df(tt_df[tt_df['Stat'] > 0])
neg_con_fam_df = get_fam_enrich_df(pearson_df[pearson_df['Stat'] < 0])
neg_cat_fam_df = get_fam_enrich_df(tt_df[tt_df['Stat'] < 0])

In [None]:
# Families enriched among species whose abundance correlates positively with Varroa.
# (This table was computed but never displayed; neg_con_fam_df is shown further down.)
pos_con_fam_df[:10]

In [None]:
# Top families among species whose presence is negatively associated with Varroa.
neg_cat_fam_df.head(20)

In [None]:
# Top families among species whose presence is positively associated with Varroa.
pos_cat_fam_df.head(20)

In [None]:
# Top families among species whose abundance correlates negatively with Varroa.
neg_con_fam_df.head(10)

In [None]:
# Top families among species whose presence is negatively associated with Varroa.
neg_cat_fam_df.head(20)

- families whose presence correlates positively with Varroa: Morganellaceae (15 species) --> opportunistic pathogens; could be a symptom of dysbiosis
- families whose abundance correlates negatively with Varroa: gut bacteria families
- families whose presence correlates negatively with Varroa: Spiroplasmataceae

non-bacteria observations:
- Bifidobacterium phage BadAztec4 (and other variants): positive correlation with Varroa (both presence and abundance) --> Engel study on it
- one variant whose abundance and presence correlate negatively with Varroa: Bifidobacterium phage BigBern1

Now, zooming in on these Varroa-associated symbiosphere constituents, the question is: do the constituents associated with high Varroa load co-occur in colonies with high Varroa load, and do the ones associated with low Varroa abundance co-occur in colonies with low Varroa abundance? That would indicate healthy vs. dysbiotic communities.

- hierarchical clustering on Jaccard distances for presence/absence
- same with Bray-Curtis distance for abundances

### Clustering and PCA

#### Jaccard

In [None]:
# Colony x species presence/absence table, restricted to the species whose
# presence is significantly (tt, FDR < 0.1) associated with Varroa.
tt_species = set(fdr_df[(fdr_df['Y variable'] == 'Varroa relative abundance') & (fdr_df['Method'] == 'tt') & (fdr_df['FDR'] < 0.1)]['Species'])
jaccard_df = (
    symbio_df[symbio_df['Taxon'].isin(tt_species)]
    .pivot(index='Colony', columns='Taxon', values='Relative abundance')
    .map(lambda x: 1 if x > 0 else x)  # any positive abundance counts as present
    .fillna(0)                         # species missing from a colony counts as absent
    .reset_index()
)
print(jaccard_df.shape)
jaccard_df

In [None]:
# add metadata and index both tables by colony
# Safe to re-run: 'Colony' is moved back to a column first if a previous run
# of this cell already turned it into the index.
if jaccard_df.index.name == 'Colony':
    jaccard_df = jaccard_df.reset_index()
pheno_jaccard_df = pd.merge(pheno_df, jaccard_df, on='Colony').set_index('Colony')
jaccard_df = jaccard_df.set_index('Colony')

In [None]:
# Per-colony colour annotations for the clustermap rows.
color_dict = {}
discrete_vars = ['group', 'year', 'month', 'region']
continous_vars = ['Varroa relative abundance']

# Discrete metadata: one 'colorblind' palette colour (upper-case hex) per distinct level.
for var in discrete_vars:
    values = pheno_jaccard_df[var]
    levels = values.unique()
    palette = sns.color_palette('colorblind', n_colors=len(levels)).as_hex()
    level2color = {level: color.upper() for level, color in zip(levels, palette)}
    color_dict[var] = values.map(level2color)

# Continuous metadata: rescale linearly to [0, 1], then look the colour up in viridis.
cmap = plt.get_cmap('viridis')
for var in continous_vars:
    values = pheno_jaccard_df[var]
    normalized_values = np.interp(values, (min(values), max(values)), (0, 1))
    hex_colors = [mcolors.to_hex(cmap(val)) for val in normalized_values]
    color_dict[var] = hex_colors

row_colors = pd.DataFrame(color_dict)

In [None]:
# Display the per-colony colour annotation table (one column per metadata variable).
row_colors

In [None]:
from matplotlib.patches import Patch


In [None]:
# Hierarchical clustering of colonies and species on Jaccard distances.
# clustermap creates its own figure, so the size has to be passed to it directly;
# a preceding plt.figure(...) would only produce an extra empty figure.
cg = sns.clustermap(jaccard_df, metric='jaccard', row_colors=row_colors, figsize=(20, 16))
cg.cax.set_visible(False)  # hide the colour bar


# cg.ax_row_dendrogram.legend(handles, unique_labels, loc='center', bbox_to_anchor=(1.2, 0.5), title="Row Groups")
#handles = [Patch(facecolor=lut[name]) for name in lut]
#plt.legend(handles, lut, title='Species',
#           bbox_to_anchor=(1, 1), bbox_transform=plt.gcf().transFigure, loc='upper right')

plt.show()

In [None]:
# Imported in this cell because the notebook's own import of these names only
# happens further down, in the "Symbiosphere communities" section.
from scipy.spatial.distance import pdist, squareform
from skbio import DistanceMatrix
from skbio.stats.ordination import pcoa

# Pairwise Jaccard distances between colonies, then principal coordinates analysis.
jaccard_dm = DistanceMatrix(squareform(pdist(jaccard_df, metric='jaccard')), ids=jaccard_df.index)

pcoa_results = pcoa(jaccard_dm)

# share of variation captured by the first 10 axes
sns.barplot(pcoa_results.proportion_explained[:10])

In [None]:
# Preview of the first PCoA coordinates per colony.
pc_nr = 5  # set here too: this cell runs before the cell that assigns pc_nr
pd.DataFrame(pcoa_results.samples.values[:, :pc_nr], columns=['PC{}'.format(i + 1) for i in range(pc_nr)], index=jaccard_dm.ids).reset_index().rename(columns={'index':'Colony'})

In [None]:
# Attach the first pc_nr PCoA coordinates to the colony metadata.
pc_nr = 5
pc_names = ['PC{}'.format(i + 1) for i in range(pc_nr)]
pcoa_coords_df = pd.DataFrame(pcoa_results.samples.values[:, :pc_nr], columns=pc_names, index=jaccard_dm.ids)
pca_df = pd.merge(
    pheno_df,
    pcoa_coords_df.reset_index().rename(columns={'index': 'Colony'}),
    on='Colony',
)

In [None]:
# Long format: one row per (colony, PC) pair with every metadata column kept as id.
# The PC columns are selected by name from pc_nr rather than by a hard-coded count of 5.
pc_cols = ['PC{}'.format(i + 1) for i in range(pc_nr)]
pca_melt_df = pd.melt(
    pca_df,
    id_vars=[col for col in pca_df.columns if col not in pc_cols],
    value_vars=pc_cols,
    var_name='PC',
    value_name='PC_value',
)

In [None]:
# Display the long-format table of PC values per colony.
pca_melt_df

In [None]:
# PCoA scatter (PC1 vs PC2) coloured by 'group'.
sns.set_style('whitegrid')
x = 'PC1'
y = 'PC2'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=pca_df, x=x, y=y, hue='group', ax=ax)
ax.set_xlabel(x, fontsize=14)
ax.set_ylabel(y, fontsize=14)
ax.tick_params(axis='both', labelsize=14)
# one legend call: a second call would rebuild the legend and drop the font size
ax.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
# PCoA scatter (PC1 vs PC2) coloured by 'year'.
sns.set_style('whitegrid')
x = 'PC1'
y = 'PC2'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=pca_df, x=x, y=y, hue='year', ax=ax)
ax.set_xlabel(x, fontsize=14)
ax.set_ylabel(y, fontsize=14)
ax.tick_params(axis='both', labelsize=14)
# one legend call: a second call would rebuild the legend and drop the font size
ax.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
# PCoA scatter (PC1 vs PC2) coloured by 'month'.
sns.set_style('whitegrid')
x = 'PC1'
y = 'PC2'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=pca_df, x=x, y=y, hue='month', ax=ax)
ax.set_xlabel(x, fontsize=14)
ax.set_ylabel(y, fontsize=14)
ax.tick_params(axis='both', labelsize=14)
# one legend call: a second call would rebuild the legend and drop the font size
ax.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
# for v in discrete_vars:
#     g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
#     g.map(sns.boxplot,'PC_value',v)
#     g.set_titles("{col_name}")

In [None]:
# One regression panel per principal coordinate for every continuous covariate.
facet_kwargs = dict(col='PC', col_wrap=5, height=3, aspect=1)
for v in continous_vars:
    g = sns.FacetGrid(pca_melt_df, **facet_kwargs)
    g.map(sns.regplot, 'PC_value', v, scatter_kws={'s': 4})
    g.set_titles("{col_name}")

#### Bray-Curtis

In [None]:
# Colony x species abundance table, restricted to the species whose abundance
# is significantly (pearson, FDR < 0.1) associated with Varroa.
pearson_species = set(fdr_df[(fdr_df['Y variable'] == 'Varroa relative abundance') & (fdr_df['Method'] == 'pearson') & (fdr_df['FDR'] < 0.1)]['Species'])
braycurtis_df = (
    symbio_df[symbio_df['Taxon'].isin(pearson_species)]
    .pivot(index='Colony', columns='Taxon', values='Relative abundance')
    .fillna(0)  # species missing from a colony gets abundance 0
    .reset_index()
)
print(braycurtis_df.shape)
braycurtis_df

In [None]:
# add metadata and index both tables by colony
# Safe to re-run: 'Colony' is moved back to a column first if a previous run
# of this cell already turned it into the index.
if braycurtis_df.index.name == 'Colony':
    braycurtis_df = braycurtis_df.reset_index()
pheno_braycurtis_df = pd.merge(pheno_df, braycurtis_df, on='Colony').set_index('Colony')
braycurtis_df = braycurtis_df.set_index('Colony')

In [None]:
# Per-colony colour annotations for the clustermap rows.
color_dict = {}
discrete_vars = ['group', 'year', 'month']
continous_vars = ['Varroa relative abundance']

# Discrete metadata: one 'husl' palette colour (upper-case hex) per distinct level.
for var in discrete_vars:
    values = pheno_braycurtis_df[var]
    levels = values.unique()
    palette = sns.color_palette('husl', n_colors=len(levels)).as_hex()
    level2color = {level: color.upper() for level, color in zip(levels, palette)}
    color_dict[var] = values.map(level2color)

# Continuous metadata: rescale linearly to [0, 1], then look the colour up in viridis.
cmap = plt.get_cmap('viridis')
for var in continous_vars:
    values = pheno_braycurtis_df[var]
    normalized_values = np.interp(values, (min(values), max(values)), (0, 1))
    hex_colors = [mcolors.to_hex(cmap(val)) for val in normalized_values]
    color_dict[var] = hex_colors

row_colors = pd.DataFrame(color_dict)

In [None]:
from scipy.stats import zscore

In [None]:
# trying to give same weight to all species...
# Bray-Curtis is only meaningful for non-negative abundances, and z-scores are
# centred on 0 (negative for every below-average value). Each species is
# therefore divided by its maximum instead: values stay in [0, 1], zeros stay
# zero, and the Colony index is preserved. Re-running the cell is a no-op.
species_max = braycurtis_df.max(axis=0)
braycurtis_df = braycurtis_df.div(species_max.where(species_max > 0, 1), axis=1)

In [None]:
# Hierarchical clustering of colonies and species on Bray-Curtis distances.
# clustermap creates its own figure, so the size has to be passed to it directly;
# a preceding plt.figure(...) would only produce an extra empty figure.
cg = sns.clustermap(braycurtis_df, metric='braycurtis', row_colors=row_colors, figsize=(20, 16))
cg.cax.set_visible(False)  # hide the colour bar
plt.show()

In [None]:
# Imported in this cell because the notebook's own import of these names only
# happens further down, in the "Symbiosphere communities" section.
from scipy.spatial.distance import pdist, squareform
from skbio import DistanceMatrix
from skbio.stats.ordination import pcoa

# Pairwise Bray-Curtis distances between colonies, then principal coordinates analysis.
braycurtis_dm = DistanceMatrix(squareform(pdist(braycurtis_df, metric='braycurtis')), ids=braycurtis_df.index)

pcoa_results = pcoa(braycurtis_dm)

# share of variation captured by the first 10 axes
sns.barplot(pcoa_results.proportion_explained[:10])

In [None]:
# Attach the first pc_nr PCoA coordinates to the colony metadata.
pc_nr = 5
pc_names = ['PC{}'.format(i + 1) for i in range(pc_nr)]
pcoa_coords_df = pd.DataFrame(pcoa_results.samples.values[:, :pc_nr], columns=pc_names, index=braycurtis_dm.ids)
pca_df = pd.merge(
    pheno_df,
    pcoa_coords_df.reset_index().rename(columns={'index': 'Colony'}),
    on='Colony',
)

In [None]:
# Long format: one row per (colony, PC) pair with every metadata column kept as id.
# The PC columns are selected by name from pc_nr rather than by a hard-coded count of 5.
pc_cols = ['PC{}'.format(i + 1) for i in range(pc_nr)]
pca_melt_df = pd.melt(
    pca_df,
    id_vars=[col for col in pca_df.columns if col not in pc_cols],
    value_vars=pc_cols,
    var_name='PC',
    value_name='PC_value',
)

In [None]:
# Display the long-format table of PC values per colony.
pca_melt_df

In [None]:
# PCoA scatter (PC1 vs PC2) coloured by 'group'.
sns.set_style('whitegrid')
x = 'PC1'
y = 'PC2'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=pca_df, x=x, y=y, hue='group', ax=ax)
ax.set_xlabel(x, fontsize=14)
ax.set_ylabel(y, fontsize=14)
ax.tick_params(axis='both', labelsize=14)
# one legend call: a second call would rebuild the legend and drop the font size
ax.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
# PCoA scatter (PC1 vs PC2) coloured by 'Varroa relative abundance'.
sns.set_style('whitegrid')
x = 'PC1'
y = 'PC2'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=pca_df, x=x, y=y, hue='Varroa relative abundance', ax=ax)
ax.set_xlabel(x, fontsize=14)
ax.set_ylabel(y, fontsize=14)
ax.tick_params(axis='both', labelsize=14)
# one legend call: a second call would rebuild the legend and drop the font size
ax.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
# PCoA scatter (PC1 vs PC2) coloured by 'year'.
sns.set_style('whitegrid')
x = 'PC1'
y = 'PC2'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=pca_df, x=x, y=y, hue='year', ax=ax)
ax.set_xlabel(x, fontsize=14)
ax.set_ylabel(y, fontsize=14)
ax.tick_params(axis='both', labelsize=14)
# one legend call: a second call would rebuild the legend and drop the font size
ax.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
# PCoA scatter (PC1 vs PC2) coloured by 'month'.
sns.set_style('whitegrid')
x = 'PC1'
y = 'PC2'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=pca_df, x=x, y=y, hue='month', ax=ax)
ax.set_xlabel(x, fontsize=14)
ax.set_ylabel(y, fontsize=14)
ax.tick_params(axis='both', labelsize=14)
# one legend call: a second call would rebuild the legend and drop the font size
ax.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
# Display the list of continuous covariates used for the regression panels.
continous_vars

In [None]:
# for v in discrete_vars:
#     g = sns.FacetGrid(pca_melt_df, col='PC', col_wrap=5, height=3, aspect=1)
#     g.map(sns.boxplot,'PC_value',v)
#     g.set_titles("{col_name}")

In [None]:
# One regression panel per principal coordinate for every continuous covariate.
facet_kwargs = dict(col='PC', col_wrap=5, height=3, aspect=1)
for v in continous_vars:
    g = sns.FacetGrid(pca_melt_df, **facet_kwargs)
    g.map(sns.regplot, 'PC_value', v, scatter_kws={'s': 4})
    g.set_titles("{col_name}")

## Symbiosphere communities

In [None]:
from skbio.stats.ordination import pcoa
from skbio import DistanceMatrix
from scipy.spatial.distance import pdist, squareform
from skbio.diversity import beta_diversity


### Presence/absence Jaccard

In [None]:
# Colony x taxon presence/absence table over all taxa in symbio_df.
jaccard_df = (
    symbio_df
    .pivot(index='Colony', columns='Taxon', values='Relative abundance')
    .map(lambda x: 1 if x > 0 else x)  # any positive abundance counts as present
    .fillna(0)                         # taxon missing from a colony counts as absent
)
print(jaccard_df.shape)

# Pairwise Jaccard distances between colonies, then principal coordinates analysis.
colony_distances = squareform(pdist(jaccard_df, metric='jaccard'))
jaccard_dm = DistanceMatrix(colony_distances, ids=jaccard_df.index)

pcoa_results = pcoa(jaccard_dm)

# share of variation captured by the first 10 axes
sns.barplot(pcoa_results.proportion_explained[:10])

In [None]:
# jaccard_df.to_csv('{}{}_BeeStrong_species_presences_absences.csv'.format(tlf_path, str(datetime.now().date())), index=True)

# pd.read_csv('{}{}_BeeStrong_species_presences_absences.csv'.format(tlf_path, str(datetime.now().date())))

In [None]:
# Attach the first pc_nr PCoA coordinates to the colony metadata, then build the
# long-format table (one row per colony and PC).
pc_nr = 5
pc_cols = ['PC{}'.format(i + 1) for i in range(pc_nr)]
pca_df = pd.merge(pheno_df, pd.DataFrame(pcoa_results.samples.values[:, :pc_nr], columns=pc_cols, index=jaccard_dm.ids).reset_index().rename(columns={'index':'Colony'}), on='Colony')

# PC columns are selected by name from pc_nr rather than by a hard-coded count of 5
pca_melt_df = pd.melt(
    pca_df,
    id_vars=[col for col in pca_df.columns if col not in pc_cols],
    value_vars=pc_cols,
    var_name='PC',
    value_name='PC_value',
)

In [None]:
# Display the colony metadata with the PCoA coordinates attached.
pca_df

In [None]:
# Colony identifiers BS18_0161 ... BS18_0168.
list(map('BS18_0{}'.format, range(161, 169)))

In [None]:
# Set of colony identifiers BS18_0161 ... BS18_0168 (flagged as 'Marbach' below).
marbach = {'BS18_0{}'.format(colony_nr) for colony_nr in range(161, 169)}

In [None]:
# 1 for colonies listed in `marbach`, 0 for all others.
pca_df['Marbach'] = [int(colony in marbach) for colony in pca_df['Colony']]

In [None]:
# PCoA scatter (PC1 vs PC2) coloured by the 'Marbach' flag.
sns.set_style('whitegrid')
x = 'PC1'
y = 'PC2'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=pca_df, x=x, y=y, hue='Marbach', ax=ax)
ax.set_xlabel(x, fontsize=14)
ax.set_ylabel(y, fontsize=14)
ax.tick_params(axis='both', labelsize=14)
# one legend call: a second call would rebuild the legend and drop the font size
ax.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
# Distribution of each principal coordinate across the levels of every discrete covariate.
variables = ['region', 'group', 'year', 'month']
facet_kwargs = dict(col='PC', col_wrap=5, height=3, aspect=1)
for v in variables:
    g = sns.FacetGrid(pca_melt_df, **facet_kwargs)
    g.map(sns.boxplot, v, 'PC_value')
    g.set_titles("{col_name}")

In [None]:
# Regression of every continuous covariate on each principal coordinate.
variables = [
    # 'group',
    # 'year',
    # 'month',
    'percent_Ligustica_Carnica', 'percent_Mellifera', 'percent_Caucasica',
    'v_pho', 'v_mito', 'pca1',
    'recap_inf', 'logit_recap_inf',
    'percent_mnr', 'raw_mnr', 'eb_mmr',
    'Varroa relative abundance',
]

facet_kwargs = dict(col='PC', col_wrap=5, height=3, aspect=1)
for v in variables:
    g = sns.FacetGrid(pca_melt_df, **facet_kwargs)
    g.map(sns.regplot, 'PC_value', v, scatter_kws={'s': 4})
    g.set_titles("{col_name}")

### Abundance Bray-Curtis

In [None]:
# Colony x taxon relative-abundance table over all taxa in symbio_df (missing = 0).
abundance_wide = symbio_df.pivot(index='Colony', columns='Taxon', values='Relative abundance')
braycurtis_df = abundance_wide.fillna(0)
print(braycurtis_df.shape)

# Bray-Curtis distances between colonies, then principal coordinates analysis.
braycurtis_dm = beta_diversity('braycurtis', braycurtis_df)

pcoa_results = pcoa(braycurtis_dm)

# share of variation captured by the first 10 axes
sns.barplot(pcoa_results.proportion_explained[:10])

In [None]:
# Attach the first pc_nr PCoA coordinates to the colony metadata, then build the
# long-format table (one row per colony and PC).
pc_nr = 5
pc_cols = ['PC{}'.format(i + 1) for i in range(pc_nr)]
pca_df = pd.merge(pheno_df, pd.DataFrame(pcoa_results.samples.values[:, :pc_nr], columns=pc_cols, index=braycurtis_dm.ids).reset_index().rename(columns={'index':'Colony'}), on='Colony')

# PC columns are selected by name from pc_nr rather than by a hard-coded count of 5
pca_melt_df = pd.melt(
    pca_df,
    id_vars=[col for col in pca_df.columns if col not in pc_cols],
    value_vars=pc_cols,
    var_name='PC',
    value_name='PC_value',
)


In [None]:
# PCoA scatter (PC1 vs PC3) coloured by 'region'.
sns.set_style('whitegrid')
x = 'PC1'
y = 'PC3'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=pca_df, x=x, y=y, hue='region', ax=ax)
ax.set_xlabel(x, fontsize=14)
ax.set_ylabel(y, fontsize=14)
ax.tick_params(axis='both', labelsize=14)
# one legend call: a second call would rebuild the legend and drop the font size
ax.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
# PCoA scatter (PC1 vs PC2) coloured by 'month'.
sns.set_style('whitegrid')
x = 'PC1'
y = 'PC2'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=pca_df, x=x, y=y, hue='month', ax=ax)
ax.set_xlabel(x, fontsize=14)
ax.set_ylabel(y, fontsize=14)
ax.tick_params(axis='both', labelsize=14)
# one legend call: a second call would rebuild the legend and drop the font size
ax.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
# Distribution of each principal coordinate across the levels of every discrete covariate.
variables = ['region', 'group', 'month', 'year']
facet_kwargs = dict(col='PC', col_wrap=5, height=3, aspect=1)
for v in variables:
    g = sns.FacetGrid(pca_melt_df, **facet_kwargs)
    g.map(sns.boxplot, v, 'PC_value')
    g.set_titles("{col_name}")

In [None]:
# Regression of every continuous covariate on each principal coordinate.
variables = [
    'percent_Ligustica_Carnica', 'percent_Mellifera', 'percent_Caucasica',
    'v_pho', 'v_mito', 'pca1',
    'recap_inf', 'logit_recap_inf',
    'percent_mnr', 'raw_mnr', 'eb_mmr',
    'Varroa relative abundance',
]

facet_kwargs = dict(col='PC', col_wrap=5, height=3, aspect=1)
for v in variables:
    g = sns.FacetGrid(pca_melt_df, **facet_kwargs)
    g.map(sns.regplot, 'PC_value', v, scatter_kws={'s': 4})
    g.set_titles("{col_name}")

### Core Bray-Curtis

In [None]:
# Core taxa: viruses, bacteria and fungi with a prevalence of at least 0.5.
is_core_taxon = (prev_df['Prevalence'] >= 0.5) & prev_df['Group'].isin({'Viruses', 'Bacteria', 'Fungi'})
print(set(prev_df.loc[is_core_taxon, 'Taxon']))

In [None]:
# Row mask over brack_df: True where the taxon belongs to the core set
# (viruses, bacteria or fungi with a prevalence of at least 0.5).
core_rows = prev_df[(prev_df['Prevalence'] >= 0.5) & (prev_df['Group'].isin({'Viruses', 'Bacteria', 'Fungi'}))]
f = brack_df['Taxon'].isin(set(core_rows['Taxon']))

In [None]:
# Keep only core-taxon rows (mask f) whose relative abundance reaches `lod`.
# NOTE(review): this rebinds symbio_df, which the earlier sections also read; any
# earlier cell re-run after this one silently works on the core subset only.
# Consider a dedicated name (e.g. core_symbio_df) here and in the cell that pivots it.
symbio_df = brack_df[f & (brack_df['Relative abundance'] >= lod)]
symbio_df

In [None]:
# Colony x taxon relative-abundance table for the current symbio_df (missing = 0).
abundance_wide = symbio_df.pivot(index='Colony', columns='Taxon', values='Relative abundance')
braycurtis_df = abundance_wide.fillna(0)
print(braycurtis_df.shape)

# Bray-Curtis distances between colonies, then principal coordinates analysis.
braycurtis_dm = beta_diversity('braycurtis', braycurtis_df)

pcoa_results = pcoa(braycurtis_dm)

# share of variation captured by the first 10 axes
sns.barplot(pcoa_results.proportion_explained[:10])

In [None]:
# Attach the first pc_nr PCoA coordinates to the colony metadata (joined on the
# index of both tables), then build the long-format table.
pc_nr = 5
pc_cols = ['PC{}'.format(i + 1) for i in range(pc_nr)]
pca_df = pd.merge(pheno_df, pd.DataFrame(pcoa_results.samples.values[:, :pc_nr], columns=pc_cols, index=braycurtis_dm.ids), left_index=True, right_index=True)

# PC columns are selected by name from pc_nr rather than by a hard-coded count of 5
pca_melt_df = pd.melt(
    pca_df,
    id_vars=[col for col in pca_df.columns if col not in pc_cols],
    value_vars=pc_cols,
    var_name='PC',
    value_name='PC_value',
)


In [None]:
# PCoA scatter (PC1 vs PC2) coloured by 'group'.
sns.set_style('whitegrid')
x = 'PC1'
y = 'PC2'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=pca_df, x=x, y=y, hue='group', ax=ax)
ax.set_xlabel(x, fontsize=14)
ax.set_ylabel(y, fontsize=14)
ax.tick_params(axis='both', labelsize=14)
# one legend call: a second call would rebuild the legend and drop the font size
ax.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
# PCoA scatter (PC1 vs PC3) coloured by 'group'.
sns.set_style('whitegrid')
x = 'PC1'
y = 'PC3'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=pca_df, x=x, y=y, hue='group', ax=ax)
ax.set_xlabel(x, fontsize=14)
ax.set_ylabel(y, fontsize=14)
ax.tick_params(axis='both', labelsize=14)
# one legend call: a second call would rebuild the legend and drop the font size
ax.legend(fontsize=14, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
# Horizontal box plots of each principal coordinate per level of 'group'.
variables = ['group']
facet_kwargs = dict(col='PC', col_wrap=5, height=3, aspect=1)
for v in variables:
    g = sns.FacetGrid(pca_melt_df, **facet_kwargs)
    g.map(sns.boxplot, 'PC_value', v)
    g.set_titles("{col_name}")

In [None]:
# Regression of every continuous covariate on each principal coordinate.
variables = [
    'percent_Ligustica_Carnica', 'percent_Mellifera', 'percent_Caucasica',
    'v_pho', 'v_mito', 'pca1',
    'recap_inf', 'logit_recap_inf',
    'percent_mnr', 'raw_mnr', 'eb_mmr',
]

facet_kwargs = dict(col='PC', col_wrap=5, height=3, aspect=1)
for v in variables:
    g = sns.FacetGrid(pca_melt_df, **facet_kwargs)
    g.map(sns.regplot, 'PC_value', v, scatter_kws={'s': 4})
    g.set_titles("{col_name}")

# Graveyard

In [None]:
def write_bowtie2_script_nodescratch(script_fn, array_str):
    """Write a SLURM array-job script that runs Bowtie2 on node-local scratch.

    The script template below is written to ``script_fn`` after replacing the
    placeholder ``ARRAY_STR`` with ``array_str`` (the value of ``#SBATCH --array``).

    The generated script expects five positional arguments: the Bowtie2 index
    directory, the index name, the FASTQ directory, the output directory and a
    file with one sample id per line (the line number is the array task id).
    Per task it copies the index and the paired FASTQ files to a scratch
    directory, aligns them with Bowtie2, converts the SAM output to BAM,
    gzips the unmapped reads, moves the results to the output directory and
    removes the scratch directory.

    Parameters
    ----------
    script_fn : str
        Path of the script file to create (overwritten if it exists).
    array_str : str
        Text substituted for ``ARRAY_STR`` in the ``--array`` directive.

    Notes
    -----
    NOTE(review): the job requests ``--ntasks=24`` while bowtie2 is run with
    ``-p 8``, and the scratch index directory name ``bee_bt2idx/`` is
    hard-coded rather than derived from the index path argument - confirm
    both are intended.
    """
    runstr="""#!/bin/bash -l
#SBATCH --array=ARRAY_STR
#SBATCH --nodes=1
#SBATCH --ntasks=24
#SBATCH --nodelist=node04
#SBATCH --mem=30g
#SBATCH --time=04:00:00
#SBATCH --job-name=bowtie2
#SBATCH --output=%x_%A_%a.out
#SBATCH --error=%x_%A_%a.err

modulesld
ebld
module use /software/anaconda3/envs/eb/easybuild/modules/all
ml Bowtie2
ml SAMtools

idx_path=$1
idx_name=$2
fastq_path=$3
bowtie2_path=$4
bs_ids_file=$5

# work on scratch node is better when lots of I/O operations
node_scratch=/scratch/${USER}/tmp_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}/
mkdir -p $node_scratch

# let's see if it matters
cp -r $idx_path $node_scratch
bt_idx_path=${node_scratch}bee_bt2idx/

# getting bs_id
bs_id=$(sed -n ${SLURM_ARRAY_TASK_ID}'{p;q}' ${bs_ids_file})

fastq_1=$fastq_path${bs_id}_1.fastq.gz
fastq_2=$fastq_path${bs_id}_2.fastq.gz
ls $fastq_1
ls $fastq_2

echo "1. copy fastq.gz to node scratch"
cp $fastq_1 $node_scratch
cp $fastq_2 $node_scratch
ls -lh $node_scratch

# not sure if I need to copy also the index...

echo "2. bowtie2"
unmapped_prefix=${node_scratch}${bs_id}_${idx_name}_unmapped
mapped_prefix=${node_scratch}${bs_id}_${idx_name}_mapped
bowtie2 -x  ${bt_idx_path}${idx_name} -p 8 -1 $node_scratch${bs_id}_1.fastq.gz -2 $node_scratch${bs_id}_2.fastq.gz --un-conc ${unmapped_prefix}.fastq -S ${mapped_prefix}.sam > ${bowtie2_path}${bs_id}_${idx_name}.out 2>&1

echo "3. SAM to BAM"
samtools view -S -b ${mapped_prefix}.sam > ${mapped_prefix}.bam

echo "4. compress"
gzip -f ${unmapped_prefix}.1.fastq 
gzip -f ${unmapped_prefix}.2.fastq

echo "5. copy to home and clean"
mv -f ${unmapped_prefix}.1.fastq.gz ${bowtie2_path}
mv -f ${unmapped_prefix}.2.fastq.gz ${bowtie2_path}
mv -f ${mapped_prefix}.bam ${bowtie2_path}
rm -rf $node_scratch

echo DONE""".replace(
    'ARRAY_STR', array_str)
    # write the filled-in template to disk
    with open(script_fn, 'w') as outf:
        outf.write(runstr)