In [1]:
import os
import re
from copy import deepcopy
from pathlib import Path

import pandas as pd
from Bio.Seq import Seq
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests

In [2]:
# load data
# Directory holding the input files. It defaults to the original location but
# can be redirected by setting the POPNET_DATA_DIR environment variable, so the
# notebook also runs on machines where the data lives elsewhere.
data_path = Path(os.environ.get('POPNET_DATA_DIR', '/mnt/d/data/popnet_paper'))
# Genes to analyse; the analysis cell reads '<gene>.csv' from data_path for each.
data_sets = ['purE']

In [3]:
def infer_wt(col):
    """Infer the wild-type and main mutant call at one position.

    col is a column with the genotype calls of each sample at a given position.
    Calls equal to '.' (presumably a missing/no-call placeholder -- TODO confirm)
    are removed before counting, so the placeholder can never be reported as an
    allele.

    Returns
    -------
    tuple
        (wt, mut): the most frequent and the second most frequent remaining
        call. mut is None when only one distinct call remains.

    Raises
    ------
    IndexError
        If no call remains after removing the dots.
    """
    # we don't count dots
    calls = col[col != '.']
    counts = calls.value_counts()
    # the results of value_counts are sorted in descending order by default
    wt = counts.index[0]
    # a position with a single observed call has no mutant to report
    mut = counts.index[1] if len(counts) > 1 else None
    # returns WT, Mutant
    return wt, mut

def get_mutation(seq, pos, wt, mut, reverse=False):
    """Translate the codon containing a substitution, before and after the change.

    Parameters
    ----------
    seq : sliceable sequence (a str in the visible usage pattern)
        Sequence in which codons are taken in frame from index 0.
    pos : int
        Position of the substituted base; shifted by -1 by default, or by +1
        when ``reverse`` is true.
        NOTE(review): the +1 shift for ``reverse`` is kept exactly as written;
        the coordinate convention it serves is not visible here -- confirm.
    wt, mut : str
        Expected base at the position and the base that replaces it.
    reverse : bool
        Selects which of the two shifts is applied to ``pos``.

    Returns
    -------
    tuple
        (wild-type translation, mutant translation) of the affected codon.

    Raises
    ------
    ValueError
        If the base found at the shifted position differs from ``wt``.
    """
    # adjust for zero index
    idx = pos + 1 if reverse else pos - 1

    if seq[idx] != wt:
        raise ValueError("WT sequences don't match")

    mutated = seq[:idx] + mut + seq[idx + 1:]

    # codon boundaries assume the reading frame starts at index 0 of seq
    codon_start = idx - idx % 3
    codon_end = codon_start + 3

    wt_protein = Seq(seq[codon_start:codon_end]).translate()
    mut_protein = Seq(mutated[codon_start:codon_end]).translate()
    return wt_protein, mut_protein


In [5]:
# Per-position association test. Each position is tested with Welch's two-sample
# t-test (ttest_ind with equal_var=False), not a one-way ANOVA: the metadata value
# of samples carrying the inferred WT call is compared with that of all others.
results = []

# metadata values, one per line; parsed through float so e.g. "1.0" becomes 1
# (presumably one value per sample row of the gene tables -- TODO confirm)
with open(data_path / 'neis_meta.tsv', 'r') as f:
    meta_lines = f.read().splitlines()
meta = [int(float(x)) for x in meta_lines]

# genotype columns are the ones whose name starts with digits (a position)
position_pattern = re.compile(r'^[0-9]+')

for gene in data_sets:
    df = pd.read_csv(data_path / f'{gene}.csv')
    df['meta'] = meta
    y_col = 'meta'
    x_cols = [col for col in df.columns if position_pattern.match(col)]

    for x_col in x_cols:
        calls = df[x_col]
        # the most frequent call is taken as WT; every other call is pooled into
        # the mutant group, `mut` is only the most common of them.
        # NOTE(review): the pooled group contains every value different from wt,
        # including any placeholder calls -- confirm this is intended.
        wt, mut = infer_wt(calls)
        wt_sample = df.loc[calls == wt, y_col]
        mut_sample = df.loc[calls != wt, y_col]

        _, p_val = ttest_ind(
            wt_sample,
            mut_sample,
            equal_var=False,
            nan_policy='omit',
            alternative='two-sided',
        )
        results.append(dict(gene=gene, position=x_col, p_val=p_val, wt=wt, mut=mut))

results_df = pd.DataFrame(results)

# multiple-testing correction across all tested positions (multipletests defaults)
p_adj = multipletests(results_df['p_val'].values)[1]
results_df['p_adj'] = p_adj

# keep positions whose adjusted p-value is below 0.01
filtered_results = results_df[results_df['p_adj'] < 0.01]


In [6]:
# show the positions that passed the adjusted p-value cutoff (p_adj < 0.01)
filtered_results

Unnamed: 0,gene,position,p_val,wt,mut,p_adj
13,purE,745887,0.000373,A,G,0.005205
14,purE,745891,3e-06,T,C,5.1e-05
15,purE,745916,3e-06,A,C,5.1e-05


All three significant variants are downstream of the coding region, in the intergenic region between purE and the transposase NGO_RS11705 (they are also downstream of the transposase, because it is anti-sense).