# Wzy Linkage Associative Statistics

## Setup

### Define file paths

In [None]:
from pathlib import Path
# Project root on the analysis machine; every other location is derived from it.
parent_path = Path('/Users/tsta0015/Programming/Wzy_Analysis')

# Input folders.
jk_path = parent_path.joinpath('from_JK')
pc_path = jk_path.joinpath('ProteinCartography')

# Output folder.
result_path = parent_path.joinpath('results')

# Individual input files: the linkage spreadsheet and the ProteinCartography
# aggregated-features table.
linkage_data_path = jk_path.joinpath('Acinetobacter_Wzy.xlsx')
pc_cluster_path = pc_path.joinpath(
    'final_results',
    'Wzy_Ab_only_Ph2_tom2025_aggregated_features.tsv',
)

### Load data

#### Load Wzy _A. baumannii_ linkage data

In [None]:
import pandas as pd
# Read the linkage spreadsheet; its first column becomes the row index.
# engine='calamine' relies on the optional python-calamine package.
linkage_data = pd.read_excel(
    linkage_data_path,
    index_col=0,
    engine='calamine',
)
# Optional clean-up, currently disabled: drop annotation columns and
# lower-case the index.
# linkage_data = linkage_data.drop(
#     columns=[
#         'Leiden cluster', 'Genus/species/complex', 'NCBI accession no.',
#         'NCBIfam (Interproscan)', 'Pfam (Interproscan)', 'PANTHER (Interproscan)', 'Structure 1', 'Structure 2', 'SMILES'
#     ]
# )
# linkage_data.index = linkage_data.index.str.lower()
#### Load Wzy _A. baumannii_ ProteinCartography clusters

In [None]:
# import pandas as pd
# clusters = pd.read_csv(pc_cluster_path, nrows=244, sep='\t', index_col=0).drop(
#     columns=['pdb_origin', 'pdb_confidence', 'pdb_chains', 'Protein names', 'StruCluster'])
# clusters.index = clusters.index.str.removesuffix('_model')

### Prepare data

Check the difference between the clusters and the linkage data. Remember that we may not have 3D protein structures for every entry, so we can't perform associative stats on entries with missing data.

In [None]:
# linkage_data.index.difference(clusters.index)

Clusters for abaumannii_kl24, abaumannii_ph3 and abaumannii_ph4 are missing because we didn't model these; therefore, we can't perform associative stats on them.

#### Merge data

In [None]:
# linkage_data = linkage_data.join(clusters, how='inner')

#### Add extra substrate data

We define "Substrate set" as the combination of the donor and acceptor substrates, sorted alphabetically (donor/acceptor agnostic).

In [None]:
# Order-independent label for each row: the donor and acceptor substrate names
# are sorted and joined with a space, so swapping the two gives the same label.
linkage_data['Substrate set'] = (
    linkage_data.loc[:, ['Donor substrate', 'Acceptor substrate']]
    .apply(lambda pair: ' '.join(sorted(pair)), axis=1)
)

We define "Linkage" as the ordered combination of donor substrate, carbon positions and acceptor substrate, joined with hyphens.

In [None]:
# Ordered label for each row: donor, carbon positions and acceptor joined with
# '-' in that fixed order (unlike 'Substrate set', the order is kept).
linkage_data['Linkage'] = (
    linkage_data.loc[:, ['Donor substrate', 'Carbon positions', 'Acceptor substrate']]
    .apply(lambda parts: '-'.join(parts), axis=1)
)

## Stats

#### Load statistics helper function

In [None]:
from helpers import StatHelper
# Column holding the cluster label that every test below groups by.
cluster_col = "Assigned cluster"
# Statistics helper bound to the full linkage table.
helper = StatHelper(linkage_data)

In [None]:
print("--- 1. Anomeric Configuration (One vs One) ---")

# Omnibus chi-squared test across all clusters.
anomeric_omnibus = helper.run_omnibus_chi2(cluster_col, "Anomeric configuration")
print(anomeric_omnibus)

# Pairwise follow-up between individual clusters (A vs B, B vs C...).
anomeric_pairwise = helper.run_pairwise_categorical(
    cluster_col, "Anomeric configuration", mode='one_vs_one'
)
print(anomeric_pairwise)

print("\n")

In [None]:
print("--- 2. Linkages (One vs Rest, n > 5 because the sample dataset is small) ---")

# One-vs-rest pairwise test for the "Linkage" column.
# NOTE(review): the banner says "n > 5" while min_obs=5 is passed; confirm in
# helpers.StatHelper whether the threshold is inclusive.
linkage_one_vs_rest = helper.run_pairwise_categorical(
    cluster_col, "Linkage", mode='one_vs_rest', min_obs=5
)
print(linkage_one_vs_rest)

print("\n")

In [None]:
print("--- 3. Substrate Set (One vs Rest, n > 5) ---")

# One-vs-rest pairwise test for the "Substrate set" column.
# NOTE(review): the banner says "n > 5" while min_obs=5 is passed; confirm in
# helpers.StatHelper whether the threshold is inclusive.
substrate_one_vs_rest = helper.run_pairwise_categorical(
    cluster_col, "Substrate set", mode='one_vs_rest', min_obs=5
)
print(substrate_one_vs_rest)

print("\n")

In [None]:
print("--- 4. Protein Lengths (Kruskal + MWU) ---")

# The helper returns two results: the omnibus test and the pairwise follow-up.
omnibus, pairwise = helper.run_numerical_distributions(
    cluster_col, "Length"
)

print("Omnibus:", omnibus)
# Label and table go on separate lines, exactly as two print calls would emit.
print("Pairwise:", pairwise, sep="\n")

In [None]:
# Visual companion to the length test above: plot "Length" grouped by the
# cluster column (the plot type is defined in helpers.StatHelper).
# Left as a bare call so the notebook handles the returned value as before.
helper.plot_distributions(cluster_col, "Length")

In [None]:
# Plot the association between the cluster column and each categorical column
# tested above. The calls stay as separate bare statements so the notebook's
# handling of the last expression in the cell is unchanged.
helper.plot_categorical_association(cluster_col, "Anomeric configuration")
helper.plot_categorical_association(cluster_col, "Linkage")
helper.plot_categorical_association(cluster_col, "Substrate set")

In [None]:
# Keep only the rows whose 'No. sugars in unit' entry contains no '/', then
# convert that column to numbers. Missing entries count as "no slash"
# (na=False), so they are kept.
slash_rows = linkage_data['No. sugars in unit'].str.contains('/', na=False)
linkage_data_filtered = linkage_data.loc[~slash_rows].copy()
linkage_data_filtered['No. sugars in unit'] = pd.to_numeric(
    linkage_data_filtered['No. sugars in unit']
)

In [None]:
# Second helper bound to the slash-filtered table; used for the sugar-count
# analyses below.
helper_filtered = StatHelper(linkage_data_filtered)

In [None]:
print("--- 5. No. sugars in unit (Kruskal + MWU) ---")

# Uses the filtered helper: this column was made numeric after the rows
# containing '/' were removed.
omnibus, pairwise = helper_filtered.run_numerical_distributions(
    cluster_col, 'No. sugars in unit'
)

print("Omnibus:", omnibus)
# Label and table go on separate lines, exactly as two print calls would emit.
print("Pairwise:", pairwise, sep="\n")

# Kept as the final bare call so the notebook handles its return value as before.
helper_filtered.plot_distributions(cluster_col, 'No. sugars in unit')

In [None]:
# Banner now names the column actually analysed ('No. sugars in main chain');
# it previously read "No. sugars in chain".
print("--- 6. No. sugars in main chain (Kruskal + MWU) ---")
# NOTE(review): the filtered table only had rows with '/' in 'No. sugars in unit'
# removed, and only that column was passed through pd.to_numeric. Confirm that
# 'No. sugars in main chain' is already numeric and free of '/' entries.
omnibus, pairwise = helper_filtered.run_numerical_distributions(cluster_col, 'No. sugars in main chain')
print("Omnibus:", omnibus)
print("Pairwise:")
print(pairwise)
# Kept as the final bare call so the notebook handles its return value as before.
helper_filtered.plot_distributions(cluster_col, 'No. sugars in main chain')

In [None]:
print("--- 7. Carbon positions (One vs Rest) ---")
# Omnibus chi-squared test across all clusters.
print(helper.run_omnibus_chi2(cluster_col, "Carbon positions"))
# Pairwise categorical test in one-vs-rest mode (not the A vs B, B vs C...
# comparisons requested by mode='one_vs_one').
print(helper.run_pairwise_categorical(cluster_col, "Carbon positions", mode='one_vs_rest'))
print("\n")
# Call the plotting helper directly, as the other plotting cells do. Wrapping
# it in print() only echoed the helper's return value under the figure.
helper.plot_categorical_association(cluster_col, "Carbon positions")