In [None]:
import pandas as pd
from scripts.clusteranalyzer import (
    filtrate_low_clusters,
    create_one_zero_matrix,
    create_coocurance_matrix,
    calculate_pvals,
    correct_pvals,
    calculate_similarity_matrix,
    adjust_similarity_matrix,
    clusterize,
    assemble_results,
)
from scripts.assembleresults import add_coocurance, add_immune_numbers, add_distance
from scripts.calculate_pairwise_distance import calculate_pairwise_distance
from scripts.addannotation import (
    annotation_transformation,
    process_go_annotations,
    process_pfam_annotations,
    transform_cluster_protein_data,
    combine_annotations_data,
    name_transformation,
)

**Part 1.** Calculate statistical significance of cluster co-occurrence and graph clustering

In [None]:
# Load the cluster -> protein assignment table (tab-separated, no header row).
DB_clu = pd.read_csv(
    "./data/mmseq2_results/3_output_tsv/DB_clu.tsv",
    names=["cluster_id", "protein_id"],
    header=None,
    sep="\t",
)

# Drop small clusters, then build the binary matrix and the co-occurrence matrix.
filtered_DB, cluster_sizes = filtrate_low_clusters(DB_clu, min_cluster_size=3)
genome_names, cluster_names, sparse_matrix = create_one_zero_matrix(filtered_DB)
coocurance_matrix = create_coocurance_matrix(sparse_matrix)

# P-values for the co-occurrences, followed by the correction step.
# The uncorrected values are released immediately, as in a nested-call form.
uncorrected_pvals = calculate_pvals(genome_names, coocurance_matrix, sparse_matrix)
corrected_pvals = correct_pvals(uncorrected_pvals)
del uncorrected_pvals

# Similarity matrix derived from the corrected p-values, then adjusted.
unadjusted_similarity = calculate_similarity_matrix(corrected_pvals)
similarity_norm = adjust_similarity_matrix(unadjusted_similarity)
del unadjusted_similarity

# Cluster the adjusted similarity matrix into modules.
inflation = 2.0
modules = clusterize(similarity_norm, inflation)

Number of unique regions : 186264
Number of unique proteins: 730638
Number of unique clusters : 362472
Number of unique regions (after filtration) : 138447
Number of unique proteins (after filtration) : 375950
Number of unique clusters (after filtration) : 42708
 Delete clusters containing less than 3 proteins
Sparse matrix with 138447 rows and 42708 columns was created
Coocurance matrix 42708 rows and 42708 columns was created
Get 21523 modules!


**Part 2.** Select modules and add annotation to selected clusters

In [None]:
result = assemble_results(modules, cluster_sizes, cluster_names)

# Enrich every module with its co-occurrence, the number of immune proteins
# in the module, and the average pairwise distance.
# The distance is computed from `result` before the extra columns are added.
annotation = pd.read_csv("./data/annotation.csv", sep=",", low_memory=False)
distance = calculate_pairwise_distance(result, filtered_DB)
result = add_coocurance(filtered_DB, result)
result = add_immune_numbers(result, DB_clu, annotation)
result = add_distance(result, distance)

# Filter modules: every cluster in the module must have more than 20 members
# and the module's `immune` value must be 0.
every_cluster_large = result["cluster_sizes"].apply(
    lambda sizes: all(size > 20 for size in sizes)
)
without_immune = result["immune"] == 0
filtrated_results = result[every_cluster_large & without_immune].sort_values(
    "coocurance", ascending=False
)

# Keep modules whose co-occurrence exceeds 0.7 of their smallest cluster size.
smallest_cluster_size = filtrated_results["cluster_sizes"].apply(min)
coocurance_share = filtrated_results["coocurance"] / smallest_cluster_size
filtrated_results = filtrated_results[coocurance_share > 0.7]

# One row per cluster: unpack the per-module lists, then switch to singular names.
filtrated_results = filtrated_results.explode(
    ["cluster_ids", "cluster_inds", "cluster_sizes"]
)
filtrated_results = filtrated_results.rename(
    columns={"cluster_ids": "cluster_id", "cluster_inds": "cluster_ind"}
)

# Create annotation
new_anotation = annotation_transformation(annotation)
GO_annotation = process_go_annotations(new_anotation, max_workers=20)
pfam_annotation = process_pfam_annotations(new_anotation)
new_data = transform_cluster_protein_data(filtered_DB)
combine_annotations = combine_annotations_data(
    new_data,
    GO_annotation,
    pfam_annotation,
)

# Add annotation to the selected clusters: transform the cluster ids with
# `name_transformation`, then left-join the combined annotations on `cluster_id`.
filtrated_results["cluster_id"] = filtrated_results["cluster_id"].apply(
    name_transformation
)
filtrated_results_annotated = pd.merge(
    filtrated_results,
    combine_annotations,
    on="cluster_id",
    how="left",
)
filtrated_results_annotated

Unnamed: 0,module_id,module_size,cluster_id,cluster_ind,cluster_sizes,coocurance,immune,average_distance,cluster_size,GO_terms,pfam_term,region
0,14191,2,sample_2485_contig_1773_9,22125,30,20,0,0.0,30,-,"Bacterial antitoxin of type II TA system, VapB : 22;","downstream: 15, immuneisland: 0, upstream: 15"
1,14191,2,sample_3118_contig_19531_10,24243,23,20,0,0.0,23,-,PIN domain : 2;,"downstream: 13, immuneisland: 0, upstream: 10"
2,12963,2,sample_1849_contig_6904_6,19614,22,20,0,0.0,22,-,-,"downstream: 7, immuneisland: 1, upstream: 14"
3,12963,2,sample_2487_contig_1607_8,22585,22,20,0,0.0,22,-,"ParE toxin of type II toxin-antitoxin system, parDE : 1;","downstream: 9, immuneisland: 1, upstream: 12"
4,12647,2,sample_17807_contig_5401_4,18988,24,19,0,0.0,24,-,-,"downstream: 17, immuneisland: 0, upstream: 7"
5,12647,2,sample_5535_contig_9_143,33719,23,19,0,0.0,23,dTMP kinase activity : 23; ATP binding : 23; dTDP biosynthetic process : 23;,Thymidylate kinase : 23;,"downstream: 14, immuneisland: 0, upstream: 9"
6,3018,2,sample_14219_contig_4171_3,3527,25,18,0,1.111111,25,-,Cysteine-rich domain : 21;,"downstream: 8, immuneisland: 0, upstream: 17"
7,3018,2,sample_17780_contig_2415_2,18357,29,18,0,1.111111,29,-,LUD domain : 29;,"downstream: 10, immuneisland: 0, upstream: 19"
8,1400,2,sample_1182_contig_3641_12,1608,22,16,0,0.0,22,-,HipA-like C-terminal domain : 22;,"downstream: 11, immuneisland: 0, upstream: 11"
9,1400,2,sample_1849_contig_355_3,19517,24,16,0,0.0,24,-,-,"downstream: 12, immuneisland: 0, upstream: 12"


In [None]:
# Export the tables used to create the visualization.

# Assemble both module tables up front. `min_module_size=0` presumably disables
# the module-size filter (TODO confirm against `assemble_results`).
modules_without_filtration = assemble_results(
    modules, cluster_sizes, cluster_names, min_module_size=0
)
# Rebind `result` to a fresh `assemble_results` output with default settings.
result = assemble_results(modules, cluster_sizes, cluster_names)

# (destination path, table) pairs, written in this order without the index.
tables_to_export = [
    ("./data/tables/filtered_DB.csv", filtered_DB),
    ("./data/tables/modules_without_filtration.csv", modules_without_filtration),
    ("./data/tables/modules_with_filtration.csv", result),
    ("./data/tables/filtrated_results_annotated.csv", filtrated_results_annotated),
]
for export_path, export_frame in tables_to_export:
    export_frame.to_csv(export_path, index=False)