In [2]:
#  SETUP

import pandas as pd
import ast
from collections import defaultdict
import ast
from difflib import SequenceMatcher

# === Load your files ===
# All paths are relative to the notebook's working directory.
# Event-level table: later cells read its 'Actor', 'Action' and 'Object'
# columns, plus 'cluster_id' (or 'dyad_id' as a fallback).
structured_df = pd.read_csv("../data/structured_event_sequences.csv")
# Pattern tables: later cells read a 'Pattern' column from each and parse its
# values with ast.literal_eval, so they must be Python-literal strings.
embedding_patterns_df = pd.read_csv("../data/embedded_cluster_patterns.csv")
boolean_patterns_df = pd.read_csv("../data/frequent_patterns_multilabel.csv")


In [3]:
#     Helper Functions
def get_cluster_examples(df, cluster_col='cluster_id', text_col='triplet_text', top_k=3):
    """Collect the most frequent texts of every cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding at least `cluster_col` and `text_col`.
    cluster_col : str
        Column with the cluster label of each row; missing labels are skipped.
    text_col : str
        Column with the text to summarise.
    top_k : int
        Maximum number of texts kept per cluster.

    Returns
    -------
    dict
        Cluster label -> list of up to `top_k` texts, most frequent first.
        Labels are inserted in ascending order.
    """
    labels = df[cluster_col]
    examples_by_cluster = {}
    for label in sorted(labels.dropna().unique()):
        # Frequency table of the texts belonging to this cluster only.
        text_counts = df.loc[labels == label, text_col].value_counts()
        most_common = text_counts.head(top_k)
        examples_by_cluster[label] = most_common.index.tolist()
    return examples_by_cluster

def decode_cluster_pattern(pattern_str, mapping):
    """Translate a stringified sequence of cluster ids into example texts.

    Parameters
    ----------
    pattern_str : str
        Python-literal text of a sequence of cluster ids, e.g. ``"[3, 7]"``.
    mapping : dict-like
        Object whose ``.get(key, default)`` returns the examples for a
        cluster id.

    Returns
    -------
    list
        One entry per cluster id: ``mapping``'s value for that id, or
        ``["?"]`` when the id is unknown. ``["parse_error"]`` is returned when
        `pattern_str` is not a valid literal or does not evaluate to an
        iterable of hashable ids.
    """
    try:
        cluster_seq = ast.literal_eval(pattern_str)
        return [mapping.get(cid, ["?"]) for cid in cluster_seq]
    # Only data problems are turned into "parse_error". A bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit and genuine programming errors.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return ["parse_error"]


cat = pd.read_csv("../data/Category_Combinations_Reference.csv")
# Lookup table: global category id -> (actor, phase, goal) category labels.
# If an id occurs in several rows, the last row wins.
id_to_label = dict(
    (
        combo['global_category_id'],
        (combo['Actor_Category'], combo['Phase_Category'], combo['Goal_Category']),
    )
    for _, combo in cat.iterrows()
)
def decode_boolean_pattern(pattern_str, mapping=None):
    """Translate a stringified multi-label pattern into category labels.

    Parameters
    ----------
    pattern_str : str
        Python-literal text of a sequence of steps, each step being a
        sequence of category ids, e.g. ``"[[1, 2], [3]]"``.
    mapping : dict-like, optional
        Object whose ``.get(key, default)`` returns the label for an id.
        Defaults to the module-level ``id_to_label``.

    Returns
    -------
    list
        One list per step holding the label of every id in that step;
        unknown ids are rendered as ``"ID_<id>"``. ``["parse_error"]`` is
        returned when `pattern_str` is not a valid literal or does not have
        the nested-sequence shape.
    """
    # Resolved outside the try block so that a missing `id_to_label`
    # (e.g. cells run out of order) fails loudly instead of being reported
    # as "parse_error" on every row.
    lookup = id_to_label if mapping is None else mapping
    try:
        steps = ast.literal_eval(pattern_str)
        return [[lookup.get(i, f"ID_{i}") for i in step] for step in steps]
    # Only data problems are turned into "parse_error". A bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit and genuine programming errors.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return ["parse_error"]

# Decode every boolean pattern into label form; rows whose 'Pattern' value
# cannot be decoded end up as ["parse_error"].
boolean_patterns_df["Decoded_Pattern"] = boolean_patterns_df["Pattern"].apply(decode_boolean_pattern)

In [4]:
# Build one "Actor Action Object" string per event row. The cast to str also
# turns missing values into the literal text "nan".
triplet_columns = ['Actor', 'Action', 'Object']
structured_df[triplet_columns] = structured_df[triplet_columns].astype(str)
structured_df['triplet_text'] = structured_df[triplet_columns].agg(' '.join, axis=1)

# Without a 'cluster_id' column, 'dyad_id' stands in for it as a placeholder.
has_cluster_ids = 'cluster_id' in structured_df.columns
if not has_cluster_ids:
    structured_df['cluster_id'] = structured_df['dyad_id']

# cluster id -> most frequent triplet texts of that cluster
cluster_to_examples = get_cluster_examples(structured_df)

# Replace every cluster id in each embedding pattern by its example texts.
embedding_patterns_df["Decoded_Pattern"] = embedding_patterns_df["Pattern"].apply(
    decode_cluster_pattern, args=(cluster_to_examples,)
)


# === 4. Save both ===
# Write each decoded table to ../data without the row index.
for decoded_frame, destination in (
    (embedding_patterns_df, "../data/decoded_embedding_patterns.csv"),
    (boolean_patterns_df, "../data/decoded_boolean_patterns.csv"),
):
    decoded_frame.to_csv(destination, index=False)

# One line per message, same text as three separate print calls.
print(
    "✅ Saved decoded pattern files:",
    "→ decoded_embedding_patterns.csv",
    "→ decoded_boolean_patterns.csv",
    sep="\n",
)

✅ Saved decoded pattern files:
→ decoded_embedding_patterns.csv
→ decoded_boolean_patterns.csv


In [5]:
# NOTE: the recorded run of this cell did not finish; it was stopped by hand
# (see the KeyboardInterrupt in the cell output).
# Likely cause: every boolean pattern is compared with every embedding pattern
# using difflib.SequenceMatcher, so the work grows with the product of the two
# table sizes.

# Load formatted boolean and embedding patterns
# ('Decoded_Pattern' is parsed further down with ast.literal_eval).
df_boolean = pd.read_csv("../data/decoded_boolean_patterns.csv")
df_embedding = pd.read_csv("../data/decoded_embedding_patterns.csv")

# Optional: convert pattern strings to clean readable form
def format_decoded_pattern(pattern_str):
    """Render a decoded pattern string as a readable "a → b → c" chain.

    Parameters
    ----------
    pattern_str : str
        Python-literal text of a decoded pattern: a sequence of steps, each
        step being either a plain string or a sequence of labels. A label is
        either a string (e.g. ``"ID_5"``, ``"?"``, an example sentence) or a
        tuple/list of parts (e.g. an ``(actor, phase, goal)`` triple).

    Returns
    -------
    str or object
        All labels of all steps, in order, joined by ``" → "``; the parts of a
        tuple/list label are joined by ``"/"``. When `pattern_str` cannot be
        parsed or does not have the expected shape, it is returned unchanged.
    """
    try:
        pattern = ast.literal_eval(pattern_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return pattern_str
    if not isinstance(pattern, (list, tuple)):
        return pattern_str

    flat = []
    try:
        for item in pattern:
            if isinstance(item, str):
                # e.g. the "parse_error" marker: keep it whole.
                flat.append(item)
                continue
            # Strings are atomic labels. Joining them with "/" would split
            # them into single characters ("ID_5" -> "I/D/_/5").
            flat.extend(
                "/".join(map(str, label)) if isinstance(label, (list, tuple)) else str(label)
                for label in item
            )
    except TypeError:
        # A step that is neither a string nor iterable: not a decoded pattern.
        return pattern_str
    return " → ".join(flat)

# Apply formatting to both boolean and embedding patterns:
# each table gains a 'Formatted_Pattern' column derived from 'Decoded_Pattern'.
for pattern_frame in (df_boolean, df_embedding):
    pattern_frame['Formatted_Pattern'] = pattern_frame['Decoded_Pattern'].apply(format_decoded_pattern)

# Function to compute similarity (0 to 1)
def similarity(a, b):
    """Return the difflib similarity ratio of `a` and `b`.

    The value lies between 0.0 (nothing in common) and 1.0 (identical).
    No junk predicate is used.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

# Create a result table to compare top matches
def _best_embedding_matches(boolean_patterns, embedding_patterns):
    """Find, for every boolean pattern, the most similar embedding pattern.

    Scores are ``SequenceMatcher(None, boolean, embedding).ratio()``, the same
    measure and argument order as `similarity`. The matcher is driven directly
    so that the index it builds for its second sequence is computed once per
    embedding pattern instead of once per pair.

    Parameters
    ----------
    boolean_patterns : list
        Query strings, one result is produced per entry (duplicates included).
    embedding_patterns : list
        Candidate strings, in priority order for tie-breaking.

    Returns
    -------
    list of (best_match, best_score)
        ``best_match`` is the first candidate reaching the highest score, or
        ``None`` (with score 0.0) when no candidate scores above zero.
    """
    # A repeated candidate can never strictly beat its first occurrence.
    candidates = list(dict.fromkeys(embedding_patterns))
    best_scores = [0.0] * len(boolean_patterns)
    best_matches = [None] * len(boolean_patterns)

    matcher = SequenceMatcher(None)
    for candidate in candidates:
        matcher.set_seq2(candidate)  # expensive indexing happens here, once
        for idx, query in enumerate(boolean_patterns):
            matcher.set_seq1(query)
            current_best = best_scores[idx]
            # Both are cheap upper bounds on ratio(). If a bound cannot beat
            # the current best, the exact ratio cannot either.
            if matcher.real_quick_ratio() <= current_best:
                continue
            if matcher.quick_ratio() <= current_best:
                continue
            score = matcher.ratio()
            if score > current_best:
                best_scores[idx] = score
                best_matches[idx] = candidate
    return list(zip(best_matches, best_scores))


boolean_formatted = df_boolean['Formatted_Pattern'].tolist()
embedding_formatted = df_embedding['Formatted_Pattern'].tolist()

results = [
    {
        "Boolean_Pattern": boolean_pattern,
        "Best_Embedding_Match": best_match,
        "Similarity_Score": round(best_score, 3),
    }
    for boolean_pattern, (best_match, best_score) in zip(
        boolean_formatted,
        _best_embedding_matches(boolean_formatted, embedding_formatted),
    )
]

# Convert to DataFrame and optionally save
df_compare = pd.DataFrame(results)
df_compare.to_csv("../data/pattern_similarity_comparison.csv", index=False)

# Display top examples
print(df_compare.head())


KeyboardInterrupt: 