In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import spacy
from prefixspan import PrefixSpan

# Make sure the NLTK data packages used by this notebook are present
# (nltk reports "already up-to-date" instead of re-downloading when they are),
# then load the `en_core_web_sm` spaCy pipeline used by the extraction cells.
for nltk_package in ('stopwords', 'punkt'):
    nltk.download(nltk_package)

nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ymont\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ymont\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Combine the description texts of all rows that share a dyad_id into one
# ' | '-separated string per dyad.

df = pd.read_csv('../data/peace_observatory(negotiations).csv', delimiter=';')

# Missing descriptions are NaN (a float); str.join raises TypeError on them,
# so drop them (and coerce the rest to str) before joining. A dyad whose
# descriptions are all missing ends up with an empty string.
df = (
    df.groupby('dyad_id')['description']
      .apply(lambda x: ' | '.join(x.dropna().astype(str)))
      .reset_index()
)

In [3]:


# Normalise the per-dyad description text: fill gaps, strip punctuation,
# lower-case, drop English stopwords, Porter-stem, and save the result.
#
# NOTE(review): the extraction cells below run spaCy on this stemmed,
# lower-cased, punctuation-free text; confirm that NER / POS tagging on text
# in this form is intended.

# Handle NaN first, then coerce to str. The order matters: astype(str) turns
# NaN into the literal string 'nan', after which fillna('') has nothing to fill.
df['description'] = df['description'].fillna('').astype(str)

# Remove special characters (everything that is neither a word character nor
# whitespace; this also removes the '|' separator between joined rows)
df['description'] = df['description'].str.replace(r'[^\w\s]', '', regex=True)

# Convert to lowercase
df['description'] = df['description'].str.lower()

stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Tokenise on whitespace (str.split, not nltk's word_tokenize), drop stopwords,
# stem each remaining token, and join back into a single string -- one pass per
# row instead of four separate .apply() sweeps.
df['description'] = df['description'].apply(
    lambda text: ' '.join(
        ps.stem(word) for word in text.split() if word not in stop_words
    )
)

# Save processed data
df.to_csv('../data/processed_descriptions.csv', index=False)

print("Text preprocessing completed successfully!")


Text preprocessing completed successfully!


In [4]:
# -----------------------------------------
# 🔹 FUNCTION: Extract Named Entities (NER)
# -----------------------------------------
def extract_entities(text):
    """Return the surface text of each PERSON, ORG or GPE entity found in `text`.

    The text is parsed with the module-level spaCy pipeline `nlp`; entities
    with any other label are ignored. Order follows `doc.ents`.
    """
    wanted_labels = ("PERSON", "ORG", "GPE")
    found = []
    for span in nlp(text).ents:
        if span.label_ in wanted_labels:
            found.append(span.text)
    return found

# Store, for each row of df, the list of PERSON/ORG/GPE entity strings found in its description.
df['entities'] = df['description'].apply(extract_entities)

# ----------------------------------------
# 🔹 FUNCTION: Extract Verbs (Actions)
# ----------------------------------------
def extract_verbs(text):
    """Return the lemma of every token in `text` that spaCy tags as a VERB.

    The text is parsed with the module-level spaCy pipeline `nlp`; lemmas are
    returned in token order, duplicates included.
    """
    actions = []
    for tok in nlp(text):
        if tok.pos_ == "VERB":
            actions.append(tok.lemma_)
    return actions

# Store, for each row of df, the list of verb lemmas found in its description.
df['actions'] = df['description'].apply(extract_verbs)

# ----------------------------------------------------
# 🔹 FUNCTION: Extract Action-Object Relationships
# ----------------------------------------------------
def extract_action_object_pairs(text):
    """Return (actor, action, object) tuples for the verbs in `text`.

    For every token tagged VERB by the module-level spaCy pipeline `nlp`:
      * actor  = text of its first direct child with dep_ nsubj / nsubjpass
      * action = the verb's lemma
      * object = text of its first direct child with dep_ dobj / attr / prep
    A verb contributes a tuple only when both an actor and an object exist.

    NOTE(review): for a `prep` child the "object" is the text of that child
    itself, not of anything beneath it -- confirm this is the intent.
    """
    subject_deps = ("nsubj", "nsubjpass")
    object_deps = ("dobj", "attr", "prep")
    triplets = []

    for tok in nlp(text):
        if tok.pos_ != "VERB":
            continue

        # First matching dependent of each kind, or None when there is none.
        actor = next(
            (dep.text for dep in tok.children if dep.dep_ in subject_deps), None
        )
        target = next(
            (dep.text for dep in tok.children if dep.dep_ in object_deps), None
        )

        if actor is not None and target is not None:
            triplets.append((actor, tok.lemma_, target))

    return triplets

In [5]:
# Extract (Actor, Action, Object) triplets for every dyad description.
df['event_triplets'] = df['description'].apply(extract_action_object_pairs)

# Prepare data for mining action-object pairs: one record per triplet, tagged
# with the dyad it came from. Triplets keep their order within each dyad.
structured_data = [
    {"Actor": actor, "Action": action, "Object": obj, "dyad_id": dyad_id}
    for dyad_id, triplets in zip(df['dyad_id'], df['event_triplets'])
    for actor, action, obj in triplets
]

# Convert extracted sequences into a structured DataFrame.
# The columns are named explicitly so that, even when no triplet was extracted
# at all, the CSV still gets a header row and can be read back by the
# pattern-mining cell instead of failing on an empty file.
structured_df = pd.DataFrame(
    structured_data, columns=["Actor", "Action", "Object", "dyad_id"]
)

# Save structured sequences for pattern mining
structured_df.to_csv('../data/structured_event_sequences.csv', index=False)

In [6]:
# Sequential pattern mining (PrefixSpan) over the raw event triplets.

MIN_SUPPORT = 10   # a pattern must occur in at least this many dyad sequences
TOP_N = 20         # number of highest-support patterns to keep

# keep_default_na=False: the triplet columns hold word tokens, and pandas would
# otherwise parse tokens such as "null" or "nan" as missing values, which the
# astype(str) below would then all turn into the same string 'nan'.
structured_df = pd.read_csv(
    '../data/structured_event_sequences.csv', keep_default_na=False
)

# Ensure all values in 'Actor', 'Action', and 'Object' columns are strings
triplet_cols = ['Actor', 'Action', 'Object']
structured_df[triplet_cols] = structured_df[triplet_cols].astype(str)

# Combine columns into a single 'event' token, e.g. "negoti_take_place"
structured_df['event'] = structured_df[triplet_cols].agg('_'.join, axis=1)

# Group by dyad_id (sequence ID): one list of event tokens per dyad, in file order
grouped_sequences = structured_df.groupby('dyad_id')['event'].apply(list).tolist()

# Run PrefixSpan
ps = PrefixSpan(grouped_sequences)

# Extract frequent patterns (support >= MIN_SUPPORT)
patterns = ps.frequent(MIN_SUPPORT)

# Sort by support, highest first, and keep the top TOP_N
top_patterns = sorted(patterns, key=lambda x: x[0], reverse=True)[:TOP_N]

# Convert to DataFrame
patterns_df = pd.DataFrame(top_patterns, columns=["Support", "Pattern"])
print(patterns_df)

# Optional: Save to CSV
patterns_df.to_csv("../data/top_sequential_patterns.csv", index=False)

   Support                                 Pattern
0       45                     [negoti_take_place]
1       19                 [negoti_lead_agreement]
2       17  [negoti_take_place, negoti_take_place]
3       14                       [meet_take_place]
4       13                    [process_take_place]
5       12                    [schedul_take_place]


In [8]:
# SPM but with embeddings

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from collections import Counter
from pymining import seqmining
from prefixspan import PrefixSpan

# Step 1: Load your triplet file
# keep_default_na=False keeps word tokens such as "null" or "nan" as text
# instead of letting pandas parse them as missing values.
df = pd.read_csv("../data/structured_event_sequences.csv", keep_default_na=False)

# Step 2: Combine Actor, Action, Object into a single sentence
triplet_cols = ['Actor', 'Action', 'Object']
df[triplet_cols] = df[triplet_cols].astype(str)
df['triplet_text'] = df[triplet_cols].agg(' '.join, axis=1)

# Step 3: Compute embeddings (one vector per triplet row)
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['triplet_text'].tolist(), show_progress_bar=True)

# Step 4: Cluster embeddings
# KMeans raises ValueError when asked for more clusters than there are samples,
# so cap the requested number by the number of distinct triplet texts.
# With 50 or more distinct texts this is the same 50 clusters as before.
MAX_CLUSTERS = 50
n_clusters = min(MAX_CLUSTERS, len(set(df['triplet_text'])))
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
df['cluster_id'] = kmeans.fit_predict(embeddings)

# Step 5: Generate human-readable names for clusters from the (up to) three
# most frequent words among each cluster's triplet texts.
cluster_summaries = {}  # cluster_id -> "w1 / w2 / w3", printed below
cluster_names = {}      # cluster_id -> "w1 w2 w3", used in the pattern strings

for label, texts in df.groupby('cluster_id')['triplet_text']:
    words = " ".join(texts).lower().split()
    common_words = [w for w, _ in Counter(words).most_common(3)]
    fallback = f"Cluster {label}"
    cluster_summaries[label] = " / ".join(common_words) if common_words else fallback
    cluster_names[label] = " ".join(common_words) if common_words else fallback

print("✅ Cluster summaries:")
for cid, summary in cluster_summaries.items():
    print(f"Cluster {cid}: {summary}")

# Step 6: Build sequences of cluster_ids by dyad (file order within each dyad)
sequences = df.groupby('dyad_id')['cluster_id'].apply(list).tolist()

# Step 7: Run PrefixSpan on cluster sequences
CLUSTER_MIN_SUPPORT = 10  # a pattern must occur in at least this many dyads
ps = PrefixSpan(sequences)
patterns = ps.frequent(minsup=CLUSTER_MIN_SUPPORT)

# Step 8: Replace cluster IDs with human-readable names.
# Only patterns with at least two steps and at least two distinct clusters are
# kept. The name of each step is the cluster's frequent words joined by spaces;
# these are frequent words, not a parsed actor/action/object triple, so they
# are joined directly rather than split into three fields.
results = []

for support, seq in sorted(patterns, key=lambda x: -x[0]):
    if len(seq) < 2 or len(set(seq)) <= 1:
        continue

    results.append({
        "Support": support,
        "Pattern": " → ".join(
            cluster_names.get(cid, f"Cluster {cid}") for cid in seq
        ),
        "Cluster Sequence": " → ".join(str(cid) for cid in seq),
    })

# Explicit columns keep the header present even when no pattern qualifies.
patterns_df = pd.DataFrame(results, columns=["Support", "Pattern", "Cluster Sequence"])
patterns_df.to_csv("../data/embedded_cluster_patterns_named.csv", index=False, encoding='utf-8')

print("✅ Named patterns saved to 'embedded_cluster_patterns_named.csv'.")
print(patterns_df.head())


KeyboardInterrupt: 