# In [None]:
import json
from pprint import pprint
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from collections import defaultdict
from itertools import combinations


class FinBERTAspectOrganizer:
    """Deduplicate two-level ``"level1/level2"`` aspect labels with FinBERT embeddings.

    Aspects are embedded with the ``yiyanghkust/finbert-tone`` encoder and
    compared by cosine similarity.  Near-duplicate level2 labels are dropped,
    first across all level1 categories and then again inside each category;
    of two near-duplicates, the one whose embedding is closer to its level1
    parent is kept.
    """

    def __init__(self):
        """Load the FinBERT tokenizer and encoder and move the encoder to GPU if one is available."""
        self.tokenizer = AutoTokenizer.from_pretrained('yiyanghkust/finbert-tone')
        self.model = AutoModel.from_pretrained('yiyanghkust/finbert-tone')
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def get_embeddings(self, texts):
        """Embed ``texts`` with FinBERT.

        Args:
            texts: List of strings; they are tokenized as one padded,
                truncated batch.

        Returns:
            A NumPy array with one row per text, taken from the last hidden
            state at token position 0 (the ``[CLS]`` position for BERT-style
            tokenizers).
        """
        encoded = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        encoded = {k: v.to(self.device) for k, v in encoded.items()}

        # Inference only: no gradients are needed, which saves memory.
        with torch.no_grad():
            outputs = self.model(**encoded)
            embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

        return embeddings

    def preprocess_aspects(self, aspects_dict):
        """Flatten ``{cluster: ["level1/level2", ...]}`` into a DataFrame.

        Each aspect is split on its FIRST ``/`` only, so a level2 label may
        itself contain slashes.

        Args:
            aspects_dict: Mapping of cluster id to an iterable of
                ``"level1/level2"`` strings.

        Returns:
            DataFrame with columns ``cluster``, ``level1``, ``level2`` and
            ``full_aspect``.  The columns are present even when the input is
            empty, so downstream column lookups never fail.

        Raises:
            ValueError: If an aspect contains no ``/`` separator.
        """
        columns = ['cluster', 'level1', 'level2', 'full_aspect']
        flat_aspects = []
        for cluster, aspects in aspects_dict.items():
            for aspect in aspects:
                lvl1, separator, lvl2 = aspect.partition('/')
                if not separator:
                    raise ValueError(
                        f"Aspect {aspect!r} in cluster {cluster!r} is not of the form 'level1/level2'"
                    )
                flat_aspects.append({
                    'cluster': cluster,
                    'level1': lvl1,
                    'level2': lvl2,
                    'full_aspect': aspect,
                })
        return pd.DataFrame(flat_aspects, columns=columns)

    def get_parent_similarity(self, level2_aspect, level1_aspect):
        """Return the cosine similarity between a level2 aspect and its level1 parent."""
        embeddings = self.get_embeddings([level2_aspect, level1_aspect])
        return cosine_similarity(embeddings)[0, 1]

    def filter_similar_aspects_global(self, df, threshold=0.8):
        """Drop near-duplicate level2 aspects across all level1 categories.

        Every unordered pair of rows whose level2 embeddings have cosine
        similarity above ``threshold`` is resolved by keeping the aspect that
        is more similar to its own level1 parent; on a tie the earlier row
        wins.  Once a ``(level2, level1)`` pair is marked for removal it takes
        no part in later comparisons.

        Args:
            df: DataFrame with ``level1`` and ``level2`` columns, as produced
                by ``preprocess_aspects``.
            threshold: Similarity above which two level2 aspects count as
                duplicates.

        Returns:
            ``df`` without the rows whose ``(level2, level1)`` pair was
            removed.  An empty ``df`` is returned unchanged.
        """
        if df.empty:
            # Nothing to embed or compare; similarity on zero samples would fail.
            return df

        # Keep every row (not just unique labels) so indices line up with df.
        level2_aspects = list(zip(df['level2'], df['level1']))
        level2_texts = [level2 for level2, _ in level2_aspects]

        embeddings = self.get_embeddings(level2_texts)
        similarities = cosine_similarity(embeddings)

        aspects_to_remove = set()
        parent_sim_cache = {}

        def parent_similarity(aspect, parent):
            # Each lookup costs a model forward pass, so compute it once per pair.
            key = (aspect, parent)
            if key not in parent_sim_cache:
                parent_sim_cache[key] = self.get_parent_similarity(aspect, parent)
            return parent_sim_cache[key]

        # Visiting only i < j is sufficient: the mirrored (j, i) visit could
        # never change the outcome because one side is already removed by then.
        for i, j in combinations(range(len(level2_aspects)), 2):
            if not similarities[i, j] > threshold:
                continue

            aspect1, parent1 = level2_aspects[i]
            aspect2, parent2 = level2_aspects[j]

            # Skip if either aspect-parent pair is already marked for removal.
            if (aspect1, parent1) in aspects_to_remove or (aspect2, parent2) in aspects_to_remove:
                continue

            # The same aspect-parent pair listed in several clusters is not a duplicate of itself.
            if aspect1 == aspect2 and parent1 == parent2:
                continue

            sim1 = parent_similarity(aspect1, parent1)
            sim2 = parent_similarity(aspect2, parent2)

            # Keep the one with the higher parent similarity.
            if sim1 >= sim2:
                aspects_to_remove.add((aspect2, parent2))
                print(f"removed {aspect2} from {parent2} for {aspect1} from {parent1}")
            else:
                aspects_to_remove.add((aspect1, parent1))
                print(f"removed {aspect1} from {parent1} for {aspect2} from {parent2}")

        removed_mask = pd.Series(
            [pair in aspects_to_remove for pair in level2_aspects],
            index=df.index,
        )
        return df[~removed_mask]

    def filter_similar_aspects_within_level1(self, level1_aspect, level2_aspects, threshold=0.75):
        """Drop near-duplicate level2 aspects inside one level1 category.

        Args:
            level1_aspect: The parent (level1) label.
            level2_aspects: Collection of level2 labels under that parent.
                Pairs are compared in iteration order, so pass an ordered
                collection when reproducible results matter.
            threshold: Similarity above which two level2 aspects count as
                duplicates.

        Returns:
            Set of level2 labels that were kept (empty set for empty input).
        """
        if not level2_aspects:
            return set()

        aspects_list = list(level2_aspects)
        embeddings = self.get_embeddings([level1_aspect] + aspects_list)

        # Row 0 is the parent; the remaining rows are the children.
        level1_emb = embeddings[:1]
        level2_embs = embeddings[1:]

        level1_sims = cosine_similarity(level1_emb, level2_embs)[0]
        level2_sims = cosine_similarity(level2_embs)

        aspects_to_remove = set()

        # Visiting only i < j is sufficient; see filter_similar_aspects_global.
        for i, j in combinations(range(len(aspects_list)), 2):
            if not level2_sims[i, j] > threshold:
                continue

            # Skip if either aspect is already marked for removal.
            if aspects_list[i] in aspects_to_remove or aspects_list[j] in aspects_to_remove:
                continue

            # Skip if comparing the same aspect (possible when the input has repeats).
            if aspects_list[i] == aspects_list[j]:
                continue

            # Keep whichever child is closer to the parent; ties keep the earlier one.
            if level1_sims[i] >= level1_sims[j]:
                aspects_to_remove.add(aspects_list[j])
                print(
                    f"From {level1_aspect} removed {aspects_list[j]} for {aspects_list[i]} (parent sims: {level1_sims[i]:.3f} vs {level1_sims[j]:.3f})")
            else:
                aspects_to_remove.add(aspects_list[i])
                print(
                    f" From {level1_aspect} removed {aspects_list[i]} for {aspects_list[j]} (parent sims: {level1_sims[j]:.3f} vs {level1_sims[i]:.3f})")

        return set(aspects_list) - aspects_to_remove

    def reorganize_aspects(self, df, global_threshold=0.8, within_threshold=0.75):
        """Run the global filter, then the per-category filter.

        Args:
            df: DataFrame with ``level1`` and ``level2`` columns.
            global_threshold: Threshold passed to
                ``filter_similar_aspects_global``.
            within_threshold: Threshold passed to
                ``filter_similar_aspects_within_level1``.

        Returns:
            Dict mapping each level1 label to the set of level2 labels kept.
        """
        # First filter similar aspects globally.
        filtered_df = self.filter_similar_aspects_global(df, threshold=global_threshold)
        if filtered_df.empty:
            return {}

        # Group level2 labels by level1.  A dict is used as an ordered set so
        # that the comparison order below follows row order instead of string
        # hash order, which would change between interpreter runs.
        initial_categories = defaultdict(dict)
        for level1, level2 in zip(filtered_df['level1'], filtered_df['level2']):
            initial_categories[level1][level2] = None

        # Then filter within each category.
        final_categories = {}
        for level1, level2_aspects in initial_categories.items():
            final_categories[level1] = self.filter_similar_aspects_within_level1(
                level1, list(level2_aspects), threshold=within_threshold
            )

        return final_categories


def reassign_aspects(final_categories, df):
    """Rebuild per-cluster aspect lists from the aspects that survived filtering.

    Args:
        final_categories: Mapping of level1 label to a collection of the
            level2 labels that were kept.
        df: DataFrame with ``cluster``, ``level1`` and ``level2`` columns.

    Returns:
        Plain dict mapping each cluster to its list of ``"level1/level2"``
        strings, in row order and without repeats.  A cluster with no
        surviving aspect does not appear in the result.
    """
    # Flatten the survivors into (level1, level2) pairs for quick membership tests.
    surviving = set()
    for parent, children in final_categories.items():
        for child in children:
            surviving.add((parent, child))

    per_cluster = defaultdict(list)
    for _, record in df.iterrows():
        parent = record['level1']
        child = record['level2']
        if (parent, child) not in surviving:
            continue

        label = f"{parent}/{child}"
        bucket = per_cluster[record['cluster']]
        # The same aspect can occur more than once per cluster; list it once.
        if label not in bucket:
            bucket.append(label)

    return dict(per_cluster)


# Script entry point: load the aspects, filter them, save and print the result.
def _print_cluster_assignments(heading, assignments):
    """Print ``heading``, an underline, and each cluster's aspects sorted by cluster and aspect."""
    print(heading)
    print("==========================")
    for cluster, aspects in sorted(assignments.items()):
        print(f"\nCluster {cluster} ({len(aspects)} aspects):")
        for aspect in sorted(aspects):
            print(f"  - {aspect}")


def main(input_path='original_aspects.json', output_path='filtered_aspects.json'):
    """Filter near-duplicate aspects and write the new cluster assignments.

    Reads ``input_path``, a JSON object mapping cluster ids to objects with
    an ``"aspects"`` list, deduplicates the aspects with
    ``FinBERTAspectOrganizer``, writes the surviving per-cluster aspects to
    ``output_path`` and prints the categorization before and after.

    Args:
        input_path: JSON file to read.
        output_path: JSON file to write.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's default codec.
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # JSON object keys are strings; cluster ids are used as ints from here on.
    aspects_dict = {int(cluster): entry['aspects'] for cluster, entry in data.items()}

    organizer = FinBERTAspectOrganizer()

    # Process aspects
    df = organizer.preprocess_aspects(aspects_dict)

    # Reorganize and filter aspects
    final_categories = organizer.reorganize_aspects(df)

    # Reassign filtered aspects to clusters
    new_aspects_dict = reassign_aspects(final_categories, df)

    # Save the new assignments
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(new_aspects_dict, f, indent=2)

    # Print both categorizations
    print("\nFinal Categorization by Level 1:")
    print("================================")
    for level1, level2_aspects in sorted(final_categories.items()):
        print(f"\n{level1} ({len(level2_aspects)} aspects):")
        for aspect in sorted(level2_aspects):
            print(f"  - {aspect}")

    _print_cluster_assignments("\n Old Cluster Assignments:", aspects_dict)
    _print_cluster_assignments("\nFinal Cluster Assignments:", new_aspects_dict)


# Run the pipeline only when executed as a script (or in a notebook cell),
# not when this module is imported for its classes and functions.
if __name__ == "__main__":
    main()