In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer
from together import Together

# Initialize the sentence transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize the language model client
client = Together(api_key='')

# Step 1: Load the saved CSV file
df = pd.read_csv('highest_level_topic_labels_with_representatives.csv')

# Step 2: Compute embeddings for each Highest_Topic_Label
df['Embeddings'] = df['Highest_Topic_Label'].apply(lambda x: model.encode(x))

# Step 3: Compute pairwise cosine similarity among Highest_Topic_Labels
embeddings = np.vstack(df['Embeddings'].to_list())
similarity_matrix = cosine_similarity(embeddings)

# Step 4: Initialize dictionaries to keep track of merged labels and their original pairs
merged_labels = {}
original_pairs = {}
used_labels = set()  # Keep track of labels that have been used

# Identify and merge labels with similarity greater than 0.8
for i in range(len(similarity_matrix)):
    for j in range(i + 1, len(similarity_matrix)):
        label_1 = df['Highest_Topic_Label'].iloc[i]
        label_2 = df['Highest_Topic_Label'].iloc[j]
        
        # Skip if either label has already been used in a merge
        if label_1 in used_labels or label_2 in used_labels or label_1 == label_2:
            continue
        
        if similarity_matrix[i, j] > 0.75:
            # Determine which label to keep (the longer one)
            if len(label_1) >= len(label_2):
                merged_labels[label_2] = label_1
                original_pairs[label_1] = label_2
                used_labels.add(label_2)  # Mark label_2 as used
            else:
                merged_labels[label_1] = label_2
                original_pairs[label_2] = label_1
                used_labels.add(label_1)  # Mark label_1 as used

# Step 5: Replace merged labels in the dataframe and create a new column for the merged labels
df['Merged_Label'] = df['Highest_Topic_Label'].apply(lambda x: merged_labels.get(x, x))

# Step 6: Add a column to keep track of the original label pairs that were merged
df['Original_Label_Pair'] = df['Highest_Topic_Label'].apply(lambda x: f"{x} , {original_pairs.get(x, x)}")

# Step 7: Function to generate a final consolidated label using Together AI only for multiple matches
def generate_final_label(merged_label, original_labels):
    prompt = (
        f"You are an intelligent assistant skilled in generating concise and comprehensive labels. "
        f"Given the following details:\n"
        f"Merged Label: {merged_label}\n"
        f"Original Labels: {original_labels}\n"
        f"Please generate a final, concise label that best represents these combined topics. "
        f"Focus on capturing the most general and encompassing theme without delving into specifics. "
        f"Return only the final label and nothing else!"
    )
    
    stream = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        temperature=0.1,
    )

    response = ""
    for chunk in stream:
        response += chunk.choices[0].delta.content or ""

    return response.strip()

# Step 8: Generate the final label using the LLM only if there are multiple labels
def generate_or_keep_label(row):
    original_labels = row['Original_Label_Pair'].split(' , ')
    if len(set(original_labels)) > 1:
        return generate_final_label(row['Merged_Label'], row['Original_Label_Pair'])
    else:
        return row['Merged_Label']

df['Final_Label'] = df.apply(generate_or_keep_label, axis=1)

# Step 9: Merge rows with the same Final_Label
df_merged = df.groupby('Final_Label').agg({
    'Highest_Topic_Label': lambda x: ' , '.join(x),
    'Merged_Label': 'first',
    'Original_Label_Pair': lambda x: ' , '.join(x),
    'Representative_Document': lambda x: ' , '.join(x),  # Assuming this column exists in your dataframe
}).reset_index()

# Step 10: Save the merged dataframe to a new CSV file
df_merged.to_csv('final_highest_level_topic_labels_with_representatives.csv', index=False)

print("Final highest-level topic labels with merged representations saved to final_highest_level_topic_labels_with_representatives.csv")

  from tqdm.autonotebook import tqdm, trange


Final highest-level topic labels with merged representations saved to final_highest_level_topic_labels_with_representatives.csv


In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer
from together import Together

# Initialize the sentence transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize the language model client
client = Together(api_key='')

# Step 1: Load the saved CSV file
df = pd.read_csv('Hierchical_Topics_Third_Level-Topics.csv')

# Step 2: Compute embeddings for each Highest_Topic_Label
df['Embeddings'] = df['Highest_Topic_Label'].apply(lambda x: model.encode(x))

# Step 3: Compute pairwise cosine similarity among Highest_Topic_Labels
embeddings = np.vstack(df['Embeddings'].to_list())
similarity_matrix = cosine_similarity(embeddings)

# Step 4: Initialize dictionaries to keep track of merged labels and their original pairs
merged_labels = {}
original_pairs = {}
used_labels = set()  # Keep track of labels that have been used

# Identify and merge labels with similarity greater than 0.8
for i in range(len(similarity_matrix)):
    for j in range(i + 1, len(similarity_matrix)):
        label_1 = df['Highest_Topic_Label'].iloc[i]
        label_2 = df['Highest_Topic_Label'].iloc[j]
        
        # Skip if either label has already been used in a merge
        if label_1 in used_labels or label_2 in used_labels or label_1 == label_2:
            continue
        
        if similarity_matrix[i, j] > 0.75:
            # Determine which label to keep (the longer one)
            if len(label_1) >= len(label_2):
                merged_labels[label_2] = label_1
                original_pairs[label_1] = label_2
                used_labels.add(label_2)  # Mark label_2 as used
            else:
                merged_labels[label_1] = label_2
                original_pairs[label_2] = label_1
                used_labels.add(label_1)  # Mark label_1 as used

# Step 5: Replace merged labels in the dataframe and create a new column for the merged labels
df['Merged_Label'] = df['Highest_Topic_Label'].apply(lambda x: merged_labels.get(x, x))

# Step 6: Add a column to keep track of the original label pairs that were merged
df['Original_Label_Pair'] = df['Highest_Topic_Label'].apply(lambda x: f"{x} , {original_pairs.get(x, x)}")

# Step 7: Function to generate a final consolidated label using Together AI only for multiple matches
def generate_final_label(merged_label, original_labels):
    prompt = (
        f"You are an intelligent assistant skilled in generating concise and comprehensive labels. "
        f"Given the following details:\n"
        f"Merged Label: {merged_label}\n"
        f"Original Labels: {original_labels}\n"
        f"Please generate a final, concise label that best represents these combined topics. "
        f"Focus on capturing the most general and encompassing theme without delving into specifics. "
        f"Return only the final label and nothing else!"
    )
    
    stream = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        temperature=0.1,
    )

    response = ""
    for chunk in stream:
        response += chunk.choices[0].delta.content or ""

    return response.strip()

# Step 8: Generate the final label using the LLM only if there are multiple labels
def generate_or_keep_label(row):
    original_labels = row['Original_Label_Pair'].split(' , ')
    if len(set(original_labels)) > 1:
        return generate_final_label(row['Merged_Label'], row['Original_Label_Pair'])
    else:
        return row['Merged_Label']

df['Final_Label'] = df.apply(generate_or_keep_label, axis=1)

# Step 9: Merge rows with the same Final_Label
df_merged = df.groupby('Final_Label').agg({
    'Highest_Topic_Label': lambda x: ' , '.join(x),
    'Merged_Label': 'first',
    'Original_Label_Pair': lambda x: ' , '.join(x),
    'Representative_Document': lambda x: ' , '.join(x),  # Assuming this column exists in your dataframe
}).reset_index()

# Step 10: Save the merged dataframe to a new CSV file
df_merged.to_csv('Hierchical_Topic_Intepreation_Final.csv', index=False)

print("Final highest-level topic labels with merged representations saved to final_highest_level_topic_labels_with_representatives.csv")



Final highest-level topic labels with merged representations saved to final_highest_level_topic_labels_with_representatives.csv


In [4]:
import pandas as pd
df = pd.read_csv('final_highest_level_topic_labels_with_representatives.csv')
df.to_csv('Labels.csv')