In [30]:
from pathlib import Path
for file in Path('.').rglob('bertopic_35_25_9_0.3_0.csv'):
    file.rename(file.with_name('bertopic_100_35_25_9_0.3_0.csv'))

In [31]:
import os
from pathlib import Path
import pandas as pd

data_sources = ['20ng', 'agris', 'tweets_ny']

llm_model_name_list = ['mistral', 'mistral-large', 'llama', 'llama-large', 'qwen', 'qwen-large', 'gemma', 'gemma-large']

metrics = [
    "Coherence",
    "Repetitive",
    "Readability",
    "Number of Outliers",
    "Number of Same Concept Pairs",
    "Number of Non-words",
]

for data_source in data_sources:
    rows = []
    for llm_model_name in llm_model_name_list:
        directory = f"{data_source}/{llm_model_name}/number"
        if os.path.exists(directory):
            for file_name in Path(directory).glob('*.csv'):
                topic_model = file_name.name.split('_')[0]
                number_of_topic = file_name.name.split('_')[1]
                df = pd.read_csv(file_name, sep='\t', encoding='utf-8')

                row_dic = {
                    'data_source': data_source,
                    'llm_model': llm_model_name,
                    'topic_model': topic_model,
                    'number of topics': number_of_topic,
                    'file_name': file_name.name,
                }
                for metric in metrics:
                    rates = df[metric]
                    mean = rates.mean()
                    row_dic[metric] = mean
                rows.append(row_dic)
    
    df_retult = pd.DataFrame(rows)

    # Calculate the mean of each metric grouped by 'model' and 'number of topics'
    each_model_path = f"analysis_result_each_model_{data_source}.csv"
    grouped_means = df_retult.groupby(['llm_model', 'topic_model', 'number of topics'])[metrics].mean().reset_index()
    grouped_means[metrics] = grouped_means[metrics].round(4)
    grouped_means.to_csv(each_model_path, sep='\t', index=False, encoding='utf-8')
    print(f"Data saved to {each_model_path}")

Data saved to analysis_result_each_model_20ng.csv
Data saved to analysis_result_each_model_agris.csv
Data saved to analysis_result_each_model_tweets_ny.csv


In [43]:
# Analysis of Coverage Metrics
import os
import pandas as pd
from pathlib import Path

overall = []
def analyze_coverage(base_folder):
    print(f"Analyzing coverage metrics in folder: {base_folder}")
    """Analyze coverage metrics for different models and datasets."""
    for file in Path(base_folder).glob('sample*.csv'):
        print(f"Processing file: {file.name}")
        if '20ng' in file.name:
            dataset = '20ng'
        elif 'agris' in file.name:
            dataset = 'agris'
        elif 'tweets_ny' in file.name:
            dataset = 'tweets_ny'

        llm = file.parent.name
        
        # Read and clean data
        df = pd.read_csv(file, sep='\t', encoding='utf-8')
        df = df.replace(r'\n', ' ', regex=True)
        
        # Calculate aggregated metrics
        metrics = ['under_coverage', 'over_coverage']
        grouped = df.groupby(['model', 'number of topics'])[metrics].mean().round(3).reset_index()
    
        # Save final combined results
        output_path = base_folder / f'analysis_coverage_each_model_{dataset}.csv'
        grouped.to_csv(output_path, sep='\t', encoding='utf-8', index=False)
        grouped['dataset'] = dataset
        grouped['llm'] = llm
        overall.append(grouped)
        print(f"Final combined results saved to: {output_path} \n")

# Usage example
data_source = Path('coverage_results')

for path in data_source.glob('*'):
    results = analyze_coverage(path)
pd.concat(overall).to_csv('analysis_coverage_all.csv', sep='\t', encoding='utf-8', index=False)

Analyzing coverage metrics in folder: coverage_results/gemma-large
Processing file: sample_100_data_agris_202504131800_coverage_results_gemma-large.csv
Final combined results saved to: coverage_results/gemma-large/analysis_coverage_each_model_agris.csv 

Processing file: sample_100_data_tweets_ny_202504131800_coverage_results_gemma-large.csv
Final combined results saved to: coverage_results/gemma-large/analysis_coverage_each_model_tweets_ny.csv 

Processing file: sample_100_data_20ng_202504131800_coverage_results_gemma-large.csv
Final combined results saved to: coverage_results/gemma-large/analysis_coverage_each_model_20ng.csv 

Analyzing coverage metrics in folder: coverage_results/llama
Processing file: sample_100_data_tweets_ny_202504131800_coverage_results_llama.csv
Final combined results saved to: coverage_results/llama/analysis_coverage_each_model_tweets_ny.csv 

Processing file: sample_100_data_20ng_202504131800_coverage_results_llama.csv
Final combined results saved to: coverag

In [32]:
import json

datasets = ['20ng', 'agris', 'tweets_ny']
llm_model_name_list = ['mistral', 'mistral-large', 'llama', 'llama-large', 'qwen', 'qwen-large', 'gemma', 'gemma-large']
for dataset in datasets:
    for llm_model_name in llm_model_name_list:
        path = Path(f"{dataset}/{llm_model_name}/number/diversity/raw_outputs")
        if path.exists():
            for file_name in path.glob('*_raw.json'):
                hierarchy = {
                    "taxonomic": 0,
                    "aspectual": 0,
                    "containment": 0,
                }
                topic_model = file_name.name.split('_')[0]
                number_of_topic = file_name.name.split('_')[1]
                with open(file_name, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                for k,v in data.items():
                    for i, j in v["diversity"]["processed"]["hierarchy"].items():
                        if not isinstance(j, dict):
                            print(file_name, i, j)
                            continue
                        if j["is_hierarchical"]:
                            hierarchy[i] += 1
                print(f"{dataset} {llm_model_name} {topic_model} {number_of_topic} hierarchy: {hierarchy}")

20ng mistral bertopic 100 hierarchy: {'taxonomic': 267, 'aspectual': 433, 'containment': 460}
20ng mistral combinedtm 100 hierarchy: {'taxonomic': 241, 'aspectual': 449, 'containment': 443}
20ng mistral combinedtm 50 hierarchy: {'taxonomic': 77, 'aspectual': 128, 'containment': 124}
20ng mistral prodlda 50 hierarchy: {'taxonomic': 65, 'aspectual': 106, 'containment': 114}
20ng mistral prodlda 100 hierarchy: {'taxonomic': 261, 'aspectual': 409, 'containment': 413}
20ng mistral lda 50 hierarchy: {'taxonomic': 55, 'aspectual': 120, 'containment': 109}
20ng mistral bertopic 50 hierarchy: {'taxonomic': 75, 'aspectual': 116, 'containment': 113}
20ng mistral lda 100 hierarchy: {'taxonomic': 290, 'aspectual': 501, 'containment': 504}
20ng mistral-large bertopic 100 hierarchy: {'taxonomic': 1254, 'aspectual': 1518, 'containment': 1088}
20ng mistral-large combinedtm 100 hierarchy: {'taxonomic': 1079, 'aspectual': 1339, 'containment': 1019}
20ng mistral-large combinedtm 50 hierarchy: {'taxonomic'

In [59]:
# clustering_complementarity

for file in Path('.').glob('*/*/clustering_complementarity/*.json'):
    d = json.load(open(file, 'r', encoding='utf-8'))
    semantic_clustering = d['semantic_clustering']['raw']['overall_output']
    semantic_clustering = json.loads(semantic_clustering.split('```')[0])
    complementarity = d['complementarity']['raw']['overall_output']
    complementarity = json.loads(complementarity.split('```')[0])
    print(f"File: {file}")
    print(f"Semantic Clustering: {semantic_clustering}")
    print(f"Complementarity: {complementarity}")

File: tweets_ny/gemma-large/clustering_complementarity/bertopic_50_25_20_20_0.3_0_raw.json
Semantic Clustering: [{'cluster_label': 'Fitness & Wellness', 'topic_ids': [0, 5, 10, 34, 37, 40, 41, 47, 48], 'justification': 'These topics all relate to physical and mental well-being. Topics cover exercise (5, 34), healthy eating (10, 47), motivation (5, 40, 48), stress management (37, 41) and general lifestyle (0). They are all part of the broader concept of self-improvement and a healthy lifestyle.'}, {'cluster_label': 'Social Media & Online Life', 'topic_ids': [3, 7, 15, 25, 33], 'justification': 'These topics are heavily focused on online platforms, social interaction, and digital self-presentation. They cover social media usage (3, 7), selfies and online identity (15), dating (25) and following/engagement (33).'}, {'cluster_label': 'Food & Drink', 'topic_ids': [2, 4, 10], 'justification': 'These topics all center around food and beverages. Topic 2 focuses on drinks, Topic 4 on general fo

JSONDecodeError: Expecting ',' delimiter: line 223 column 119 (char 15613)

In [60]:
complementarity

'[\n  {\n    "topic_pair": [1, 2],\n    "shared_focus": "Online culture and social media",\n    "complementary_aspects": {\n      "Topic 1": "Focuses on internet slang, memes, and online trends (e.g., \'dot\', \'dick\', \'chillin\')",\n      "Topic 2": "Focuses on food-related content and restaurant experiences (e.g., \'food\', \'chipotle\', \'restaurant\')"\n    },\n    "justification": "Both topics relate to online culture, but one is centered around internet language and trends, while the other explores food and dining experiences – they represent different facets of online social life."\n  },\n  {\n    "topic_pair": [3, 4],\n    "shared_focus": "Weight loss and body image",\n    "complementary_aspects": {\n      "Topic 3": "Deals with the desire to lose weight and the associated clichés and restrictive diets (e.g., \'loose\', \'diet\', \'weightloss\')",\n      "Topic 4": "Focuses on social media trends, self-presentation, and the pressure to appear attractive (e.g., \'twitter\', \'

In [66]:
for file in Path('.').glob('*/*/number/diversity/*.txt'):
    dataset = file.parent.parent.parent.parent.name
    llm_model_name = file.parent.parent.parent.name
    topic_model = file.name.split('_')[0]
    number_of_topic = file.name.split('_')[1]
    with open(file, 'r', encoding='utf-8') as f:
        data = f.read()
    print(f"{dataset} {llm_model_name} {topic_model} {number_of_topic} {data}")
    

tweets_ny gemma-large bertopic 25 2.9856711915535445
tweets_ny gemma-large prodlda 100 2.94
tweets_ny gemma-large lda 100 2.957979797979798
tweets_ny gemma-large prodlda 50 2.92
tweets_ny gemma-large combinedtm 50 2.820408163265306
tweets_ny gemma-large combinedtm 100 2.8737373737373737
tweets_ny gemma-large bertopic 35 2.98970398970399
tweets_ny gemma-large lda 50 2.9404081632653063
tweets_ny llama bertopic 25 2.823529411764706
tweets_ny llama prodlda 100 2.728686868686869
tweets_ny llama lda 100 2.756767676767677
tweets_ny llama prodlda 50 2.7583673469387757
tweets_ny llama combinedtm 50 2.737142857142857
tweets_ny llama combinedtm 100 2.751919191919192
tweets_ny llama bertopic 35 2.8278635778635777
tweets_ny llama lda 50 2.726530612244898
tweets_ny mistral bertopic 25 2.622926093514329
tweets_ny mistral prodlda 100 2.1995959595959595
tweets_ny mistral lda 100 2.2276767676767677
tweets_ny mistral prodlda 50 2.1444897959183673
tweets_ny mistral combinedtm 50 2.210612244897959
tweets_n