# Premises > Premises > Conclusions


In [1]:
import os
import xml.etree.ElementTree as ET
import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout

def sanitize_id(node_id):
    """Ensure node IDs are safe for Graphviz by replacing problematic characters"""
    return node_id.replace('%', 'pct_').replace('|', '_or_')

def visualize_argument_graph(xml_path, output_dir):
    # Parse XML file
    tree = ET.parse(xml_path)
    root = tree.getroot()
    
    # Create directed graph
    G = nx.DiGraph()
    id_mapping = {}  # Map original IDs to sanitized versions
    
    # Add nodes with sanitized IDs
    for elem in root.findall('.//prem') + root.findall('.//conc'):
        orig_id = elem.attrib['ID']
        safe_id = sanitize_id(orig_id)
        id_mapping[orig_id] = safe_id
        arg_type = 'premise' if elem.tag == 'prem' else 'conclusion'
        G.add_node(safe_id, type=arg_type, orig_id=orig_id)
    
    # Add relationships using sanitized IDs
    for elem in root.findall('.//prem') + root.findall('.//conc'):
        source_id = id_mapping[elem.attrib['ID']]
        
        # Process support relationships
        if 'SUP' in elem.attrib:
            for target in elem.attrib['SUP'].split('|'):
                if target in id_mapping:  # Ensure target exists
                    G.add_edge(source_id, id_mapping[target], 
                              relationship='support', color='green')
        
        # Process attack relationships
        if 'ATT' in elem.attrib:
            for target in elem.attrib['ATT'].split('|'):
                if target in id_mapping:
                    G.add_edge(source_id, id_mapping[target], 
                              relationship='attack', color='red')
    
    # Create hierarchical layout
    try:
        pos = graphviz_layout(G, prog='dot')
    except TypeError:
        # Fallback to spring layout if Graphviz fails
        pos = nx.spring_layout(G, seed=42)
    
    # Prepare visualization
    node_colors = ['lightblue' if G.nodes[n]['type'] == 'premise' else 'lightgreen' 
                   for n in G.nodes]
    edge_colors = [G.edges[e]['color'] for e in G.edges]
    labels = {n: G.nodes[n]['orig_id'] for n in G.nodes}  # Use original IDs for labels
    
    # Draw graph
    plt.figure(figsize=(15, 10))
    nx.draw(G, pos, labels=labels, with_labels=True, node_size=1500, 
            node_color=node_colors, edge_color=edge_colors, 
            font_size=9, arrowsize=20)
    
    # Create legend
    legend_elements = [
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='lightblue', 
                   markersize=10, label='Premise'),
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='lightgreen', 
                   markersize=10, label='Conclusion'),
        plt.Line2D([0], [0], color='green', lw=2, label='Support Relationship'),
        plt.Line2D([0], [0], color='red', lw=2, label='Attack Relationship')
    ]
    plt.legend(handles=legend_elements, loc='upper right')
    
    # Save output
    filename = os.path.basename(xml_path).replace('.xml', '_graph.png')
    plt.savefig(os.path.join(output_dir, filename), bbox_inches='tight')
    plt.close()

# Process all XML files
input_dir = 'xml_files/all_xml'
output_dir = 'graphs2'

for xml_file in os.listdir(input_dir):
    if xml_file.endswith('.xml'):
        visualize_argument_graph(
            os.path.join(input_dir, xml_file),
            output_dir
        )


ImportError: requires pygraphviz http://pygraphviz.github.io/

# <conc ID="A35" SUP="A5|A7|A32"> means A5, A7 and A32 support A35 (opposite of what we have been doing till now)

```mermaid
graph TD
    P1 --> C1
    P2 --> C1
    P3 --> C1
    A1 -->|ATT| P2


In [2]:
import os
import xml.etree.ElementTree as ET
import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout

def sanitize_id(node_id):
    """Ensure node IDs are safe for Graphviz by replacing problematic characters"""
    return node_id.replace('%', 'pct_').replace('|', '_or_')

def visualize_argument_graph(xml_path, output_dir):
    # Parse XML file
    tree = ET.parse(xml_path)
    root = tree.getroot()
    
    # Create directed graph
    G = nx.DiGraph()
    id_mapping = {}  # Map original IDs to sanitized versions
    
    # Add nodes with sanitized IDs
    for elem in root.findall('.//prem') + root.findall('.//conc'):
        orig_id = elem.attrib['ID']
        safe_id = sanitize_id(orig_id)
        id_mapping[orig_id] = safe_id
        arg_type = 'premise' if elem.tag == 'prem' else 'conclusion'
        G.add_node(safe_id, type=arg_type, orig_id=orig_id)
    
    # Add relationships with REVERSED direction
    for elem in root.findall('.//prem') + root.findall('.//conc'):
        target_id = id_mapping[elem.attrib['ID']]  # This node is the TARGET of relationships
        
        # Process support relationships: sources support this target
        if 'SUP' in elem.attrib:
            for source in elem.attrib['SUP'].split('|'):
                if source in id_mapping:  # Ensure source exists
                    G.add_edge(id_mapping[source], target_id, 
                              relationship='support', color='green')
        
        # Process attack relationships: sources attack this target
        if 'ATT' in elem.attrib:
            for source in elem.attrib['ATT'].split('|'):
                if source in id_mapping:
                    G.add_edge(id_mapping[source], target_id, 
                              relationship='attack', color='red')
    
    # Create hierarchical layout
    try:
        pos = graphviz_layout(G, prog='dot')
    except TypeError:
        # Fallback to spring layout if Graphviz fails
        pos = nx.spring_layout(G, seed=42)
    
    # Prepare visualization
    node_colors = ['lightblue' if G.nodes[n]['type'] == 'premise' else 'lightgreen' 
                   for n in G.nodes]
    edge_colors = [G.edges[e]['color'] for e in G.edges]
    labels = {n: G.nodes[n]['orig_id'] for n in G.nodes}  # Use original IDs for labels
    
    # Draw graph
    plt.figure(figsize=(15, 10))
    nx.draw(G, pos, labels=labels, with_labels=True, node_size=1500, 
            node_color=node_colors, edge_color=edge_colors, 
            font_size=9, arrowsize=20)
    
    # Create legend
    legend_elements = [
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='lightblue', 
                   markersize=10, label='Premise'),
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='lightgreen', 
                   markersize=10, label='Conclusion'),
        plt.Line2D([0], [0], color='green', lw=2, label='Support Relationship'),
        plt.Line2D([0], [0], color='red', lw=2, label='Attack Relationship')
    ]
    plt.legend(handles=legend_elements, loc='upper right')
    
    # Save output
    filename = os.path.basename(xml_path).replace('.xml', '_graph.png')
    plt.savefig(os.path.join(output_dir, filename), bbox_inches='tight')
    plt.close()

# Process all XML files
input_dir = 'xml_files/all_xml'
output_dir = 'graphs2_opposite_relations'

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

for xml_file in os.listdir(input_dir):
    if xml_file.endswith('.xml'):
        visualize_argument_graph(
            os.path.join(input_dir, xml_file),
            output_dir
        )


# lexical analysis

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from collections import Counter
import seaborn as sns
import re
import string  # Added for punctuation handling

# Download ALL required NLTK resources
print("Downloading NLTK resources...")
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
print("NLTK downloads completed!")

# Enhanced stop words with legal terms
stop_words = set(stopwords.words('english') + [
    'court', 'case', 'judgment', 'commission', 'appeal', 
    'article', 'section', 'plaintiff', 'defendant'
])

# Define paths
input_dir = 'P vs C vs NA csv/all'
output_dir = 'lexical analysis results'

# Create output directory
os.makedirs(output_dir, exist_ok=True)

# Load all CSV files
all_files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) 
             if f.endswith('.csv') and os.path.isfile(os.path.join(input_dir, f))]
full_df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

# Add word count column
full_df['word_count'] = full_df['text'].apply(lambda x: len(str(x).split()))

# Class mapping
class_names = {0: 'Non-Argumentative', 1: 'Premise', 2: 'Conclusion'}

Downloading NLTK resources...
NLTK downloads completed!


In [None]:



# Word normalization function
def normalize_text(text):
    """Remove punctuation and lowercase text while preserving word boundaries"""
    # Remove punctuation except intra-word hyphens and apostrophes
    text = re.sub(r'[^\w\s\'-]', '', text)
    return text.lower()

def normalize_word(word):
    """Handle edge cases in word normalization"""
    # Remove leading/trailing punctuation
    word = word.strip(string.punctuation)
    # Handle legal-specific cases
    if word.endswith("'s"):
        word = word[:-2]  # Remove possessive
    return word.lower()

# 1. Word Frequency Analysis with Enhanced Normalization
def generate_word_analysis(df):
    # Word clouds by class
    fig, axs = plt.subplots(1, 3, figsize=(24, 8))
    for i, (class_id, class_name) in enumerate(class_names.items()):
        # Normalize entire text before word cloud generation
        raw_text = ' '.join(df[df['class'] == class_id]['text'].astype(str))
        normalized_text = normalize_text(raw_text)
        
        if normalized_text.strip():
            wordcloud = WordCloud(
                width=800, 
                height=600, 
                background_color='white',
                stopwords=stop_words,
                collocations=True,  # Group multi-word phrases
                normalize_plurals=True,  # Treat plurals as singular
                max_words=200
            ).generate(normalized_text)
            
            axs[i].imshow(wordcloud)
            axs[i].set_title(f'{class_name} Word Cloud', fontsize=16)
            axs[i].axis('off')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'wordclouds.png'), dpi=300, bbox_inches='tight')
    plt.close()
    
    # Top words by class with improved normalization
    plt.figure(figsize=(15, 10))
    for i, (class_id, class_name) in enumerate(class_names.items()):
        raw_text = ' '.join(df[df['class'] == class_id]['text'].astype(str))
        words = normalize_text(raw_text).split()
        
        # Apply word-level normalization and filtering
        filtered = [
            normalize_word(w) 
            for w in words 
            if (normalize_word(w) not in stop_words) and (len(normalize_word(w)) > 3)
        ]
        
        top_words = Counter(filtered).most_common(15)
        
        plt.subplot(2, 2, i+1)
        if top_words:
            ax = sns.barplot(
                x=[count for word, count in top_words], 
                y=[word for word, count in top_words]
            )
            plt.title(f'Top 15 Words: {class_name}', fontsize=14)
            plt.xlabel('Frequency')
            
            # Add count labels to bars
            max_count = max([count for _, count in top_words])
            for p in ax.patches:
                width = p.get_width()
                plt.text(
                    width + max_count * 0.02, 
                    p.get_y() + p.get_height()/2, 
                    f'{int(width)}', 
                    ha='center', 
                    va='center'
                )
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'top_words.png'), dpi=300, bbox_inches='tight')
    plt.close()

# 3. Simplified Syntactic Analysis
def analyze_syntax_simplified(df):
    # Simple word length and sentence analysis
    stats_data = []
    for class_id, class_name in class_names.items():
        class_df = df[df['class'] == class_id]
        stats_data.append({
            'class': class_name,
            'avg_word_length': class_df['text'].apply(lambda x: 
                sum(len(word) for word in str(x).split()) / max(len(str(x).split()), 1)).mean(),
            'avg_sentence_length': class_df['word_count'].mean(),
            'total_words': class_df['word_count'].sum()
        })
    
    stats_df = pd.DataFrame(stats_data)
    
    # Plot statistics
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    stats_to_plot = ['avg_word_length', 'avg_sentence_length', 'total_words']
    titles = ['Average Word Length', 'Average Words per Text', 'Total Word Count']
    
    for i, stat in enumerate(stats_to_plot):
        ax = sns.barplot(x='class', y=stat, data=stats_df, ax=axes[i])
        axes[i].set_title(titles[i] + ' by Class')
        axes[i].set_ylabel(titles[i])
        
        # Add value labels to bars
        for p in ax.patches:
            ax.annotate(f'{p.get_height():.1f}' if stat != 'total_words' else f'{int(p.get_height())}', 
                       (p.get_x() + p.get_width() / 2., p.get_height()),
                       ha='center', va='center', 
                       xytext=(0, 5), 
                       textcoords='offset points')
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'text_statistics.png'), dpi=300, bbox_inches='tight')
    plt.close()


# 5. Class Distribution Analysis
def analyze_class_distribution(df):
    # Class distribution
    plt.figure(figsize=(12, 8))
    
    # Count plot
    plt.subplot(2, 2, 1)
    class_counts = df['class'].value_counts().sort_index()
    bars = plt.bar([class_names[i] for i in class_counts.index], class_counts.values)
    plt.title('Distribution of Text Classes')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    
    # Add count labels to bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height,
                f'{int(height)}', 
                ha='center', va='bottom')
    
    # ... (word count histogram remains the same)
    plt.subplot(2, 2, 2)
    for class_id in [0, 1, 2]:
        data = df[df['class'] == class_id]['word_count']
        plt.hist(data, alpha=0.7, label=class_names[class_id], bins=30)
    plt.title('Word Count Distribution by Class')
    plt.xlabel('Word Count')
    plt.ylabel('Frequency')
    plt.legend()

    
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'class_distribution.png'), dpi=300, bbox_inches='tight')
    plt.close()


# Execute all analyses
print("Starting lexical analysis...")
print(f"Total texts: {len(full_df)}")
print(f"Class distribution: {full_df['class'].value_counts().sort_index().to_dict()}")

generate_word_analysis(full_df)
print("Word analysis completed")

analyze_syntax_simplified(full_df)
print("Simplified syntax analysis completed")

analyze_class_distribution(full_df)
print("Class distribution analysis completed")

Downloading NLTK resources...
NLTK downloads completed!
Starting lexical analysis...
Total texts: 7465
Class distribution: {0: 4907, 1: 2399, 2: 159}
Word analysis completed
Simplified syntax analysis completed
Class distribution analysis completed


In [3]:
# ... (previous imports remain)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob
import textstat  # For readability metrics

In [None]:


# ... (previous setup code remains)


# 2. N-gram Analysis
def analyze_ngrams(df):
    # Configure n-gram analyzer
    plt.figure(figsize=(15, 10))
    for i, (class_id, class_name) in enumerate(class_names.items()):
        texts = ' '.join(df[df['class'] == class_id]['text'].astype(str))
        words = normalize_text(texts).split()
        
        # Generate bigrams
        bigrams = list(nltk.ngrams(words, 2))
        bigram_freq = Counter([' '.join(bg) for bg in bigrams]).most_common(10)
        
        plt.subplot(2, 2, i+1)
        if bigram_freq:
            bg, counts = zip(*bigram_freq)
            ax = sns.barplot(x=list(counts), y=list(bg))
            plt.title(f'Top 10 Bigrams: {class_name}', fontsize=14)
            plt.xlabel('Frequency')
            
            # Add labels
            for p in ax.patches:
                plt.text(p.get_width() + max(counts)*0.02,
                         p.get_y() + p.get_height()/2,
                         f'{int(p.get_width())}',
                         va='center')
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'bigrams.png'), dpi=300, bbox_inches='tight')
    plt.close()
    print("N-gram analysis completed")





# 5. Keyness Analysis
def analyze_keyness(df):
    from scipy.stats import chi2_contingency
    import math
    
    # Get all words
    all_words = []
    for class_id in [0, 1, 2]:
        texts = ' '.join(df[df['class'] == class_id]['text'].astype(str))
        words = [normalize_word(w) for w in normalize_text(texts).split()]
        all_words.append(words)
    
    # Get vocabulary
    all_vocab = set(word for words in all_words for word in words)
    
    # Calculate keyness (Log Ratio)
    keyness_results = {class_id: [] for class_id in [0, 1, 2]}
    
    for word in all_vocab:
        if word in stop_words or len(word) < 4:
            continue
            
        freqs = [words.count(word) for words in all_words]
        total_freq = sum(freqs)
        
        if total_freq < 10:  # Skip rare words
            continue
            
        for class_id in [0, 1, 2]:
            a = freqs[class_id]  # Frequency in target class
            b = sum(freqs) - a   # Frequency in other classes
            c = len(all_words[class_id]) - a  # Non-word in target
            d = sum(len(words) for words in all_words) - len(all_words[class_id]) - b  # Non-word in others
            
            # Calculate log ratio
            p1 = (a + 1) / (len(all_words[class_id]) + 1)
            p2 = (b + 1) / (sum(len(words) for words in all_words) - len(all_words[class_id]) + 1)
            log_ratio = math.log2(p1 / p2)
            
            keyness_results[class_id].append((word, log_ratio))
    
    # Get top key words for each class
    plt.figure(figsize=(15, 10))
    for i, class_id in enumerate([0, 1, 2]):
        class_name = class_names[class_id]
        top_key = sorted(keyness_results[class_id], key=lambda x: x[1], reverse=True)[:10]
        
        plt.subplot(2, 2, i+1)
        if top_key:
            words, scores = zip(*top_key)
            ax = sns.barplot(x=list(scores), y=list(words))
            plt.title(f'Top Key Words: {class_name}', fontsize=14)
            plt.xlabel('Log Ratio')
            
            # Add labels
            for p in ax.patches:
                plt.text(p.get_width() + 0.1,
                         p.get_y() + p.get_height()/2,
                         f'{p.get_width():.2f}',
                         va='center')
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'keyness.png'), dpi=300, bbox_inches='tight')
    plt.close()
    print("Keyness analysis completed")

# Main execution
print("Starting enhanced lexical analysis...")

# Run all analyses
analyze_ngrams(full_df)
analyze_keyness(full_df)


print(f"All results saved to: {output_dir}")


Starting enhanced lexical analysis...
Topic modeling completed
N-gram analysis completed
Sentiment analysis completed
Readability analysis completed
Keyness analysis completed
All results saved to: lexical analysis results


In [4]:
import numpy as np
def analyze_sentiment(df):
    sentiment_data = []
    for class_id, class_name in class_names.items():
        class_df = df[df['class'] == class_id]
        for text in class_df['text'].astype(str):
            blob = TextBlob(text)
            sentiment_data.append({
                'class': class_name,
                'polarity': blob.sentiment.polarity,
                'subjectivity': blob.sentiment.subjectivity
            })
    
    sentiment_df = pd.DataFrame(sentiment_data)
    
    # Plot distributions
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    metrics = ['polarity', 'subjectivity']
    titles = ['Sentiment Polarity by Class', 'Subjectivity by Class']
    
    for i, metric in enumerate(metrics):
        sns.boxplot(x='class', y=metric, data=sentiment_df, ax=axes[i])
        axes[i].set_title(titles[i])
        
        # Calculate min, max, median for each class and add labels
        for cls in sentiment_df['class'].unique():
            vals = sentiment_df[sentiment_df['class'] == cls][metric]
            min_val = np.min(vals)
            max_val = np.max(vals)
            median_val = np.median(vals)
            
            # Find x position for the class
            x_pos = list(sentiment_df['class'].unique()).index(cls)
            
            # Add text labels
            axes[i].text(x_pos, min_val, f'Min: {min_val:.2f}', 
                        ha='center', va='bottom', fontsize=8, color='blue')
            axes[i].text(x_pos, max_val, f'Max: {max_val:.2f}', 
                        ha='center', va='top', fontsize=8, color='red')
            axes[i].text(x_pos, median_val, f'Med: {median_val:.2f}', 
                        ha='center', va='center', fontsize=8, 
                        color='green', bbox=dict(boxstyle="round,pad=0.3", 
                        facecolor='white', alpha=0.8))
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment.png'), dpi=300, bbox_inches='tight')
    plt.close()
    print("Sentiment analysis completed")

    
    
analyze_sentiment(full_df)


Sentiment analysis completed


In [5]:
import numpy as np
def analyze_readability(df):
    readability_data = []
    for class_id, class_name in class_names.items():
        class_df = df[df['class'] == class_id]
        for text in class_df['text'].astype(str):
            readability_data.append({
                'class': class_name,
                'flesch': textstat.flesch_reading_ease(text),
                'gunning_fog': textstat.gunning_fog(text),
                'smog': textstat.smog_index(text)
            })
    
    readability_df = pd.DataFrame(readability_data)
    
    # Plot distributions
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    metrics = ['flesch', 'gunning_fog', 'smog']
    titles = ['Flesch Reading Ease', 'Gunning Fog Index', 'SMOG Index']
    
    for i, metric in enumerate(metrics):
        sns.boxplot(x='class', y=metric, data=readability_df, ax=axes[i])
        axes[i].set_title(titles[i])
        
        # Calculate min, max, median for each class and add labels
        for cls in readability_df['class'].unique():
            vals = readability_df[readability_df['class'] == cls][metric]
            min_val = np.min(vals)
            max_val = np.max(vals)
            median_val = np.median(vals)
            
            # Find x position for the class
            x_pos = list(readability_df['class'].unique()).index(cls)
            
            # Add text labels
            axes[i].text(x_pos, min_val, f'Min: {min_val:.1f}', 
                        ha='center', va='bottom', fontsize=8, color='blue')
            axes[i].text(x_pos, max_val, f'Max: {max_val:.1f}', 
                        ha='center', va='top', fontsize=8, color='red')
            axes[i].text(x_pos, median_val, f'Med: {median_val:.1f}', 
                        ha='center', va='center', fontsize=8, 
                        color='green', bbox=dict(boxstyle="round,pad=0.3", 
                        facecolor='white', alpha=0.8))
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'readability.png'), dpi=300, bbox_inches='tight')
    plt.close()
    print("Readability analysis completed")


analyze_readability(full_df)


Readability analysis completed


# topic modeling

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.pipeline import make_pipeline
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import numpy as np
import os

MODELS = {
    "LegalBERT": "nlpaueb/legal-bert-base-uncased",
    "InCaseLawBERT": "law-ai/InCaseLawBERT",
    "RoBERTa": "roberta-base",
    "BERT": "bert-base-uncased"
}

class StopwordVectorizer(CountVectorizer):
    def __init__(self, stop_words=None, **kwargs):
        super().__init__(stop_words=stop_words, **kwargs)
    def fit(self, X, y=None):
        if self.stop_words is not None:
            X = [' '.join([word for word in doc.split() if word not in self.stop_words]) for doc in X]
        return super().fit(X, y)

def run_topic_modeling(df, output_dir, stop_words):
    os.makedirs(output_dir, exist_ok=True)
    class_names = {0: 'Non-Argumentative', 1: 'Premise', 2: 'Conclusion'}
    
    # Define number of topics per class
    topics_per_class = {
        0: 4,  # Non-Argumentative: 4 topics
        1: 3,  # Premise: 3 topics
        2: 5   # Conclusion: 5 topics
    }
    
    for class_id, class_name in class_names.items():
        n_topics = topics_per_class[class_id]  # Get specific number of topics for this class
        texts = df[df['class'] == class_id]['text'].astype(str).tolist()
        
        if len(texts) < n_topics:  # Ensure at least as many docs as topics
            print(f"Skipping {class_name}: too few documents ({len(texts)}) for {n_topics} topics")
            continue
            
        class_output_dir = os.path.join(output_dir, class_name.replace(' ', '_'))
        os.makedirs(class_output_dir, exist_ok=True)
        
        print(f"Processing {class_name} with {n_topics} topics...")
        
        # LDA
        vectorizer = CountVectorizer(max_features=5000, stop_words=list(stop_words), ngram_range=(1, 2))
        dtm = vectorizer.fit_transform(texts)
        lda = LatentDirichletAllocation(n_components=n_topics, learning_method='online', random_state=42)
        lda.fit(dtm)
        with open(os.path.join(class_output_dir, 'LDA_topics.txt'), 'w') as f:
            f.write(f"LDA Topics for {class_name} ({n_topics} topics):\n")
            for idx, topic in enumerate(lda.components_):
                top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-11:-1]]
                f.write(f"Topic {idx+1}: {', '.join(top_words)}\n")
        
        # LSA
        lsa_vectorizer = TfidfVectorizer(max_features=5000, stop_words=list(stop_words), ngram_range=(1, 2))
        lsa_pipeline = make_pipeline(lsa_vectorizer, TruncatedSVD(n_components=n_topics, random_state=42))
        lsa_pipeline.fit(texts)
        with open(os.path.join(class_output_dir, 'LSA_topics.txt'), 'w') as f:
            f.write(f"LSA Topics for {class_name} ({n_topics} topics):\n")
            for idx, component in enumerate(lsa_pipeline[-1].components_):
                top_words = [lsa_vectorizer.get_feature_names_out()[i] for i in component.argsort()[:-11:-1]]
                f.write(f"Topic {idx+1}: {', '.join(top_words)}\n")
        
        # BERTopic with Legal-Specific Models
        for model_name, model_path in MODELS.items():
            try:
                embedding_model = SentenceTransformer(model_path)
                bert_vectorizer = StopwordVectorizer(stop_words=list(stop_words), ngram_range=(1, 2), max_features=5000)
                topic_model = BERTopic(
                    embedding_model=embedding_model,
                    language='english',
                    top_n_words=15,
                    nr_topics=n_topics,
                    vectorizer_model=bert_vectorizer,
                    calculate_probabilities=True,
                    verbose=False
                )
                topics, _ = topic_model.fit_transform(texts)
                filtered_topics = {}
                for topic_id, topic_words in topic_model.get_topics().items():
                    filtered_words = [word for word, _ in topic_words if word not in stop_words][:10]
                    filtered_topics[topic_id] = filtered_words
                with open(os.path.join(class_output_dir, f'{model_name}_topics.txt'), 'w') as f:
                    f.write(f"{model_name} Topics for {class_name} ({n_topics} topics):\n")
                    for topic_id, words in filtered_topics.items():
                        if topic_id != -1:
                            f.write(f"Topic {topic_id+1}: {', '.join(words)}\n")
                fig = topic_model.visualize_topics()
                fig.write_html(os.path.join(class_output_dir, f'{model_name}_topics.html'))
            except Exception as e:
                print(f"Error with {model_name} for {class_name}: {str(e)}")
    
    print("Advanced topic modeling with stopword removal completed")
run_topic_modeling(df=full_df, output_dir='lexical_analysis_results/topic_modeling3', stop_words=stop_words)

Processing Non-Argumentative with 4 topics...


No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.

k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.



Error with LegalBERT for Non-Argumentative: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.


No sentence-transformers model found with name law-ai/InCaseLawBERT. Creating a new one with mean pooling.

k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.



Error with InCaseLawBERT for Non-Argumentative: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.


No sentence-transformers model found with name roberta-base. Creating a new one with mean pooling.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.



Error with RoBERTa for Non-Argumentative: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.


No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.

k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.



Error with BERT for Non-Argumentative: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.
Processing Premise with 3 topics...


No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.


Error with LegalBERT for Premise: zero-size array to reduction operation maximum which has no identity


No sentence-transformers model found with name law-ai/InCaseLawBERT. Creating a new one with mean pooling.

k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.



Error with InCaseLawBERT for Premise: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.


No sentence-transformers model found with name roberta-base. Creating a new one with mean pooling.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Error with RoBERTa for Premise: zero-size array to reduction operation maximum which has no identity


No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


Error with BERT for Premise: zero-size array to reduction operation maximum which has no identity
Processing Conclusion with 5 topics...


No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.
No sentence-transformers model found with name law-ai/InCaseLawBERT. Creating a new one with mean pooling.


Error with InCaseLawBERT for Conclusion: zero-size array to reduction operation maximum which has no identity


No sentence-transformers model found with name roberta-base. Creating a new one with mean pooling.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


Error with BERT for Conclusion: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.
Advanced topic modeling with stopword removal completed



k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.



In [13]:
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel, LdaModel
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

# Download required NLTK data
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

def optimize_lda_topics(df, output_dir, min_topics=2, max_topics=15):
    """
    Find optimal number of topics using perplexity and coherence scores
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Process each class separately
    class_names = {0: 'Non-Argumentative', 1: 'Premise', 2: 'Conclusion'}
    
    for class_id, class_name in class_names.items():
        print(f"\nOptimizing topics for {class_name}...")
        
        # Get texts for this class
        texts = df[df['class'] == class_id]['text'].astype(str).tolist()
        
        if len(texts) < max_topics:
            print(f"Warning: Only {len(texts)} documents for {class_name}. Reducing max_topics to {len(texts)-1}")
            max_topics = min(max_topics, len(texts)-1)
        
        # Preprocess texts
        def preprocess_text(text):
            # Simple preprocessing
            tokens = simple_preprocess(text, deacc=True)
            # Remove stopwords and short words
            return [token for token in tokens if token not in stop_words and len(token) > 3]
        
        processed_texts = [preprocess_text(text) for text in texts]
        processed_texts = [text for text in processed_texts if len(text) > 0]  # Remove empty
        
        # Create dictionary and corpus
        dictionary = corpora.Dictionary(processed_texts)
        dictionary.filter_extremes(no_below=2, no_above=0.8)  # Filter rare/common words
        corpus = [dictionary.doc2bow(text) for text in processed_texts]
        
        # Test different numbers of topics
        topic_range = range(min_topics, max_topics + 1)
        perplexity_scores = []
        coherence_scores = []
        models = []
        
        for num_topics in topic_range:
            print(f"  Testing {num_topics} topics...")
            
            # Train LDA model
            lda_model = LdaModel(
                corpus=corpus,
                id2word=dictionary,
                num_topics=num_topics,
                random_state=42,
                update_every=1,
                chunksize=100,
                passes=10,
                alpha='auto',
                per_word_topics=True
            )
            
            # Calculate perplexity
            perplexity = lda_model.log_perplexity(corpus)
            perplexity_scores.append(perplexity)
            
            # Calculate coherence
            coherence_model = CoherenceModel(
                model=lda_model,
                texts=processed_texts,
                dictionary=dictionary,
                coherence='c_v'
            )
            coherence = coherence_model.get_coherence()
            coherence_scores.append(coherence)
            
            models.append(lda_model)
            
            print(f"    Topics: {num_topics}, Perplexity: {perplexity:.3f}, Coherence: {coherence:.3f}")
        
        # Find optimal number of topics
        # Method 1: Highest coherence
        best_coherence_idx = np.argmax(coherence_scores)
        optimal_topics_coherence = topic_range[best_coherence_idx]
        
        # Method 2: Best perplexity-coherence trade-off (normalize and combine)
        norm_perplexity = [(p - min(perplexity_scores)) / (max(perplexity_scores) - min(perplexity_scores)) 
                          for p in perplexity_scores]
        norm_coherence = [(c - min(coherence_scores)) / (max(coherence_scores) - min(coherence_scores)) 
                         for c in coherence_scores]
        combined_scores = [nc - np for nc, np in zip(norm_coherence, norm_perplexity)]
        best_combined_idx = np.argmax(combined_scores)
        optimal_topics_combined = topic_range[best_combined_idx]
        
        # Plot results
        plt.figure(figsize=(12, 6))
        
        plt.subplot(1, 2, 1)
        plt.plot(topic_range, perplexity_scores, 'bo-', label='Perplexity')
        plt.axvline(x=optimal_topics_combined, color='r', linestyle='--', alpha=0.7)
        plt.xlabel('Number of Topics')
        plt.ylabel('Perplexity (lower is better)')
        plt.title(f'Perplexity vs Topics - {class_name}')
        plt.legend()
        plt.grid(True, alpha=0.3)
        
        plt.subplot(1, 2, 2)
        plt.plot(topic_range, coherence_scores, 'go-', label='Coherence')
        plt.axvline(x=optimal_topics_coherence, color='r', linestyle='--', alpha=0.7)
        plt.xlabel('Number of Topics')
        plt.ylabel('Coherence (higher is better)')
        plt.title(f'Coherence vs Topics - {class_name}')
        plt.legend()
        plt.grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f'{class_name.replace(" ", "_")}_topic_optimization.png'), 
                   dpi=300, bbox_inches='tight')
        plt.close()
        
        # Save results
        results_df = pd.DataFrame({
            'num_topics': topic_range,
            'perplexity': perplexity_scores,
            'coherence': coherence_scores,
            'combined_score': combined_scores
        })
        results_df.to_csv(os.path.join(output_dir, f'{class_name.replace(" ", "_")}_optimization_results.csv'), 
                         index=False)
        
        # Save optimal model topics
        optimal_model = models[best_coherence_idx]
        with open(os.path.join(output_dir, f'{class_name.replace(" ", "_")}_optimal_topics.txt'), 'w') as f:
            f.write(f"Optimal Number of Topics for {class_name}: {optimal_topics_coherence}\n")
            f.write(f"Best Coherence Score: {coherence_scores[best_coherence_idx]:.4f}\n")
            f.write(f"Corresponding Perplexity: {perplexity_scores[best_coherence_idx]:.4f}\n\n")
            f.write("Topics:\n")
            for idx, topic in enumerate(optimal_model.print_topics()):
                f.write(f"Topic {idx+1}: {topic[1]}\n")
        
        print(f"  Optimal topics for {class_name}: {optimal_topics_coherence} (coherence: {coherence_scores[best_coherence_idx]:.3f})")

def run_optimized_topic_modeling(df, output_dir, optimal_topics_dict):
    """
    Run topic modeling with optimized number of topics for each class
    """
    class_names = {0: 'Non_Argumentative', 1: 'Premise', 2: 'Conclusion'}
    
    for class_id, class_name in class_names.items():
        if class_name not in optimal_topics_dict:
            continue
            
        num_topics = optimal_topics_dict[class_name]
        texts = df[df['class'] == class_id]['text'].astype(str).tolist()
        
        # Preprocess
        def preprocess_text(text):
            tokens = simple_preprocess(text, deacc=True)
            return [token for token in tokens if token not in stop_words and len(token) > 3]
        
        processed_texts = [preprocess_text(text) for text in texts]
        processed_texts = [text for text in processed_texts if len(text) > 0]
        
        # Create dictionary and corpus
        dictionary = corpora.Dictionary(processed_texts)
        dictionary.filter_extremes(no_below=2, no_above=0.8)
        corpus = [dictionary.doc2bow(text) for text in processed_texts]
        
        # Train final LDA model
        final_lda = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=42,
            passes=20,  # More passes for final model
            alpha='auto'
        )
        
        # Save final results
        class_output_dir = os.path.join(output_dir, 'final_models')
        os.makedirs(class_output_dir, exist_ok=True)
        
        with open(os.path.join(class_output_dir, f'{class_name}_final_topics.txt'), 'w') as f:
            f.write(f"Final LDA Model for {class_name} ({num_topics} topics):\n\n")
            for idx, topic in enumerate(final_lda.print_topics(num_words=10)):
                f.write(f"Topic {idx+1}: {topic[1]}\n")
        
        print(f"Final model saved for {class_name} with {num_topics} topics")

# Usage
if __name__ == "__main__":
    # Step 1: Find optimal number of topics
    optimize_lda_topics(
        df=full_df,
        output_dir=os.path.join(output_dir, 'topic_optimization'),
        min_topics=2,
        max_topics=10
    )
    
    # # Step 2: Manually specify optimal topics based on results
    # # (Check the generated plots and files to determine these)
    # optimal_topics = {
    #     'Non_Argumentative': 5,  # Adjust based on your results
    #     'Premise': 4,           # Adjust based on your results  
    #     'Conclusion': 3         # Adjust based on your results
    # }
    
    # # Step 3: Run final topic modeling with optimal parameters
    # run_optimized_topic_modeling(
    #     df=full_df,
    #     output_dir=os.path.join(output_dir, 'topic_optimization'),
    #     optimal_topics_dict=optimal_topics
    # )


The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.

Optimizing topics for Non-Argumentative...
  Testing 2 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 2, Perplexity: -7.019, Coherence: 0.578
  Testing 3 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 3, Perplexity: -7.015, Coherence: 0.539
  Testing 4 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 4, Perplexity: -7.046, Coherence: 0.561
  Testing 5 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 5, Perplexity: -7.117, Coherence: 0.481
  Testing 6 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 6, Perplexity: -7.159, Coherence: 0.489
  Testing 7 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 7, Perplexity: -7.178, Coherence: 0.451
  Testing 8 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 8, Perplexity: -7.201, Coherence: 0.401
  Testing 9 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 9, Perplexity: -7.275, Coherence: 0.428
  Testing 10 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 10, Perplexity: -7.370, Coherence: 0.409
  Optimal topics for Non-Argumentative: 2 (coherence: 0.578)

Optimizing topics for Premise...
  Testing 2 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 2, Perplexity: -6.749, Coherence: 0.458
  Testing 3 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 3, Perplexity: -6.752, Coherence: 0.542
  Testing 4 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 4, Perplexity: -6.802, Coherence: 0.499
  Testing 5 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 5, Perplexity: -6.820, Coherence: 0.450
  Testing 6 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 6, Perplexity: -6.836, Coherence: 0.420
  Testing 7 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 7, Perplexity: -6.835, Coherence: 0.442
  Testing 8 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 8, Perplexity: -6.850, Coherence: 0.450
  Testing 9 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 9, Perplexity: -6.904, Coherence: 0.414
  Testing 10 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 10, Perplexity: -6.989, Coherence: 0.421
  Optimal topics for Premise: 3 (coherence: 0.542)

Optimizing topics for Conclusion...
  Testing 2 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 2, Perplexity: -4.088, Coherence: 0.517
  Testing 3 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 3, Perplexity: -4.123, Coherence: 0.409
  Testing 4 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 4, Perplexity: -4.144, Coherence: 0.398
  Testing 5 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 5, Perplexity: -4.181, Coherence: 0.471
  Testing 6 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 6, Perplexity: -4.169, Coherence: 0.448
  Testing 7 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 7, Perplexity: -4.170, Coherence: 0.412
  Testing 8 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 8, Perplexity: -4.211, Coherence: 0.417
  Testing 9 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 9, Perplexity: -4.318, Coherence: 0.387
  Testing 10 topics...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Topics: 10, Perplexity: -4.280, Coherence: 0.413
  Optimal topics for Conclusion: 2 (coherence: 0.517)
