# In [None]:
#!/usr/bin/env python3
"""
Simple Topic Analyzer - Just Works!
"""

import json
import os
import time
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
import ipywidgets as widgets
from IPython.display import display, clear_output
from anthropic import Anthropic

class SimpleTopicAnalyzer:
    def __init__(self):
        """Build the notebook UI and initialize the analysis state.

        Creates the API-key field, the JSON file dropdown, the refresh and
        analyze buttons and an output area, wires the button callbacks, and
        displays the assembled layout immediately.
        """
        self.api_key = ""
        self.client = None  # Set to an Anthropic client when analyze() runs
        self.results = {}
        self.full_analysis = []  # Store all individual analyses
        self.all_keywords = []   # Store all keywords for topic identification

        # Interface
        # Password instead of Text so the secret key is masked on screen;
        # the entered string is still read through ``.value``.
        self.api_input = widgets.Password(
            placeholder='Your Anthropic API key',
            description='API Key:',
            layout=widgets.Layout(width='400px')
        )

        self.file_selector = widgets.Dropdown(
            options=self.get_json_files(),
            description='JSON File:',
            layout=widgets.Layout(width='400px')
        )

        self.refresh_btn = widgets.Button(description='Refresh Files', button_style='info')
        self.analyze_btn = widgets.Button(description='Analyze', button_style='success')

        # Progress messages, errors and final results are printed here.
        self.output = widgets.Output()

        # Events
        self.refresh_btn.on_click(self.refresh_files)
        self.analyze_btn.on_click(self.analyze)

        # Layout
        interface = widgets.VBox([
            widgets.HTML("<h2>Simple Topic Analyzer</h2>"),
            self.api_input,
            widgets.HBox([self.file_selector, self.refresh_btn]),
            self.analyze_btn,
            self.output
        ])

        display(interface)
    
    def get_json_files(self):
        """Get JSON files from the directory."""
        directory = r"YOUR_DIRECTORY_HERE"
        try:
            files = [f for f in os.listdir(directory) if f.endswith('.json')]
            return [(f, os.path.join(directory, f)) for f in files]
        except:
            return [("No files found", "")]
    
    def refresh_files(self, btn):
        """Refresh file list."""
        self.file_selector.options = self.get_json_files()
    
    def analyze(self, btn):
        """Run the keyword/topic analysis for the selected JSON file.

        Reads the API key and file path from the widgets, extracts keywords
        for every usable ``response`` entry in the file, asks for the top
        topics and hands everything to ``display_and_save_results``.
        Progress and errors are printed into the output widget; nothing is
        returned.

        Args:
            btn: The button widget that fired the click; unused.
        """
        self.api_key = self.api_input.value.strip()
        file_path = self.file_selector.value

        if not self.api_key:
            self._show_status("❌ Enter API key")
            return

        if not file_path or not os.path.exists(file_path):
            self._show_status("❌ Select valid file")
            return

        self._show_status("Starting analysis...")

        try:
            self.client = Anthropic(api_key=self.api_key)

            self._show_status("Loading file...")
            responses = self._load_responses(file_path)

            if not responses:
                # Nothing to analyze: skip the topic request and empty reports.
                self._show_status("❌ No responses found in file")
                return

            self._show_status(f"Found {len(responses)} responses. Extracting keywords...")

            # Clear previous analysis
            self.full_analysis = []
            self.all_keywords = []

            for i, response in enumerate(responses):
                if i % 10 == 0:
                    self._show_status(f"Extracting keywords {i+1}/{len(responses)}")

                # extract_keywords pads short or failed results with "";
                # drop the padding so it is never counted as a keyword.
                keywords = [k for k in self.extract_keywords(response) if k]

                self.full_analysis.append({
                    'response_id': i + 1,
                    'keywords': ', '.join(keywords),
                    'response_preview': (response[:100] + '...') if len(response) > 100 else response
                })
                self.all_keywords.extend(keywords)

                time.sleep(0.05)  # Short pause between consecutive API calls

            self._show_status("Identifying top topics from all keywords...")
            top_topics = self.identify_top_topics()

            keyword_counts = Counter(self.all_keywords)

            # splitext removes only the extension; str.replace would also
            # strip a '.json' occurring in the middle of the name.
            file_name = os.path.splitext(os.path.basename(file_path))[0]
            self.display_and_save_results(top_topics, keyword_counts, len(responses), file_name)

        except Exception as e:
            # Top-level UI boundary: report the failure instead of raising
            # out of the button callback.
            self._show_status(f"❌ Error: {e}")

    def _show_status(self, message):
        """Replace the output widget's content with a single status line."""
        with self.output:
            clear_output()
            print(message)

    @staticmethod
    def _load_responses(file_path):
        """Load the JSON file and return its non-blank string responses.

        Raises:
            ValueError: If the top-level JSON value is not an array.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if not isinstance(data, list):
            raise ValueError("Expected a JSON array of objects with a 'response' field")
        # Skip entries that are not objects or whose response is not usable
        # text; they would otherwise fail later during slicing.
        return [
            item['response'] for item in data
            if isinstance(item, dict)
            and isinstance(item.get('response'), str)
            and item['response'].strip()
        ]
    def extract_keywords(self, response):
        """Extract 5 most salient keywords from a response."""
        prompt = f"""Analyze this text and identify the 5 most salient keywords that define the content:

"{response[:500]}..."

Give me exactly 5 keywords that best capture the essence and main concepts of this text.

Format: keyword1, keyword2, keyword3, keyword4, keyword5"""

        try:
            result = self.client.messages.create(
                model="claude-3-5-haiku-20241022",
                max_tokens=50,
                temperature=0,
                messages=[{"role": "user", "content": prompt}]
            )
            
            keywords_text = result.content[0].text.strip()
            
            # Parse keywords
            keywords = [k.strip() for k in keywords_text.split(',')]
            # Ensure we have exactly 5, pad with empty if needed
            while len(keywords) < 5:
                keywords.append("")
            
            return keywords[:5]
            
        except:
            return ["", "", "", "", ""]
    
    def identify_top_topics(self):
        """Use Haiku to identify 10 most relevant topics from all keywords."""
        # Get most frequent keywords (top 50) to feed to Haiku
        keyword_counts = Counter(self.all_keywords)
        top_keywords = [word for word, count in keyword_counts.most_common(50) if word.strip()]
        
        keywords_text = ', '.join(top_keywords)
        
        prompt = f"""Based on these keywords extracted from text responses, identify the 10 most relevant and distinct topics they represent, ordered from MOST FREQUENT to LEAST FREQUENT:

Keywords: {keywords_text}

Analyze these keywords and group them into 10 distinct, meaningful topics. Consider how often keywords related to each topic appear when determining the order.

IMPORTANT: Order your topics from most frequent/common to least frequent/common based on the keyword patterns you see.

Format your response as:
1. Most Frequent Topic Name
2. Second Most Frequent Topic Name
3. Third Most Frequent Topic Name
(etc. up to 10, ordered by frequency)

Make each topic name 2-4 words and distinct from the others."""

        try:
            result = self.client.messages.create(
                model="claude-3-5-haiku-20241022",
                max_tokens=300,
                temperature=0.1,
                messages=[{"role": "user", "content": prompt}]
            )
            
            response_text = result.content[0].text.strip()
            
            # Parse topics
            topics = []
            for line in response_text.split('\n'):
                line = line.strip()
                if line and (line[0].isdigit() or line.startswith('-')):
                    topic = line.split('.', 1)[-1].strip()
                    if topic:
                        topics.append(topic)
            
            return topics[:10]
            
        except Exception as e:
            print(f"Error identifying topics: {e}")
            # Fallback: use most common keywords as topics
            return [word for word, count in Counter(self.all_keywords).most_common(10)]
    
    def display_and_save_results(self, top_topics, keyword_counts, total_responses, file_name, output_dir=None):
        """Show the results in the output widget and save all report files.

        Writes a two-page PDF (charts and summary table), a CSV of the
        per-response analyses, a CSV of all keywords and a text summary,
        then prints the rankings and the saved file paths.

        Args:
            top_topics: Topic names, most relevant first.
            keyword_counts: ``Counter`` mapping keyword to frequency.
            total_responses: Number of responses that were analyzed.
            file_name: Base name (no extension) used in titles and output
                file names.
            output_dir: Folder for the report files; created if missing.
                When omitted, the built-in output directory placeholder is
                used.
        """
        with self.output:
            clear_output()

            # Blank entries are padding from keyword extraction, not real
            # keywords: keep them out of the ranking, the totals and the CSV.
            ranked = [(k, c) for k, c in keyword_counts.most_common() if k.strip()]
            top_keywords = ranked[:10]
            extracted_keywords = [k for k in self.all_keywords if k.strip()]
            total_keywords = len(extracted_keywords)

            # Create output directory
            if output_dir is None:
                output_dir = r"YOUR_OUTPUT_DIRECTORY_PATH_HERE"
            os.makedirs(output_dir, exist_ok=True)

            # Create PDF with plots and results
            pdf_path = os.path.join(output_dir, f"{file_name}_analysis.pdf")
            self._write_pdf_report(
                pdf_path, top_topics, top_keywords,
                total_responses, total_keywords, file_name
            )

            # Save full analysis to CSV
            csv_path = os.path.join(output_dir, f"{file_name}_full_analysis.csv")
            df = pd.DataFrame(self.full_analysis)
            df.to_csv(csv_path, index=False, encoding='utf-8')

            # Save all keywords to separate CSV
            keywords_csv_path = os.path.join(output_dir, f"{file_name}_all_keywords.csv")
            keywords_df = pd.DataFrame(extracted_keywords, columns=['keyword'])
            keywords_df.to_csv(keywords_csv_path, index=False, encoding='utf-8')

            # Save summary to text file
            txt_path = os.path.join(output_dir, f"{file_name}_summary.txt")
            self._write_text_summary(
                txt_path, top_topics, top_keywords,
                total_responses, total_keywords, file_name
            )

            # Display results in interface
            print("TOP 10 TOPICS (identified by Haiku from keywords):")
            print("=" * 60)
            for i, topic in enumerate(top_topics, 1):
                print(f"{i}. {topic}")

            print("\nTOP 10 KEYWORDS (by frequency):")
            print("=" * 50)
            for i, (keyword, count) in enumerate(top_keywords, 1):
                print(f"{i}. {keyword}: {count}")

            print(f"\nTotal responses analyzed: {total_responses}")
            print(f"Total keywords extracted: {total_keywords}")
            print(f"\n📁 FILES SAVED:")
            print(f"📊 PDF Report: {pdf_path}")
            print(f"📋 Full Analysis CSV: {csv_path}")
            print(f"🔑 All Keywords CSV: {keywords_csv_path}")
            print(f"📄 Summary TXT: {txt_path}")
            print("✅ Done!")

    def _write_pdf_report(self, pdf_path, top_topics, top_keywords, total_responses, total_keywords, file_name):
        """Render the two-page PDF: ranking charts, then a summary table."""
        with PdfPages(pdf_path) as pdf:
            # Page 1: Plots
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))

            # Topics plot
            if top_topics:
                y_pos = range(len(top_topics))
                # Topics carry no frequency count, so the bar length is the
                # reversed rank (first topic gets the longest bar).
                topic_scores = list(range(len(top_topics), 0, -1))
                ax1.barh(y_pos, topic_scores, color='#0d9488')
                ax1.set_yticks(y_pos)
                ax1.set_yticklabels([t[:30] + '...' if len(t) > 30 else t for t in top_topics])
                ax1.set_xlabel('Relevance Rank')
                ax1.set_title(f'Top 10 Topics - {file_name}')
                ax1.invert_yaxis()

            # Keywords plot
            if top_keywords:
                keywords, k_counts = zip(*top_keywords)
                y_pos = range(len(keywords))
                ax2.barh(y_pos, k_counts, color='#14b8a6')
                ax2.set_yticks(y_pos)
                ax2.set_yticklabels(keywords)
                ax2.set_xlabel('Frequency')
                ax2.set_title(f'Top 10 Keywords - {file_name}')
                ax2.invert_yaxis()

            plt.tight_layout()
            pdf.savefig(fig, bbox_inches='tight')
            plt.show()
            # Release the chart figure; otherwise every run leaves one more
            # open figure behind.
            plt.close(fig)

            # Page 2: Summary Table
            fig, ax = plt.subplots(figsize=(12, 8))
            ax.axis('tight')
            ax.axis('off')

            section_headers = (
                'ANALYSIS SUMMARY',
                'TOP 10 TOPICS (by relevance)',
                'TOP 10 KEYWORDS (by frequency)',
            )

            summary_data = [
                ['ANALYSIS SUMMARY', ''],
                ['File', file_name],
                ['Total Responses', str(total_responses)],
                ['Total Keywords Extracted', str(total_keywords)],
                ['', ''],
                ['TOP 10 TOPICS (by relevance)', 'RANK'],
            ]
            for i, topic in enumerate(top_topics, 1):
                summary_data.append([f"{i}. {topic}", str(i)])

            summary_data.append(['', ''])
            summary_data.append(['TOP 10 KEYWORDS (by frequency)', 'COUNT'])
            for i, (keyword, count) in enumerate(top_keywords, 1):
                summary_data.append([f"{i}. {keyword}", str(count)])

            # Create table
            table = ax.table(cellText=summary_data, cellLoc='left', loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(10)
            table.scale(1.2, 1.5)

            # Style the section header rows
            for row, (label, _) in enumerate(summary_data):
                if label in section_headers:
                    for col in (0, 1):
                        cell = table[(row, col)]
                        cell.set_facecolor('#0d9488')
                        cell.set_text_props(weight='bold', color='white')

            ax.set_title(f'Analysis Summary - {file_name}', fontsize=16, fontweight='bold', pad=20)
            pdf.savefig(fig, bbox_inches='tight')
            plt.close(fig)

    @staticmethod
    def _write_text_summary(txt_path, top_topics, top_keywords, total_responses, total_keywords, file_name):
        """Write the plain-text summary of totals, topics and keywords."""
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(f"TOPIC ANALYSIS RESULTS - {file_name}\n")
            f.write("=" * 60 + "\n\n")
            f.write(f"Total responses analyzed: {total_responses}\n")
            f.write(f"Total keywords extracted: {total_keywords}\n\n")

            f.write("TOP 10 TOPICS (by relevance):\n")
            f.write("-" * 30 + "\n")
            for i, topic in enumerate(top_topics, 1):
                f.write(f"{i}. {topic}\n")

            f.write("\nTOP 10 KEYWORDS (by frequency):\n")
            f.write("-" * 30 + "\n")
            for i, (keyword, count) in enumerate(top_keywords, 1):
                f.write(f"{i}. {keyword}: {count}\n")

# Create the analyzer; its constructor builds and displays the widget interface.
analyzer = SimpleTopicAnalyzer()