In [None]:
import json
import os
import re
import sys
import zipfile
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

# Make the project's `src` directory importable. The location is derived from
# the current working directory (its parent, then `src`), so this only works
# when the kernel is started from a folder that is a sibling of `src`.
sys.path.insert(0, str(Path('.').resolve().parent / 'src'))

# Project-local modules; these imports rely on the sys.path entry added above,
# so they must stay after it.
from config import RAW_DATA_DIR, PROCESSED_DATA_DIR, TRAIN_FOLDER, TEST_FOLDER
from utils import clean_text, extract_text_features, load_json_annotations, parse_label_studio_export

# Plot styling applied to the matplotlib/seaborn figures created after this point.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

## 1. Loading the Data

In [None]:
# Read the processed train/test CSVs expected under PROCESSED_DATA_DIR.
# A missing file is only reported: the matching DataFrame is then never
# created, and later cells check for its existence before using it.
train_path = PROCESSED_DATA_DIR / 'train.csv'
test_path = PROCESSED_DATA_DIR / 'test.csv'

# Training split
if not train_path.exists():
    print('Training data not found. Please run 01_data_preprocessing.py first.')
else:
    train_df = pd.read_csv(train_path)
    print(f'Training samples: {len(train_df)}')
    display(train_df.head())

# Test split
if not test_path.exists():
    print('Test data not found.')
else:
    test_df = pd.read_csv(test_path)
    print(f'\nTest samples: {len(test_df)}')
    display(test_df.head())

## 2. Label Distribution Analysis

In [None]:
def plot_label_counts(ax, df, title, color):
    """Draw a bar chart of how many rows of `df` carry each `label` value.

    Args:
        ax: matplotlib Axes to draw on.
        df: DataFrame with a `label` column.
        title: Title of the panel.
        color: Bar colour passed to `Axes.bar`.
    """
    counts = df['label'].value_counts().sort_index()
    bars = ax.bar(counts.index, counts.values, color=color, alpha=0.8)
    ax.set_xlabel('Rating')
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.set_xticks([1, 2, 3, 4, 5])
    # bar_label pads in points, so the numbers sit just above the bars whatever
    # the magnitude of the counts (a fixed "+1" in data units does not scale).
    # Explicit labels keep the exact integer text instead of the '%g' default.
    ax.bar_label(bars, labels=[str(n) for n in counts.values], padding=3)


# Create label distribution plot: training split on the left, test split on
# the right. A panel stays empty when its split has not been loaded.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

if 'train_df' in dir():
    plot_label_counts(axes[0], train_df, 'Training Data - Label Distribution', 'steelblue')

if 'test_df' in dir():
    plot_label_counts(axes[1], test_df, 'Test Data (Consensus) - Label Distribution', 'coral')

plt.tight_layout()
plt.show()

In [None]:
# Per-rating counts and percentages for each split. The heading and rule are
# always printed; the rows only when that split has been loaded.
for heading, frame_name in (
    ('Training Data Statistics:', 'train_df'),
    ('\nTest Data Statistics:', 'test_df'),
):
    print(heading)
    print('=' * 40)
    if frame_name not in globals():
        continue
    frame = globals()[frame_name]
    total = len(frame)
    for label in sorted(frame['label'].unique()):
        count = (frame['label'] == label).sum()
        pct = count / total * 100
        print(f'Rating {label}: {count:4d} ({pct:5.1f}%)')

## 3. Text Length Analysis

In [None]:
# Add text features
def add_text_features(df):
    """Return a copy of `df` with simple length features computed from `text`.

    Adds three columns:
        word_count:      number of whitespace-separated tokens.
        char_count:      number of characters.
        avg_word_length: mean token length, 0.0 when there are no tokens.

    Missing texts (None / NaN / pd.NA) are treated as empty strings, so they
    yield zeros instead of being measured as the literal word "nan" or "None".

    Args:
        df: DataFrame with a `text` column. It is not modified.

    Returns:
        A new DataFrame with the three feature columns added.
    """
    df = df.copy()
    # Normalise once: missing -> '', everything else -> str.
    texts = ['' if pd.isna(value) else str(value) for value in df['text']]
    # Tokenise each text a single time and reuse the tokens for every feature.
    tokens = [text.split() for text in texts]

    df['word_count'] = [len(words) for words in tokens]
    df['char_count'] = [len(text) for text in texts]
    df['avg_word_length'] = [
        float(np.mean([len(word) for word in words])) if words else 0.0
        for words in tokens
    ]
    return df

# Attach the length features to each split that has been loaded.
# Rebinding the same name keeps the cell safe to re-run.
if 'train_df' in globals():
    train_df = add_text_features(train_df)
if 'test_df' in globals():
    test_df = add_text_features(test_df)

In [None]:
# Box plot of per-text word counts, one box per rating (training split only)
if 'train_df' in globals():
    axis_labels = {'label': 'Rating', 'word_count': 'Word Count'}
    fig = px.box(
        data_frame=train_df,
        x='label',
        y='word_count',
        labels=axis_labels,
        title='Word Count Distribution by Rating (Training Data)',
    )
    # Replace the x-axis caption with one that spells out the scale direction
    fig.update_layout(xaxis_title='Rating (1=Hard, 5=Easy)')
    fig.show()

In [None]:
# How text length relates to the rating in the training split
if 'train_df' in globals():
    word_counts = train_df['word_count']
    correlation = word_counts.corr(train_df['label'])
    print(f'Correlation between word count and rating: {correlation:.3f}')

    # Mean word count within each rating group
    avg_by_rating = train_df.groupby('label')['word_count'].mean()
    print('\nAverage word count by rating:')
    for rating_value, mean_words in avg_by_rating.items():
        print(f'  Rating {rating_value}: {mean_words:.1f} words')

## 4. Sample Texts by Rating

In [None]:
# Meaning of each rating value; printed verbatim next to the samples.
rating_descriptions = {
    1: 'Nagyon nehezen vagy nem értelmezhető',
    2: 'Nehezen értelmezhető',
    3: 'Valamennyire érthető',
    4: 'Végigolvasva megértem',
    5: 'Könnyen, egyből érthető'
}


def preview_text(text, limit=300):
    """Return `text` as a string, cut to `limit` characters plus '...' if longer.

    Args:
        text: Value to preview; coerced with `str()` so that non-string cells
            (e.g. numbers parsed by `read_csv`) do not raise on slicing.
        limit: Maximum number of characters kept before the ellipsis.

    Returns:
        The full string when it has at most `limit` characters, otherwise its
        first `limit` characters followed by '...'.
    """
    text = str(text)
    return f'{text[:limit]}...' if len(text) > limit else text


# Show sample texts for each rating
if 'train_df' in dir():
    for rating in [1, 2, 3, 4, 5]:
        # Drop missing texts before taking the first two, so a NaN cell neither
        # crashes the preview nor uses up one of the sample slots.
        samples = train_df.loc[train_df['label'] == rating, 'text'].dropna().head(2)
        print(f'\n{"="*60}')
        print(f'Rating {rating}: {rating_descriptions[rating]}')
        print('=' * 60)
        for i, text in enumerate(samples, 1):
            print(f'\nSample {i}:')
            print(preview_text(text))

## 5. Annotator Agreement Analysis (Test Data)

In [None]:
# Annotator agreement on the test split; skipped entirely when the split is
# not loaded or has no `agreement` column.
if 'test_df' in globals() and 'agreement' in test_df.columns:
    agreement = test_df['agreement']
    mean_agreement = agreement.mean()

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    hist_ax, bar_ax = axes

    # Left panel: distribution of the scores, with the mean marked
    hist_ax.hist(agreement, bins=20, color='teal', alpha=0.7, edgecolor='black')
    hist_ax.set_xlabel('Agreement Score')
    hist_ax.set_ylabel('Count')
    hist_ax.set_title('Annotator Agreement Distribution')
    hist_ax.axvline(
        mean_agreement,
        color='red',
        linestyle='--',
        label=f'Mean: {mean_agreement:.2f}',
    )
    hist_ax.legend()

    # Right panel: mean agreement within each rating
    agreement_by_rating = test_df.groupby('label')['agreement'].mean()
    bar_ax.bar(
        agreement_by_rating.index,
        agreement_by_rating.values,
        color='teal',
        alpha=0.7,
    )
    bar_ax.set_xlabel('Rating')
    bar_ax.set_ylabel('Average Agreement')
    bar_ax.set_title('Average Agreement by Rating')
    bar_ax.set_xticks([1, 2, 3, 4, 5])

    plt.tight_layout()
    plt.show()

    # Numeric summary of the same column
    print('\nAgreement Statistics:')
    agreement_stats = (
        ('Mean', mean_agreement),
        ('Std', agreement.std()),
        ('Min', agreement.min()),
        ('Max', agreement.max()),
    )
    for stat_name, stat_value in agreement_stats:
        print(f'{stat_name}: {stat_value:.3f}')

## 6. Word Frequency Analysis

In [None]:
# `re`, `Counter` and `pd` are imported in the notebook's top import cell.
def get_word_freq(texts, top_n=20):
    """Count lower-cased word tokens across `texts` and return the most common.

    Tokens are the matches of the regex `\\b\\w+\\b` on each lower-cased text.
    Missing entries (None / NaN / pd.NA) are skipped, so they do not show up
    as the pseudo-words "none" or "nan".

    Args:
        texts: Iterable of texts; non-missing items are coerced with `str()`.
        top_n: Maximum number of (word, count) pairs to return.

    Returns:
        List of (word, count) tuples ordered by descending count, at most
        `top_n` long; empty when no tokens are found.
    """
    counts = Counter()
    for text in texts:
        if pd.isna(text):
            continue
        # Update incrementally instead of building one list of every token.
        counts.update(re.findall(r'\b\w+\b', str(text).lower()))
    return counts.most_common(top_n)

# Most common words overall and by rating
if 'train_df' in dir():
    fig, axes = plt.subplots(2, 3, figsize=(16, 10))
    axes = axes.flatten()

    # Panel 0 covers the whole training split; panels 1-5 cover one rating each.
    panels = [('Overall Most Common Words', train_df['text'], 'gray')]
    panels += [
        (
            f'Rating {rating}',
            train_df.loc[train_df['label'] == rating, 'text'],
            plt.cm.RdYlGn(rating / 5),
        )
        for rating in [1, 2, 3, 4, 5]
    ]

    for ax, (title, texts, color) in zip(axes, panels):
        word_freq = get_word_freq(texts, 15)
        # Every panel gets the same guard: unpacking zip(*[]) raises
        # ValueError, which previously crashed the "Overall" panel when the
        # split had no tokens. A panel without words is left blank.
        if not word_freq:
            continue
        words, counts = zip(*word_freq)
        ax.barh(words, counts, color=color)
        ax.set_title(title)
        # Most frequent word at the top
        ax.invert_yaxis()

    plt.tight_layout()
    plt.show()

## 7. Summary Statistics

In [None]:
# Create summary table: one row per split that is currently loaded.
# Needs the `word_count` / `char_count` columns added in the text-length section.
available_splits = [
    (dataset_name, globals()[var_name])
    for dataset_name, var_name in (
        ('Training', 'train_df'),
        ('Test (Consensus)', 'test_df'),
    )
    if var_name in globals()
]

summary_data = [
    {
        'Dataset': dataset_name,
        'Samples': len(frame),
        'Avg Word Count': frame['word_count'].mean(),
        'Avg Char Count': frame['char_count'].mean(),
        'Label Mean': frame['label'].mean(),
        'Label Std': frame['label'].std(),
    }
    for dataset_name, frame in available_splits
]

summary_df = pd.DataFrame(summary_data)
display(summary_df.round(2))

In [None]:
# Save summary (nothing is written when no split was available to summarise)
if summary_data:
    summary_path = PROCESSED_DATA_DIR / 'data_summary.csv'
    summary_df.to_csv(summary_path, index=False)
    print(f'Summary saved to {summary_path}')