# Medical Transcription Analysis Report

**Objective:** Analyze the `mtsamples.csv` dataset to determine data quality, class balance, and text characteristics for NLP modeling.

---

In [None]:
# Install dependencies (if running from notebooks/ directory)
# !pip install -r ../requirements.txt
# The %pip magic installs into the environment of the running kernel.
# NOTE(review): the wordcloud version is not pinned here — consider pinning it
# (or relying on requirements.txt) so a fresh run is reproducible.
%pip install wordcloud

## 1. Environment Configuration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import warnings

# Silence every warning category for the remainder of the session.
# Kept first so that the setup calls below cannot emit warnings either.
warnings.filterwarnings(action='ignore')

# Fetch the NLTK resources used by this notebook, without progress output
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# Display defaults: truncate long text columns and use a whitegrid plot style
pd.set_option('display.max_colwidth', 50)
plt.style.use('seaborn-v0_8-whitegrid')

## 2. Data Loading and Cleaning Executive Summary

In [None]:
DATA_PATH = '../data/raw/mtsamples.csv'

# Load the raw CSV. Only a missing file gets extra context; every other failure
# (parse errors, missing columns, ...) propagates unchanged so that later cells
# never run against an undefined or stale `df_clean`.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as e:
    raise FileNotFoundError(
        f"Error loading dataset: {e} "
        "(DATA_PATH is resolved relative to the current working directory)"
    ) from e

# Remove the unnecessary column if it exists (no-op otherwise)
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

initial_rows = len(df)

# 1. Drop Missing Values
# This runs BEFORE deduplication: drop_duplicates() treats NaN values as equal
# to each other, so deduplicating first would count missing transcriptions as
# "duplicates" and could discard a valid row whose first copy lacks a specialty.
df_non_null = df.dropna(subset=['transcription', 'medical_specialty'])
nulls_dropped = initial_rows - len(df_non_null)

# 2. Deduplication (Critical for preventing data leakage)
df_deduplicated = df_non_null.drop_duplicates(subset=['transcription'])
duplicates_dropped = len(df_non_null) - len(df_deduplicated)

df_clean = df_deduplicated.reset_index(drop=True)
final_rows = len(df_clean)
total_dropped = initial_rows - final_rows

# Sanity checks on the cleaned frame
assert nulls_dropped + duplicates_dropped == total_dropped
assert df_clean['transcription'].is_unique
assert not df_clean[['transcription', 'medical_specialty']].isna().any().any()

print("Data Cleaning Summary:")
print("="*30)
print(f"Original Rows:      {initial_rows:,}")
print(f"Duplicates Dropped: {duplicates_dropped:,}")
print(f"Nulls Dropped:      {nulls_dropped:,}")
print(f"Final Clean Rows:   {final_rows:,}")
print("="*30)

# Fail early with a clear message instead of a KeyError on the sample lookup
if df_clean.empty:
    raise ValueError(f"No rows left after cleaning {DATA_PATH}")

# Real Data Sample
print("\nReal Data Sample (Complexity Check):")
print("-"*30)
sample_idx = 0  # first row of the re-indexed clean frame
print(f"Specialty: {df_clean.loc[sample_idx, 'medical_specialty']}\n")
print(f"Transcription:\n{df_clean.loc[sample_idx, 'transcription']}")
print("-"*30)

## 3. Class Imbalance Analysis (Target Variable)

In [None]:
# Count transcriptions per specialty and keep the ten most frequent
top_classes = df_clean['medical_specialty'].value_counts().head(10)

# Horizontal bar chart: one bar per specialty, coloured via the hue mapping
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=top_classes.values,
    y=top_classes.index,
    hue=top_classes.index,
    palette='viridis',
    ax=ax,
)

# Titles and axis labels
ax.set_title('Top 10 Medical Specialties Distribution')
ax.set_xlabel('Number of Transcriptions')
ax.set_ylabel('Medical Specialty')

# Replace the legend generated by `hue` with an empty, frameless one
ax.legend([], [], frameon=False)
plt.show()

## 4. Clinical Context and Dominant Vocabulary

In [None]:
def _word_count(text):
    """Return the number of whitespace-separated tokens in `text`."""
    return len(str(text).split())


# Word count per transcription, stored as a new column on the clean frame
df_clean['transcription_length'] = df_clean['transcription'].apply(_word_count)

# Summary statistics: mean and 95th percentile of the word counts
length_series = df_clean['transcription_length']
mean_len = length_series.mean()
p95_len = length_series.quantile(0.95)

print("Transcription Length Statistics:")
print(f"Mean Length: {mean_len:.2f} words")
print(f"95th Percentile (P95): {p95_len:.2f} words")

# Figure 1: histogram of lengths with the mean and P95 marked as dashed lines
length_fig, length_ax = plt.subplots(figsize=(12, 5))
sns.histplot(length_series, bins=50, kde=True, color='teal', ax=length_ax)
length_ax.axvline(mean_len, color='red', linestyle='--', label=f'Mean: {mean_len:.0f}')
length_ax.axvline(p95_len, color='orange', linestyle='--', label=f'P95: {p95_len:.0f}')
length_ax.set_title('Distribution of Transcription Lengths')
length_ax.set_xlabel('Number of Words')
length_ax.set_ylabel('Frequency')
length_ax.legend()
plt.show()

# Figure 2: word cloud built from every transcription of the most frequent specialty
majority_class = top_classes.index[0]
is_majority = df_clean['medical_specialty'] == majority_class
text_corpus = " ".join(df_clean.loc[is_majority, 'transcription'])

# English stop words from NLTK are excluded from the cloud
stop_words = set(stopwords.words('english'))

cloud_options = {
    'width': 800,
    'height': 400,
    'background_color': 'white',
    'stopwords': stop_words,
    'colormap': 'Blues',
}
wordcloud = WordCloud(**cloud_options).generate(text_corpus)

cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title(f'Frequent Vocabulary: {majority_class} (Context Check)')
plt.show()