# LUMIN.AI: Data Exploration

This notebook contains exploratory data analysis for the LUMIN.AI project's Deep Learning track. It focuses on exploring the Austria Democracy Radar dataset and preparing it for sentiment analysis.

## Objectives
- Set up the development environment with necessary libraries
- Explore and understand the Austria Democracy Radar dataset structure
- Perform initial data cleaning and preprocessing
- Generate basic statistics and visualizations
- Prepare the data for sentiment analysis modeling

In [None]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from collections import Counter
import warnings

# For text preprocessing
import spacy

# For visualization
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud

# Set plotting style
sns.set_theme(style="whitegrid", palette="viridis")
plt.rcParams["figure.figsize"] = (12, 8)
# NOTE: this silences every warning (deprecations included) for the whole session.
warnings.filterwarnings("ignore")

# Download necessary NLTK resources.
# Each resource is checked on its own so that only the missing ones are fetched.
# "punkt_tab" is listed in addition to "punkt" because newer NLTK releases load
# the tabular Punkt data in word_tokenize instead of the pickled "punkt" models.
NLTK_RESOURCES = {
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "stopwords": "corpora/stopwords",
    "wordnet": "corpora/wordnet",
}
for package_name, resource_path in NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

# Load spaCy model
import importlib
import subprocess
import sys

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading spaCy model...")
    # Install with the interpreter that runs this notebook: a bare "python" on
    # PATH may belong to another environment. check=True surfaces a failed
    # download right here instead of as a confusing error in spacy.load below.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    # Make the freshly installed package visible to the running interpreter.
    importlib.invalidate_caches()
    nlp = spacy.load("en_core_web_sm")

print("Environment setup complete!")

## Data Loading

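In this section, we load the data used for exploration. For now, we use the sample data provided in the repository; if that file is not found, a small random placeholder DataFrame is created so the notebook can still run. Instructions for downloading and loading the full Austria Democracy Radar dataset are printed by the cell below.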

In [None]:
# Define paths
SAMPLE_DATA_PATH = "../../data/examples/sample_data.csv"
FULL_DATA_PATH = "../../data/raw/democracy-radar/"

# Check if sample data exists and load it
if os.path.exists(SAMPLE_DATA_PATH):
    print(f"Loading sample data from {SAMPLE_DATA_PATH}")
    data = pd.read_csv(SAMPLE_DATA_PATH)
    print(f"Sample data loaded successfully with {len(data)} rows")
else:
    print(f"Sample data not found at {SAMPLE_DATA_PATH}")
    print("Creating placeholder data for development")
    # The placeholder only has the numeric columns A-D, so any analysis guarded
    # by a check for a "text", "sentiment" or "category" column is skipped.
    data = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list("ABCD"))

# Instructions for downloading the full dataset
print(
    "\nTo download the full Austria Democracy Radar dataset, run the following command:"
)
print("python ../../data/scripts/download_data.py --dataset democracy-radar")
print("\nAfter downloading, you can load the full dataset with:")
print("data = pd.read_csv('../../data/raw/democracy-radar/wave-1.csv')")

# Display the first few rows of the data
print("\nFirst 5 rows of the data:")
display(data.head())

# Display basic information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render an extra "None".
print("\nBasic information about the dataset:")
data.info()

# Display summary statistics
print("\nSummary statistics:")
display(data.describe(include="all"))

In [None]:
# Missing-value overview: one count per column
print("Missing values in each column:")
display(data.isna().sum())

# Number of rows that are exact duplicates of an earlier row
duplicates = data.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Sentiment label distribution (only when the column is present)
if "sentiment" in data.columns:
    sentiment_counts = data["sentiment"].value_counts()
    print("\nSentiment distribution:")
    display(sentiment_counts)

    # Bar chart: number of rows per sentiment label
    plt.figure(figsize=(10, 6))
    count_axes = sns.countplot(data=data, x="sentiment", palette="viridis")
    count_axes.set_title("Distribution of Sentiment Labels")
    count_axes.set_xlabel("Sentiment")
    count_axes.set_ylabel("Count")
    plt.show()

    # Pie chart: share of each sentiment label, one palette colour per slice
    plt.figure(figsize=(8, 8))
    slice_colors = sns.color_palette("viridis", len(sentiment_counts))
    plt.pie(
        sentiment_counts,
        labels=sentiment_counts.index,
        colors=slice_colors,
        autopct="%1.1f%%",
        startangle=90,
    )
    plt.axis("equal")
    plt.title("Sentiment Distribution")
    plt.show()

In [None]:
# Examine the distribution of categories
if "category" in data.columns:
    print("\nCategory distribution:")
    category_counts = data["category"].value_counts()
    display(category_counts)

    # Visualize category distribution (most frequent category first)
    plt.figure(figsize=(12, 6))
    sns.countplot(
        y="category", data=data, order=category_counts.index, palette="viridis"
    )
    plt.title("Distribution of Categories")
    plt.xlabel("Count")
    plt.ylabel("Category")
    plt.show()

    # Analyze sentiment distribution by category
    if "sentiment" in data.columns:
        sentiment_by_category = pd.crosstab(data["category"], data["sentiment"])
        # DataFrame.plot opens its own figure when no `ax` is given, so the size
        # is passed to it directly. A separate plt.figure() call beforehand
        # would only leave an empty extra figure in the output.
        sentiment_by_category.plot(
            kind="bar", stacked=True, colormap="viridis", figsize=(14, 8)
        )
        plt.title("Sentiment Distribution by Category")
        plt.xlabel("Category")
        plt.ylabel("Count")
        plt.legend(title="Sentiment")
        plt.xticks(rotation=45)
        plt.show()

        # Heatmap of sentiment by category; normalize="index" makes every row
        # (category) sum to 1, so cells are shares within that category.
        plt.figure(figsize=(12, 8))
        sentiment_category_pivot = pd.crosstab(
            data["category"], data["sentiment"], normalize="index"
        )
        sns.heatmap(sentiment_category_pivot, annot=True, cmap="viridis", fmt=".2%")
        plt.title("Sentiment Proportion by Category")
        plt.xlabel("Sentiment")
        plt.ylabel("Category")
        plt.show()

In [None]:
# Basic text statistics
if "text" in data.columns:
    # Normalise once so both metrics see the same input: missing entries become
    # "" and non-string values are stringified. This keeps len() from failing on
    # NaN and keeps a missing text from being counted as the single word "nan".
    text_as_str = data["text"].fillna("").astype(str)

    # Calculate text length (characters)
    data["text_length"] = text_as_str.str.len()

    # Calculate word count (whitespace-separated tokens)
    data["word_count"] = text_as_str.str.split().str.len()

    # Display text length statistics
    print("Text length statistics:")
    display(data[["text_length", "word_count"]].describe())

    # Plot text length distribution
    plt.figure(figsize=(12, 6))
    sns.histplot(data["text_length"], kde=True, bins=30)
    plt.title("Distribution of Text Length")
    plt.xlabel("Text Length (characters)")
    plt.ylabel("Frequency")
    plt.show()

    # Plot word count distribution
    plt.figure(figsize=(12, 6))
    sns.histplot(data["word_count"], kde=True, bins=30)
    plt.title("Distribution of Word Count")
    plt.xlabel("Word Count")
    plt.ylabel("Frequency")
    plt.show()

    # Analyze text length by sentiment
    if "sentiment" in data.columns:
        plt.figure(figsize=(12, 6))
        sns.boxplot(x="sentiment", y="text_length", data=data, palette="viridis")
        plt.title("Text Length by Sentiment")
        plt.xlabel("Sentiment")
        plt.ylabel("Text Length (characters)")
        plt.show()

        plt.figure(figsize=(12, 6))
        sns.boxplot(x="sentiment", y="word_count", data=data, palette="viridis")
        plt.title("Word Count by Sentiment")
        plt.xlabel("Sentiment")
        plt.ylabel("Word Count")
        plt.show()

In [None]:
# Text preprocessing functions
from functools import lru_cache

# Matches everything that is neither a letter nor whitespace: punctuation,
# symbols, digits and "_". \w is Unicode-aware for str patterns, so accented
# letters and umlauts are kept instead of being cut out of the word.
_NON_LETTER_RE = re.compile(r"[^\w\s]|[\d_]")
_WHITESPACE_RE = re.compile(r"\s+")


@lru_cache(maxsize=1)
def _text_resources():
    """Return (English stopword set, lemmatizer), built once and then reused."""
    return frozenset(stopwords.words("english")), WordNetLemmatizer()


def preprocess_text(text):
    """
    Preprocess text for sentiment analysis:
    1. Convert to lowercase
    2. Replace special characters and numbers with a space
    3. Remove extra whitespace
    4. Tokenize
    5. Remove stopwords
    6. Lemmatize

    Args:
        text: The raw text. Any non-string value (e.g. NaN or None) is treated
            as missing.

    Returns:
        The remaining lemmatized tokens joined by single spaces, or "" when the
        input is not a string or contains no letters.
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Replace non-letters with a space rather than deleting them, so words joined
    # by punctuation stay separate ("well-known" -> "well known") and contraction
    # parts ("don't" -> "don t") can still be matched by the stopword list.
    text = _NON_LETTER_RE.sub(" ", text)

    # Remove extra whitespace
    text = _WHITESPACE_RE.sub(" ", text).strip()
    if not text:
        # Nothing left to tokenize.
        return ""

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize; the resources are cached because building
    # them on every call dominates the cost when applied to a whole column.
    stop_words, lemmatizer = _text_resources()
    kept_tokens = [
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    ]

    # Join tokens back into text
    return " ".join(kept_tokens)


# Run the preprocessing over the raw text column (when there is one)
if "text" in data.columns:
    # Keep the cleaned text next to the raw text in a new column
    data["processed_text"] = data["text"].map(preprocess_text)

    # Side-by-side preview of the first rows: raw text vs. cleaned text
    print("Examples of original and preprocessed text:")
    text_examples = (
        data[["text", "processed_text"]]
        .head()
        .rename(
            columns={
                "text": "Original Text",
                "processed_text": "Preprocessed Text",
            }
        )
    )
    display(text_examples)

In [None]:
# Word frequency analysis
def _show_wordcloud(text, title, figsize, max_words):
    """Draw a word cloud for `text` and return it, or return None if it has no words.

    WordCloud.generate raises ValueError when the text yields no words (for
    example an empty subset), so that case is reported and skipped instead of
    aborting the whole cell. The cloud is generated before the figure is opened
    so that a skipped cloud does not leave an empty figure behind.
    """
    try:
        cloud = WordCloud(
            width=800,
            height=800,
            background_color="white",
            colormap="viridis",
            max_words=max_words,
        ).generate(text)
    except ValueError:
        print(f"Skipping '{title}': no words to plot")
        return None

    plt.figure(figsize=figsize)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.show()
    return cloud


if "processed_text" in data.columns:
    # Guard against missing / non-string entries so " ".join cannot fail.
    processed_texts = data["processed_text"].fillna("").astype(str)

    # Combine all processed text
    all_text = " ".join(processed_texts)

    # Count word frequencies
    words = all_text.split()
    word_counts = Counter(words)

    # Get the most common words
    most_common_words = word_counts.most_common(20)

    # Display most common words (the same frame feeds the table and the bar plot)
    print("Most common words in the dataset:")
    common_words_df = pd.DataFrame(most_common_words, columns=["Word", "Count"])
    display(common_words_df)

    # Plot word frequency
    plt.figure(figsize=(14, 8))
    sns.barplot(x="Count", y="Word", data=common_words_df, palette="viridis")
    plt.title("Most Common Words")
    plt.xlabel("Count")
    plt.ylabel("Word")
    plt.show()

    # Create word cloud
    wordcloud = _show_wordcloud(all_text, "Word Cloud of All Text", (12, 12), 100)

    # Word frequency by sentiment
    if "sentiment" in data.columns:
        # dropna(): a NaN label never compares equal to itself, so it would
        # select no rows and produce an empty text.
        for sentiment in data["sentiment"].dropna().unique():
            # Get text for this sentiment
            sentiment_text = " ".join(processed_texts[data["sentiment"] == sentiment])

            # Count word frequencies
            sentiment_words = sentiment_text.split()
            sentiment_word_counts = Counter(sentiment_words)

            # Get the most common words
            sentiment_most_common = sentiment_word_counts.most_common(10)

            # Display most common words for this sentiment
            print(f"\nMost common words in {sentiment} sentiment:")
            display(pd.DataFrame(sentiment_most_common, columns=["Word", "Count"]))

            # Create word cloud for this sentiment; str() keeps numeric labels
            # (e.g. 0/1/2) from failing on .capitalize().
            sentiment_wordcloud = _show_wordcloud(
                sentiment_text,
                f"Word Cloud for {str(sentiment).capitalize()} Sentiment",
                (10, 10),
                50,
            )