# Data Exploration and Analysis

This notebook explores song lyrics data and prepares it for sentiment analysis.

In [None]:
import sys
import os
sys.path.append(os.path.abspath('../src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sentiment_analysis.data_utils import create_sample_data, get_data_path
from sentiment_analysis.preprocessing import preprocess_lyrics_dataframe, extract_features

# Render matplotlib figures inline, directly beneath the cell that draws them.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn later in this notebook.
sns.set_style('whitegrid')

## Load Data

Load your song lyrics dataset. For this example, we use the sample data returned by `create_sample_data()`; replace that call with your own loader to analyze real data.

In [None]:
# Build the working DataFrame from the sample-data helper, report its
# dimensions, and preview the first rows as the cell's output.
df = create_sample_data()
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
df.head()

## Data Overview

In [None]:
# Summarize the frame in three sections: column dtypes, per-column null
# counts, and descriptive statistics. Each heading and its summary are
# emitted by one print call, separated by a newline.
print("Data Types:", df.dtypes, sep="\n")
print("\nMissing Values:", df.isnull().sum(), sep="\n")
print("\nBasic Statistics:", df.describe(), sep="\n")

## Feature Extraction

In [None]:
# Extract features from lyrics and attach them to `df` as new columns.
# NOTE(review): assumes extract_features returns one record (presumably a
# dict of feature name -> value) per lyric -- confirm against its definition.
features = df['lyrics'].apply(extract_features)

# Reuse df's own index so the feature rows line up with their songs even when
# df does not carry a default 0..n-1 index (e.g. after filtering or sampling).
# Without this, concat aligns on mismatched labels and introduces NaN rows.
features_df = pd.DataFrame(features.tolist(), index=df.index)

# Drop feature columns left behind by an earlier run of this cell so that
# re-running it replaces them instead of appending duplicate column names
# (duplicates would turn df['word_count'] into a DataFrame downstream).
df = pd.concat(
    [df.drop(columns=features_df.columns, errors='ignore'), features_df],
    axis=1,
)
df.head()

## Exploratory Visualizations

In [None]:
# Histogram of per-song word counts, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['word_count'], bins=20, edgecolor='black')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Word Count in Lyrics')
plt.show()

In [None]:
# Bar chart of how many songs fall into each genre; skipped entirely when
# the dataset has no 'genre' column.
if 'genre' in df.columns:
    genre_counts = df['genre'].value_counts()
    fig, ax = plt.subplots(figsize=(10, 6))
    genre_counts.plot.bar(ax=ax)
    ax.set_xlabel('Genre')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Genres')
    # Tilt the category labels so longer genre names stay readable.
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

In [None]:
# Horizontal bar chart of the ten artists with the most songs; skipped
# entirely when the dataset has no 'artist' column.
if 'artist' in df.columns:
    top_artists = df['artist'].value_counts().head(10)
    fig, ax = plt.subplots(figsize=(10, 6))
    top_artists.plot.barh(ax=ax)
    ax.set_xlabel('Count')
    ax.set_ylabel('Artist')
    ax.set_title('Top 10 Artists by Song Count')
    plt.show()

## Data Quality Check

In [None]:
# Check for empty lyrics.
# Missing values (NaN/None) are treated as empty too: a plain
# `.str.strip() == ''` comparison evaluates to False for NaN, so songs with
# no lyrics at all would otherwise go uncounted.
is_empty_lyrics = df['lyrics'].fillna('').str.strip() == ''
empty_lyrics = df[is_empty_lyrics]
print(f"Number of songs with empty lyrics: {len(empty_lyrics)}")

# Check for very short lyrics (relies on the 'word_count' feature column
# added by the feature-extraction step).
short_lyrics = df[df['word_count'] < 10]
print(f"Number of songs with less than 10 words: {len(short_lyrics)}")

## Save Processed Data

In [None]:
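# Uncomment the two lines below to write the feature-augmented DataFrame to CSV.
# The ../data/processed directory must already exist before saving.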
# df.to_csv('../data/processed/songs_with_features.csv', index=False)
# print("Data saved successfully!")