# Phase 1 Exploratory Data Analysis

This notebook accompanies the Phase 1 submission for the IMDb Movie Trends project. Run the cells in order after executing the preprocessing pipeline (`python src/preprocessing.py run`). Figures are saved to `reports/figures/` for inclusion in the written report.


In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from config import FIGURES_DIR, PROCESSED_DATA_DIR, ensure_directories

# Plot styling: apply seaborn's "whitegrid" theme first, then override a few
# matplotlib defaults so the sizes below are the ones that stay in effect.
sns.set_theme(style="whitegrid")
plt.rcParams.update(
    {
        "figure.figsize": (12, 6),
        "axes.titlesize": 14,
        "axes.labelsize": 12,
    }
)

# Prepare output locations. `ensure_directories` comes from the project's
# config module (presumably it creates the project folders -- confirm there);
# the explicit mkdir guarantees the figures folder exists either way.
ensure_directories()
FIGURES_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
# Artefacts expected from the preprocessing step.
stats_path = PROCESSED_DATA_DIR / "preprocessing_stats.json"
data_path = PROCESSED_DATA_DIR / "movies_processed.csv"

# Stop early with an actionable message when the dataset has not been built.
if not data_path.exists():
    raise FileNotFoundError(
        f"Processed dataset not found at {data_path}. Run `python src/preprocessing.py run` first."
    )


def _decode_json_cell(cell):
    """Decode a JSON string cell; any non-string value (e.g. NaN) becomes []."""
    if isinstance(cell, str):
        return json.loads(cell)
    return []


movies = pd.read_csv(data_path)

# These columns hold JSON text in the CSV; turn them back into Python objects.
# Columns that are absent from the file are skipped.
json_columns = ("genre_list", "director_ids", "writer_ids", "top_cast", "top_cast_ids")
for json_column in json_columns:
    if json_column not in movies.columns:
        continue
    movies[json_column] = movies[json_column].map(_decode_json_cell)

# The stats file is optional; fall back to an empty dict when it is missing.
if stats_path.exists():
    stats_summary = json.loads(stats_path.read_text())
else:
    stats_summary = {}

movies.head()

In [None]:
# Headline figures for the processed dataset, followed by a numeric summary
# table (the last expression is what the notebook cell displays).
first_year = movies["startYear"].min()
last_year = movies["startYear"].max()
mean_rating = movies["averageRating"].mean()
mean_runtime = movies["runtimeMinutes"].mean()
genre_total = movies["primary_genre"].nunique()

print(f"Rows: {len(movies):,}")
print(f"Year range: {first_year:.0f}-{last_year:.0f}")
print(f"Average rating: {mean_rating:.2f}")
print(f"Average runtime: {mean_runtime:.0f} minutes")
print(f"Unique primary genres: {genre_total}")

numeric_columns = ["startYear", "runtimeMinutes", "averageRating", "numVotes"]
movies[numeric_columns].describe().round(2)

In [None]:
# Six-panel overview of the main variables; the figure is saved to FIGURES_DIR.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle("Univariate Distributions", fontsize=16, fontweight="bold")
(ax_rating, ax_runtime, ax_year), (ax_votes, ax_genre, ax_era) = axes

# Styling shared by every histogram panel.
hist_style = {"bins": 40, "edgecolor": "black", "alpha": 0.75}

# Top-left: rating histogram with mean and median reference lines.
ratings = movies["averageRating"]
ax_rating.hist(ratings, color="steelblue", **hist_style)
ax_rating.axvline(ratings.mean(), color="red", linestyle="--", label="Mean")
ax_rating.axvline(ratings.median(), color="green", linestyle=":", label="Median")
ax_rating.set_title("Average Rating")
ax_rating.set_xlabel("Rating")
ax_rating.set_ylabel("Frequency")
ax_rating.legend()

# Top-middle: runtime histogram, restricted to titles of at most 200 minutes
# so the long tail does not compress the bulk of the distribution.
within_200 = movies["runtimeMinutes"] <= 200
runtime_trim = movies.loc[within_200, "runtimeMinutes"]
ax_runtime.hist(runtime_trim, color="coral", **hist_style)
ax_runtime.axvline(runtime_trim.mean(), color="red", linestyle="--", label="Mean")
ax_runtime.set_title("Runtime (<=200 minutes)")
ax_runtime.set_xlabel("Minutes")
ax_runtime.legend()

# Top-right: number of titles per release year, in chronological order.
year_counts = movies["startYear"].value_counts().sort_index()
ax_year.plot(year_counts.index, year_counts.values, color="darkgreen", linewidth=2)
ax_year.set_title("Movies Released by Year")
ax_year.set_xlabel("Year")
ax_year.set_ylabel("Count")
ax_year.grid(alpha=0.3)

# Bottom-left: vote counts on a log10 scale (+1 keeps zero-vote rows finite).
log_votes = np.log10(movies["numVotes"] + 1)
ax_votes.hist(log_votes, color="purple", **hist_style)
ax_votes.set_title("Number of Votes (log10)")
ax_votes.set_xlabel("log10(votes + 1)")

# Bottom-middle: ten most frequent primary genres; reversed so the largest
# bar is drawn at the top of the horizontal chart.
top_genres = movies["primary_genre"].value_counts().head(10)
ax_genre.barh(top_genres.index[::-1], top_genres.values[::-1], color="teal")
ax_genre.set_title("Top 10 Primary Genres")
ax_genre.set_xlabel("Count")

# Bottom-right: title counts per era, with slanted tick labels.
era_counts = movies["era"].value_counts().sort_index()
ax_era.bar(era_counts.index, era_counts.values, color="orange", alpha=0.8)
ax_era.set_title("Movies by Era")
ax_era.set_ylabel("Count")
plt.setp(ax_era.get_xticklabels(), rotation=30)

# Leave headroom for the suptitle, then save and display.
plt.tight_layout(rect=(0, 0, 1, 0.96))
output_path = FIGURES_DIR / "univariate_distributions.png"
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.show()
print(f"Saved figure: {output_path}")

In [None]:
# RQ1: per-decade rating level, spread and share of extreme ratings, followed
# by Levene's test for equal variances across decades from 1980 onwards.


def _extreme_share(ratings):
    """Return the percentage of ratings that are below 4 or above 8."""
    return 100 * (((ratings < 4) | (ratings > 8)).sum() / len(ratings))


# Compute every per-decade statistic in a single aggregation so each value is
# tied to its decade by label rather than by positional alignment.
rq1_stats = (
    movies.groupby("decade")["averageRating"]
    .agg(
        mean="mean",
        median="median",
        std="std",
        count="count",
        polarisation=_extreme_share,
    )
    .reset_index()
)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle("RQ1: Are modern audiences harder to please?", fontsize=16, fontweight="bold")

# Left panel: mean rating with a one-standard-deviation band.
axes[0].plot(rq1_stats["decade"], rq1_stats["mean"], marker="o", color="steelblue")
axes[0].fill_between(
    rq1_stats["decade"],
    rq1_stats["mean"] - rq1_stats["std"],
    rq1_stats["mean"] + rq1_stats["std"],
    color="steelblue",
    alpha=0.2,
)
# The plus-minus sign is written as an escape so it survives re-encoding of
# the source file (it had previously been degraded to "?").
axes[0].set_title("Mean Rating by Decade (\u00b11 SD)")
axes[0].set_xlabel("Decade")
axes[0].set_ylabel("Average Rating")
axes[0].grid(alpha=0.3)

# Middle panel: standard deviation of ratings per decade.
axes[1].plot(rq1_stats["decade"], rq1_stats["std"], marker="s", color="coral")
axes[1].set_title("Rating Variability")
axes[1].set_xlabel("Decade")
axes[1].set_ylabel("Standard Deviation")
axes[1].grid(alpha=0.3)

# Right panel: percentage of titles rated below 4 or above 8.
axes[2].plot(rq1_stats["decade"], rq1_stats["polarisation"], marker="^", color="darkgreen")
axes[2].set_title("Extremes (<4 or >8)")
axes[2].set_xlabel("Decade")
axes[2].set_ylabel("% of Titles")
axes[2].grid(alpha=0.3)

plt.tight_layout(rect=(0, 0, 1, 0.94))
output_path = FIGURES_DIR / "rq1_decade_variance.png"
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.show()
print(f"Saved figure: {output_path}")

# Levene's test on decades from 1980 onwards. Missing ratings are dropped per
# group because a single NaN would turn the test statistic into NaN.
recent = movies[movies["decade"] >= 1980]
decade_groups = [
    group["averageRating"].dropna().values for _, group in recent.groupby("decade")
]
if len(decade_groups) > 1:
    stat, pvalue = stats.levene(*decade_groups)
    print(f"Levene's test statistic={stat:.3f}, p-value={pvalue:.4f}")
else:
    print("Insufficient decades to run Levene's test in filtered data.")

In [None]:
# RQ2 preparation: one row per (movie, director) pair, then per-director
# rating statistics. Rows whose exploded director id is missing are discarded.
directors_exploded = movies.explode("director_ids")
directors_exploded = directors_exploded.dropna(subset=["director_ids"])

per_director = directors_exploded.groupby("director_ids")
director_summary = per_director.agg(
    avg_rating=pd.NamedAgg(column="averageRating", aggfunc="mean"),
    std_rating=pd.NamedAgg(column="averageRating", aggfunc="std"),
    movie_count=pd.NamedAgg(column="tconst", aggfunc="count"),
    total_votes=pd.NamedAgg(column="numVotes", aggfunc="sum"),
).reset_index()

# Keep only directors with at least `min_films` credited titles.
min_films = 5
has_enough_films = director_summary["movie_count"] >= min_films
prolific_directors = director_summary[has_enough_films]
print(f"Directors with >= {min_films} credited movies: {len(prolific_directors):,}")

# Cell output: the ten prolific directors with the highest average rating.
prolific_directors.sort_values("avg_rating", ascending=False).head(10)


In [None]:
# RQ2 figure: output volume vs. average rating (left) and the distribution of
# per-director rating standard deviations (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
fig.suptitle("RQ2: Director consistency", fontsize=16, fontweight="bold")
ax_volume, ax_spread = axes

# Left: each point is a prolific director, coloured by log10 of total votes.
vote_colour = np.log10(prolific_directors["total_votes"] + 1)
scatter = ax_volume.scatter(
    prolific_directors["movie_count"],
    prolific_directors["avg_rating"],
    c=vote_colour,
    cmap="viridis",
    alpha=0.7,
    edgecolor="black",
    linewidth=0.4,
)
ax_volume.set_xlabel("Number of movies")
ax_volume.set_ylabel("Average rating")
ax_volume.grid(alpha=0.3)
cbar = fig.colorbar(scatter, ax=ax_volume)
cbar.set_label("log10(total votes + 1)")

# Right: histogram of rating standard deviations with the mean marked.
rating_spread = prolific_directors["std_rating"]
ax_spread.hist(rating_spread.dropna(), bins=30, color="coral", edgecolor="black", alpha=0.75)
ax_spread.axvline(rating_spread.mean(), color="red", linestyle="--", label="Mean")
ax_spread.set_xlabel("Standard deviation of ratings")
ax_spread.set_ylabel("Number of directors")
ax_spread.set_title("Director rating variability")
ax_spread.legend()
ax_spread.grid(alpha=0.3)

# Leave headroom for the suptitle, then save and display.
plt.tight_layout(rect=(0, 0, 1, 0.94))
output_path = FIGURES_DIR / "rq2_director_consistency.png"
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.show()
print(f"Saved figure: {output_path}")

In [None]:
# RQ3: how success score and average rating vary with runtime for the three
# most common primary genres.
top_genres = movies["primary_genre"].value_counts().head(3).index
subset = movies[movies["primary_genre"].isin(top_genres)].copy()

# Ten-minute runtime bins covering 40-209 minutes. Labels are derived from the
# edges so their count always equals the number of bins (pd.cut raises a
# ValueError otherwise), and right=False makes each bin [lo, lo + 10) so that
# a label such as "40-49" matches the values it actually contains.
bin_edges = list(range(40, 211, 10))
bin_labels = [f"{lo}-{hi - 1}" for lo, hi in zip(bin_edges[:-1], bin_edges[1:])]
subset["runtime_bin"] = pd.cut(
    subset["runtimeMinutes"],
    bins=bin_edges,
    labels=bin_labels,
    right=False,
)

# observed=True limits the result to (genre, bin) pairs that actually occur;
# dropna() additionally removes groups whose mean is undefined.
runtime_summary = (
    subset.groupby(["primary_genre", "runtime_bin"], observed=True)
    .agg(success_score=("success_score", "mean"), avg_rating=("averageRating", "mean"), count=("tconst", "count"))
    .dropna()
    .reset_index()
)

fig, axes = plt.subplots(1, 2, figsize=(16, 5))
fig.suptitle("RQ3: Runtime sweet spots", fontsize=16, fontweight="bold")

# Left panel: one line per genre. Every genre is plotted against the full,
# ordered list of bin labels (NaN where a genre has no titles in a bin) so the
# categorical x-axis stays in ascending runtime order no matter which bins
# each genre happens to contain.
for genre in top_genres:
    genre_data = runtime_summary[runtime_summary["primary_genre"] == genre]
    score_by_bin = dict(
        zip(genre_data["runtime_bin"].astype(str), genre_data["success_score"])
    )
    axes[0].plot(
        bin_labels,
        [score_by_bin.get(label, float("nan")) for label in bin_labels],
        marker="o",
        label=genre,
    )
axes[0].set_xlabel("Runtime bin (minutes)")
axes[0].set_ylabel("Success score (rating x log votes)")
axes[0].legend()
axes[0].grid(alpha=0.3)
axes[0].tick_params(axis="x", rotation=45)

# Right panel: heatmap of mean rating by runtime bin and genre.
pivot = runtime_summary.pivot_table(
    values="avg_rating",
    index="runtime_bin",
    columns="primary_genre",
    observed=True,
)
sns.heatmap(pivot, annot=True, fmt=".2f", cmap="YlOrRd", ax=axes[1])
axes[1].set_xlabel("Primary genre")
axes[1].set_ylabel("Runtime bin (minutes)")

plt.tight_layout(rect=(0, 0, 1, 0.94))
output_path = FIGURES_DIR / "rq3_runtime_success.png"
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.show()
print(f"Saved figure: {output_path}")

In [None]:
# RQ4: relationship between the number of genres on a title and its rating.
# `diversity` summarises every num_genres value; `filtered` keeps 1-5 genres
# and feeds the box plot and the ANOVA.
diversity = (
    movies.groupby("num_genres")["averageRating"]
    .agg(["mean", "std", "count"])
    .reset_index()
)
filtered = movies[movies["num_genres"].between(1, 5)]

fig, axes = plt.subplots(1, 2, figsize=(16, 5))
fig.suptitle("RQ4: Genre diversity impact", fontsize=16, fontweight="bold")
ax_box, ax_mean = axes

# Left: one box per genre count, in ascending order.
genre_levels = sorted(filtered["num_genres"].unique())
ratings_per_level = [
    filtered.loc[filtered["num_genres"] == level, "averageRating"]
    for level in genre_levels
]
ax_box.boxplot(
    ratings_per_level,
    labels=[str(level) for level in genre_levels],
    patch_artist=True,
)
ax_box.set_xlabel("Number of genres")
ax_box.set_ylabel("Average rating")
ax_box.grid(alpha=0.3, axis="y")

# Right: mean rating per genre count with standard-deviation error bars.
ax_mean.errorbar(
    diversity["num_genres"],
    diversity["mean"],
    yerr=diversity["std"],
    marker="o",
    capsize=5,
    color="darkgreen",
)
ax_mean.set_xlabel("Number of genres")
ax_mean.set_ylabel("Average rating")
ax_mean.grid(alpha=0.3)

plt.tight_layout(rect=(0, 0, 1, 0.94))
output_path = FIGURES_DIR / "rq4_genre_diversity.png"
plt.savefig(output_path, dpi=300, bbox_inches="tight")
plt.show()
print(f"Saved figure: {output_path}")

# One-way ANOVA across genre counts; it needs at least two groups.
anova_groups = []
for _, group in filtered.groupby("num_genres"):
    if len(group) > 0:
        anova_groups.append(group["averageRating"].values)

if len(anova_groups) > 1:
    stat, pvalue = stats.f_oneway(*anova_groups)
    print(f"ANOVA F={stat:.3f}, p-value={pvalue:.4f}")
else:
    print("Insufficient genre groups for ANOVA.")

In [None]:
# RQ5: flag highly rated, little-voted titles ("hidden gems").
# All comparisons are made within a title's own decade.
by_decade = movies.groupby("decade")
decade_vote_rank = by_decade["numVotes"].rank(pct=True)
decade_mean_rating = by_decade["averageRating"].transform("mean")

movies["votes_percentile"] = decade_vote_rank
movies["rating_delta"] = movies["averageRating"] - decade_mean_rating
# A larger score means rated above the decade mean while ranking low on votes;
# the +0.1 keeps the denominator away from zero.
movies["underdog_score"] = movies["rating_delta"] / (movies["votes_percentile"] + 0.1)

# A hidden gem satisfies all three conditions below.
score_cutoff = movies["underdog_score"].quantile(0.95)
is_well_rated = movies["averageRating"] >= 7.5
is_little_voted = movies["numVotes"] <= 1000
is_top_underdog = movies["underdog_score"] >= score_cutoff
hidden_gems = movies[is_well_rated & is_little_voted & is_top_underdog].copy()

print(f"Hidden gems identified: {len(hidden_gems):,}")
display_columns = ["primaryTitle", "startYear", "averageRating", "numVotes", "primary_genre"]
hidden_gems[display_columns].head(10)

In [None]:
# RQ5 figures: hidden gems against a random background sample, then their
# breakdown by primary genre.

# Background: at most 5,000 titles, drawn with a fixed seed for repeatability.
sample = movies.sample(min(5000, len(movies)), random_state=42)

gems_fig, gems_ax = plt.subplots(figsize=(10, 6))
gems_ax.scatter(
    sample["numVotes"],
    sample["averageRating"],
    alpha=0.25,
    s=20,
    color="grey",
    label="All titles",
)
gems_ax.scatter(
    hidden_gems["numVotes"],
    hidden_gems["averageRating"],
    color="crimson",
    edgecolor="black",
    s=60,
    label="Hidden gems",
)
gems_ax.set_xscale("log")
gems_ax.set_xlabel("Number of votes (log scale)")
gems_ax.set_ylabel("Average rating")
gems_ax.set_title("RQ5: Hidden gems in rating-popularity space")
gems_ax.legend()
gems_ax.grid(alpha=0.3)
output_path = FIGURES_DIR / "rq5_hidden_gems.png"
gems_fig.savefig(output_path, dpi=300, bbox_inches="tight")
plt.show()
print(f"Saved figure: {output_path}")

# Ten most common genres among the hidden gems; reversed so the largest bar
# is drawn at the top of the horizontal chart.
hidden_genre_counts = hidden_gems["primary_genre"].value_counts().head(10)
genre_fig, genre_ax = plt.subplots(figsize=(8, 6))
genre_ax.barh(
    hidden_genre_counts.index[::-1],
    hidden_genre_counts.values[::-1],
    color="crimson",
)
genre_ax.set_xlabel("Count")
genre_ax.set_title("Hidden gems by genre")
genre_fig.tight_layout()
output_path = FIGURES_DIR / "rq5_hidden_gems_genres.png"
genre_fig.savefig(output_path, dpi=300, bbox_inches="tight")
plt.show()
print(f"Saved figure: {output_path}")

In [None]:
# RQ6: baseline random-forest regression of average rating.
# NOTE(review): `popularity_score` is produced outside this notebook; if it is
# derived from ratings, it would leak the target -- confirm in preprocessing.
feature_cols = ["runtimeMinutes", "num_genres", "startYear", "popularity_score"]
X = movies[feature_cols].copy()

# Append a 0/1 indicator column for each of the ten most common primary genres.
common_genres = movies["primary_genre"].value_counts().head(10).index
for genre in common_genres:
    X[f"genre_{genre}"] = movies["primary_genre"].eq(genre).astype(int)

y = movies["averageRating"].astype(float)

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train_scaled, y_train)

y_pred_train = rf.predict(X_train_scaled)
y_pred_test = rf.predict(X_test_scaled)

# RMSE is the square root of the test-set mean squared error.
test_mse = mean_squared_error(y_test, y_pred_test)
metrics = {
    "train_r2": r2_score(y_train, y_pred_train),
    "test_r2": r2_score(y_test, y_pred_test),
    "test_rmse": test_mse ** 0.5,
    "test_mae": mean_absolute_error(y_test, y_pred_test),
}
metrics

In [None]:
# RQ6 figures: predicted vs. actual ratings on the test split, then the ten
# largest feature importances of the fitted forest.
perf_fig, perf_ax = plt.subplots(figsize=(6, 6))
perf_ax.scatter(y_test, y_pred_test, alpha=0.4, edgecolor="none")

# Diagonal reference line spanning the combined range of actual and predicted.
lowest = min(y_test.min(), y_pred_test.min())
highest = max(y_test.max(), y_pred_test.max())
lims = [lowest, highest]
perf_ax.plot(lims, lims, "r--", label="Ideal")

perf_ax.set_xlabel("Actual rating")
perf_ax.set_ylabel("Predicted rating")
perf_ax.set_title(f"Random Forest performance (R^2={metrics['test_r2']:.3f})")
perf_ax.legend()
perf_ax.grid(alpha=0.3)
output_path = FIGURES_DIR / "rq6_model_performance.png"
perf_fig.savefig(output_path, dpi=300, bbox_inches="tight")
plt.show()
print(f"Saved figure: {output_path}")

# Importances labelled with the feature names, largest first.
feature_importance = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

# Reverse the top ten so the most important feature is drawn at the top.
imp_fig, imp_ax = plt.subplots(figsize=(8, 6))
feature_importance.head(10).iloc[::-1].plot.barh(color="steelblue", ax=imp_ax)
imp_ax.set_xlabel("Importance")
imp_ax.set_title("Top feature importances")
imp_fig.tight_layout()
output_path = FIGURES_DIR / "rq6_feature_importance.png"
imp_fig.savefig(output_path, dpi=300, bbox_inches="tight")
plt.show()
print(f"Saved figure: {output_path}")

## Next Steps

- Review generated figures in `reports/figures/` and add captions to the Phase 1 report.
- Validate statistical assumptions flagged during the analyses (e.g., variance homogeneity, normality).
- Gather literature references for each research question to strengthen Section 2 of the report.
- Plan Phase 2 deep dives: collaboration networks (RQ2), regional analysis (RQ7), and career trajectory modelling (RQ8).
