In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import os

# --- Locate the project root and make it the working directory ---
# Candidates are the current working directory followed by each of its
# ancestors, nearest first. The first candidate that holds a
# "google_play_reviews" directory wins. is_dir() is used instead of exists()
# so that a regular file that merely shares the name is not mistaken for the
# project folder.
cwd = Path.cwd().resolve()

project_root = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / "google_play_reviews").is_dir()
    ),
    None,
)

if project_root is None:
    raise RuntimeError("Could not locate project root containing 'google_play_reviews' folder.")

# Relative paths such as "google_play_reviews/..." are resolved against the
# working directory, so switch to the project root before using them.
os.chdir(project_root)

print("Project root:", project_root)



Project root: /Users/wyr/data-ingestion-folder/google_play_reviews_project


In [3]:
# Folder that holds the processed review exports
processed_dir = Path("google_play_reviews/data/processed")

# Candidate files in ascending path order; the last entry is the one used.
# NOTE(review): this equals "most recent" only as long as the file names keep
# embedding a sortable timestamp — confirm against the export step.
processed_files = sorted(processed_dir.glob("reviews_processed_*.csv"))
if len(processed_files) == 0:
    raise FileNotFoundError("No processed CSV files found.")

csv_path = processed_files[-1]
print("Using file:", csv_path)

# Load the file and derive the analysis columns in a single chain:
#   review_date -> datetime (values that cannot be parsed become NaT)
#   word_len    -> number of whitespace-separated tokens (missing text -> 0)
df = pd.read_csv(csv_path).assign(
    review_date=lambda frame: pd.to_datetime(frame["review_date"], errors="coerce"),
    word_len=lambda frame: (
        frame["review_text"].fillna("").astype(str).str.split().str.len()
    ),
)

print("Rows:", len(df))
df.head()


Using file: google_play_reviews/data/processed/reviews_processed_20260206_173800_01ff77bf.csv
Rows: 5200


Unnamed: 0,review_uid,user_name,rating,review_text,review_date,thumbs_up,app_version,sort_mode,scrape_time,word_len
0,501eade0fcdf4cdb2cd9c7f938a23aaba6e3fc252ad2ed...,Ayush Kandhare,5,10/10 helps me for gooning,2026-02-05 17:37:22,0,1.2026.020,newest,2026-02-06 22:38:00.722671,5
1,b75a54086197e037b97bba8424ec22f30aa2dda3ef4c2e...,Thomas Lay,1,awesome,2026-02-05 17:37:14,0,1.2026.027,newest,2026-02-06 22:38:00.722678,1
2,33e7fe3ba36f2d9d68d684d304490a31189bcb3a801a3d...,Moks_moh Omboga,4,Excellent,2026-02-05 17:36:44,0,1.2026.020,newest,2026-02-06 22:38:00.722681,1
3,429cefedf252d0d26b24e682966ee6be20e8f3fe2ac822...,Ruthie,5,love it,2026-02-05 17:36:19,0,1.2026.027,newest,2026-02-06 22:38:00.722683,2
4,2bdeb57ef5721ce0cd83c7c6b5120603313436a809804c...,Valerie Eseohe,5,the best chat bot,2026-02-05 17:35:22,0,1.2026.027,newest,2026-02-06 22:38:00.722685,4


In [4]:
# Destination folder for the EDA figures; it is created (together with any
# missing parent folders) when absent, and reused when it already exists.
outdir = Path("google_play_reviews/reports/eda")
outdir.mkdir(exist_ok=True, parents=True)

# Show the absolute location so the saved files are easy to find
print("Saving plots to:", outdir.resolve())


Saving plots to: /Users/wyr/data-ingestion-folder/google_play_reviews_project/google_play_reviews/reports/eda


In [5]:
# Bar chart of how many reviews fall on each rating value.
fig, ax = plt.subplots(figsize=(6, 4))

# Count reviews per rating, ordered by the rating value itself
rating_counts = df["rating"].value_counts().sort_index()
rating_counts.plot(kind="bar", ax=ax)

ax.set_title("Rating Distribution")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
fig.tight_layout()

# Write the figure to disk, then close it to release it from pyplot
plot_path = outdir / "rating_distribution.png"
fig.savefig(plot_path)
plt.close(fig)

print("Saved:", plot_path)


Saved: google_play_reviews/reports/eda/rating_distribution.png


In [6]:
# Histogram of the per-review word counts stored in the "word_len" column.
fig, ax = plt.subplots(figsize=(6, 4))

df["word_len"].hist(bins=50, ax=ax)

ax.set_title("Word Count Distribution")
ax.set_xlabel("Word Count")
ax.set_ylabel("Frequency")
fig.tight_layout()

# Write the figure to disk, then close it to release it from pyplot
plot_path = outdir / "word_length_distribution.png"
fig.savefig(plot_path)
plt.close(fig)

print("Saved:", plot_path)


Saved: google_play_reviews/reports/eda/word_length_distribution.png


In [7]:
# Line chart of the number of reviews per calendar month.

# Bucket each review into its calendar month. Rows whose review_date is NaT
# get a NaT month and are left out by the groupby below.
df["month"] = df["review_date"].dt.to_period("M")

monthly_counts = df.groupby("month").size()

# groupby only returns months that contain at least one review. Without the
# missing months the line would be drawn straight across the gap, suggesting
# review volume that was never there, so fill every month between the first
# and last observed one with an explicit 0.
if not monthly_counts.empty:
    all_months = pd.period_range(
        monthly_counts.index.min(), monthly_counts.index.max(), freq="M"
    )
    monthly_counts = monthly_counts.reindex(all_months, fill_value=0).rename_axis("month")

plt.figure(figsize=(8, 4))

# Point markers keep the chart readable when the data covers a single month,
# where a bare line has no segment to draw.
monthly_counts.plot(marker="o")

plt.title("Monthly Review Volume")
plt.xlabel("Month")
plt.ylabel("Number of Reviews")
plt.xticks(rotation=45)
plt.tight_layout()

# Write the figure to disk, then close it to release it from pyplot
plot_path = outdir / "review_volume_over_time.png"
plt.savefig(plot_path)
plt.close()

print("Saved:", plot_path)


Saved: google_play_reviews/reports/eda/review_volume_over_time.png
