In [16]:
from pathlib import Path
import pandas as pd, numpy as np, re, json, matplotlib.pyplot as plt

# 📂 Path setup (based on the current project layout).
# When the kernel is started from inside "notebooks/", the project root is one level up;
# otherwise the working directory itself is treated as the root.
_cwd = Path.cwd()
BASE = _cwd.parent if _cwd.name == "notebooks" else _cwd
RAW_FILE = BASE.joinpath("spotify_cleaned_final_v2.csv")  # file at the project root, used as-is

PROC = BASE.joinpath("data", "processed")
ART = BASE.joinpath("artifacts")
ART_FIG = ART.joinpath("figures")
ART_MET = ART.joinpath("metrics")

# Create every output directory up front (ART itself is created as a parent).
for out_dir in (PROC, ART_FIG, ART_MET):
    out_dir.mkdir(parents=True, exist_ok=True)

# ✅ Load the data and report its shape.
df = pd.read_csv(RAW_FILE)
rows = df.shape[0]
cols = df.shape[1]
print(f"불러온 데이터: {rows}행 × {cols}열")
df.head()



불러온 데이터: 3120행 × 25열


Unnamed: 0,userid,month,revenue,subscription_plan,timestamp,Age,Gender,spotify_usage_period,spotify_listening_device,spotify_subscription_plan,...,music_Influencial_mood,music_lis_frequency,music_expl_method,music_recc_rating,pod_lis_frequency,fav_pod_genre,preffered_pod_format,pod_host_preference,preffered_pod_duration,pod_variety_satisfaction
0,1,2023-01,60000,Premium,2023-01-01,20-35,Female,More than 2 years,Smart speakers or voice assistants,Premium (paid subscription),...,Sadness or melancholy,leisure time,Playlists,3,Daily,Comedy,Interview,Both,Both,Neutral
1,2,2023-01,60000,Premium,2023-01-01,12-20,Male,More than 2 years,Computer or laptop,Premium (paid subscription),...,Social gatherings or parties,Workout session,Playlists,2,Several times a week,Comedy,Interview,Both,No preference / Not applicable,Satisfied
2,3,2023-01,60000,Premium,2023-01-01,35-60,Others,6 months to 1 year,Smart speakers or voice assistants,Premium (paid subscription),...,Relaxation and stress relief,"Study Hours, While Traveling",Playlists,4,Once a week,Sports,Interview,No preference / Not applicable,Both,Satisfied
3,4,2023-01,60000,Premium,2023-01-01,20-35,Female,1 year to 2 years,"Smartphone, Smart speakers or voice assistants",Premium (paid subscription),...,"Relaxation and stress relief, Social gathering...","Office hours, Workout session, leisure time","recommendations, Playlists",4,Never,No preference / Not applicable,No preference / Not applicable,No preference / Not applicable,No preference / Not applicable,Neutral
4,5,2023-01,60000,Premium,2023-01-01,20-35,Female,1 year to 2 years,Smartphone,Premium (paid subscription),...,Relaxation and stress relief,leisure time,"recommendations, Playlists",4,Rarely,Lifestyle and Health,Story telling,Well known individuals,Both,Neutral


In [None]:
# Convert revenue values to numbers
def to_number(x):
    """Best-effort conversion of a single value to ``float``.

    Parameters
    ----------
    x : Any scalar
        A number, or text that may carry currency symbols, thousands
        separators or other decoration (e.g. ``"₩60,000"``).

    Returns
    -------
    float
        The parsed value, or ``np.nan`` when ``x`` is missing, non-finite,
        contains no digits, or does not form a valid number once every
        character other than digits, ``.`` and ``-`` has been removed.
    """
    if pd.isna(x):
        return np.nan

    # Floats must not go through str(): their repr can use scientific notation
    # ("1e+20"), and stripping the "e"/"+" would silently turn that into 120.
    if isinstance(x, (float, np.floating)):
        return float(x) if np.isfinite(x) else np.nan

    s = re.sub(r"[^0-9.\-]", "", str(x))
    if not s:
        return np.nan

    # Leftovers such as "-", "." or "1.2.3" are not numbers; treat them as
    # missing instead of aborting the whole column conversion.
    try:
        return float(s)
    except ValueError:
        return np.nan

# Numeric revenue column derived from the raw "revenue" values.
df["revenue_num"] = df["revenue"].apply(to_number)

# Date handling: keep "month" as text, add an integer key (dashes removed)
# and a parsed timestamp (unparseable values become NaT).
df["month"] = df["month"].astype(str)
month_digits = df["month"].str.replace("-", "")
df["month_key"] = month_digits.astype(int)
raw_timestamp = df.get("timestamp")
df["timestamp_dt"] = pd.to_datetime(raw_timestamp, errors="coerce")

# Plan normalization: take the first plan column that exists, else fall back to "Unknown".
available_plan_cols = [
    name
    for name in ("subscription_plan", "spotify_subscription_plan")
    if name in df.columns
]
plan_col = available_plan_cols[0] if available_plan_cols else None
if plan_col:
    df["subscription_plan_norm"] = df[plan_col].astype(str)
else:
    df["subscription_plan_norm"] = "Unknown"

# ✅ Consistency checks (key order is kept stable because it defines the JSON layout).
na_counts = df.isna().sum().sort_values(ascending=False)
duplicate_mask = df.duplicated(subset=["userid", "month"])
month_values = df["month"]

checks = {}
checks["rows"] = len(df)
checks["distinct_users"] = int(df["userid"].nunique())
checks["duplicates"] = int(duplicate_mask.sum())
checks["na_top5"] = na_counts.head(5).to_dict()
checks["month_range"] = {"min": month_values.min(), "max": month_values.max()}
checks["plan_counts"] = df["subscription_plan_norm"].value_counts().to_dict()

# Serialise once; the same text is shown in the notebook and written to disk.
summary_text = json.dumps(checks, indent=2, ensure_ascii=False)
print(summary_text)

# Save the summary next to the other metrics artifacts.
with (ART_MET / "summary.json").open("w", encoding="utf-8") as f:
    f.write(summary_text)


{
  "rows": 3120,
  "distinct_users": 520,
  "duplicates": 0,
  "na_top5": {
    "userid": 0,
    "music_Influencial_mood": 0,
    "timestamp_dt": 0,
    "month_key": 0,
    "revenue_num": 0
  },
  "month_range": {
    "min": "2023-01",
    "max": "2023-06"
  },
  "plan_counts": {
    "Free": 2334,
    "Premium": 786
  }
}


In [18]:
# 📈 Monthly revenue trend
monthly = (
    df.groupby("month", as_index=False)["revenue_num"]
    .sum()
    .sort_values("month")
)
fig_rev, ax_rev = plt.subplots(figsize=(7, 4))
ax_rev.plot(monthly["month"], monthly["revenue_num"], marker="o", color="#1DB954")
ax_rev.set_title("Monthly Revenue (₩)")
ax_rev.set_xlabel("Month")
ax_rev.set_ylabel("Revenue")
ax_rev.grid(alpha=0.3)
fig_rev.tight_layout()
fig_rev.savefig(ART_FIG / "monthly_revenue.png", dpi=144)
plt.close(fig_rev)  # the figure is only written to disk, not displayed

# 📊 Distinct users per subscription plan in the latest month
latest = df["month"].max()
latest_rows = df.loc[df["month"] == latest]
users_mix = (
    latest_rows.groupby("subscription_plan_norm")["userid"]
    .nunique()
    .reset_index(name="users")
    .sort_values("users", ascending=False)
)

plan_labels = users_mix["subscription_plan_norm"]
# Plans whose name contains "rem" (e.g. "Premium") get the green accent; the rest are grey.
bar_colors = []
for label in plan_labels:
    bar_colors.append("#1DB954" if "rem" in label.lower() else "#BFBFBF")

fig_mix, ax_mix = plt.subplots(figsize=(6, 4))
ax_mix.bar(plan_labels, users_mix["users"], color=bar_colors)
ax_mix.set_title(f"Active Users by Plan — {latest}")
# Annotate each bar with its user count just above the bar top.
for pos, count in enumerate(users_mix["users"]):
    ax_mix.text(pos, count, f"{int(count):,}", ha="center", va="bottom")
fig_mix.tight_layout()
fig_mix.savefig(ART_FIG / "users_by_plan_latest.png", dpi=144)
plt.close(fig_mix)


In [19]:
# Persist the processed frame (including the derived columns) as Parquet, without the index.
out_path = PROC.joinpath("spotify_processed.parquet")
df.to_parquet(path=out_path, index=False)
print("저장 완료:", out_path)


저장 완료: /Users/yujoolee/project/StayOrSkip/data/processed/spotify_processed.parquet
