In [None]:
!pip install --upgrade supabase


In [None]:
import os
import pandas as pd
import requests
import base64
import time
import random
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo


In [None]:
# Spotify API credentials are read from the environment; fail fast when
# either one is missing or empty.
client_id = os.environ.get("SPOTIFY_CLIENT_ID")
client_secret = os.environ.get("SPOTIFY_CLIENT_SECRET")

if not (client_id and client_secret):
    raise ValueError("SPOTIFY_CLIENT_ID/SPOTIFY_CLIENT_SECRET is required")


In [None]:
from supabase import create_client, Client

# Supabase connection settings are read from the environment; both values
# must be present and non-empty.
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
if not (supabase_url and supabase_key):
    raise ValueError("SUPABASE_URL/SUPABASE_SERVICE_ROLE_KEY is required")

supabase: Client = create_client(supabase_url, supabase_key)

print("Loading Spotify IDs from Supabase (imd.external_ids)...")

# Fetch every (external_id, group_id) pair registered for service "spotify".
# NOTE(review): a single un-paginated select is issued here — confirm the
# API's row cap cannot truncate this result.
response = (
    supabase.schema("imd")
    .table("external_ids")
    .select("external_id, group_id")
    .eq("service", "spotify")
    .execute()
)

if not response.data:
    raise Exception("No data found in imd.external_ids for service=spotify.")

# Normalise into a two-column frame: spotify_id, group_id.
df_ids = (
    pd.json_normalize(response.data)
    .rename(columns={"external_id": "spotify_id"})
    .loc[:, ["spotify_id", "group_id"]]
)

# Every row whose spotify_id occurs more than once (all occurrences kept for
# reporting); when any exist, only the first occurrence of each ID survives.
duplicate_ids = df_ids[df_ids.duplicated(subset=["spotify_id"], keep=False)]
if len(duplicate_ids) > 0:
    print(f"Warning: {duplicate_ids['spotify_id'].nunique(dropna=False)} duplicated spotify_id found.")
    df_ids = df_ids.drop_duplicates(subset="spotify_id")

print(df_ids.head())


In [None]:

class TokenManager:
    """Cache a Spotify client-credentials access token and renew it on demand.

    The token is fetched lazily: nothing is requested until the first call to
    ``get_token`` (or ``force_refresh``).

    Args:
        cid: Spotify application client ID.
        secret: Spotify application client secret.
    """

    # Seconds subtracted from the lifetime reported by the token endpoint so
    # the token is renewed shortly before it actually expires.
    EXPIRY_MARGIN_SECONDS = 300
    # Timeout for the token request; without one a stalled connection would
    # block the whole run indefinitely.
    REQUEST_TIMEOUT_SECONDS = 10

    def __init__(self, cid, secret):
        self.client_id = cid
        self.client_secret = secret
        self.token = None
        # "Already expired" so the first get_token() call triggers a fetch.
        self.token_expiry_time = datetime.now()

    def _get_new_token(self):
        """Request a new token and store it, or set ``self.token`` to None on failure."""
        auth = base64.b64encode(f"{self.client_id}:{self.client_secret}".encode()).decode()
        headers = {'Authorization': f'Basic {auth}'}
        data = {'grant_type': 'client_credentials'}
        try:
            r = requests.post(
                'https://accounts.spotify.com/api/token',
                headers=headers,
                data=data,
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            r.raise_for_status()
            token_data = r.json()
            self.token = token_data['access_token']
            expires_in = token_data.get('expires_in', 3600)
            self.token_expiry_time = datetime.now() + timedelta(
                seconds=expires_in - self.EXPIRY_MARGIN_SECONDS
            )
            print("✅ Spotifyアクセストークンを正常に取得・更新しました。")
        except (requests.exceptions.RequestException, KeyError, ValueError) as e:
            # KeyError/ValueError cover a response body that is not valid JSON
            # or lacks 'access_token'; treat it like any other failed fetch.
            print(f"❌ トークン取得に失敗しました: {e}")
            self.token = None

    def get_token(self):
        """Return the cached token, fetching a new one when missing or expired.

        Returns:
            The access token string, or None when the fetch failed.
        """
        if self.token is None or datetime.now() >= self.token_expiry_time:
            print("トークンが期限切れ、または存在しないため、リフレッシュします。")
            self._get_new_token()
        return self.token

    def force_refresh(self):
        """Discard the cached token and fetch a new one regardless of expiry.

        Intended for callers that received HTTP 401 with a token this manager
        still considers valid.

        Returns:
            The new access token string, or None when the fetch failed.
        """
        print("401エラー検知。トークンを強制的にリフレッシュします。")
        self._get_new_token()
        return self.token

# Token manager built from the Spotify credentials; no token is requested
# here — the first fetch happens on the first get_token() call.
token_manager = TokenManager(client_id, client_secret)

In [None]:

def make_spotify_request(url, token_manager, max_retries=5):
    """GET ``url`` with a bearer token, retrying on transient failures.

    Retry policy per attempt:
      * 200 -> return the decoded JSON body.
      * 401 -> invalidate the token so the next attempt uses a fresh one.
      * 429 -> sleep for ``Retry-After`` seconds (5 when absent or unparsable).
      * 5xx / request exception -> exponential backoff with jitter.
      * any other status -> give up immediately.

    Args:
        url: Full request URL.
        token_manager: Object exposing ``get_token()``; ``force_refresh()`` is
            used on 401 when available, otherwise its ``token`` attribute is
            reset to None so ``get_token()`` fetches a new one.
        max_retries: Maximum number of attempts.

    Returns:
        The decoded JSON response, or None when the request did not succeed.

    Raises:
        Exception: when ``token_manager.get_token()`` yields no token.
    """
    for attempt in range(max_retries):
        try:
            token = token_manager.get_token()
            if not token:
                raise Exception("トークンが取得できません。")

            headers = {"Authorization": f"Bearer {token}"}
            r = requests.get(url, headers=headers, timeout=10)

            if r.status_code == 200:
                return r.json()
            elif r.status_code == 401:
                print(f"URL: {url}")
                # Retrying with the same rejected token can never succeed, so
                # make sure the next attempt gets a new one.
                refresh = getattr(token_manager, "force_refresh", None)
                if callable(refresh):
                    refresh()
                else:
                    token_manager.token = None
                continue
            elif r.status_code == 429:
                try:
                    wait_time = int(r.headers.get('Retry-After', 5))
                except (TypeError, ValueError):
                    # Header present but not a plain integer of seconds.
                    wait_time = 5
                print(f"レート制限(429)のため、{wait_time}秒待機します。")
                time.sleep(wait_time)
            elif r.status_code >= 500:
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                print(f"サーバーエラー({r.status_code}) リトライ {attempt + 1}。{wait_time:.2f}秒待機します。")
                time.sleep(wait_time)
            else:
                return None
        except requests.exceptions.RequestException as e:
            print(f"リクエスト例外発生: {e} リトライします...")
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            time.sleep(wait_time)
    return None

def get_artist_info(artist_id, token_manager):
    """Request the Spotify artist resource for ``artist_id``.

    Args:
        artist_id: Spotify artist ID, interpolated into the endpoint URL.
        token_manager: Passed through to ``make_spotify_request``.

    Returns:
        Whatever ``make_spotify_request`` returns for the artist endpoint.
    """
    endpoint = "https://api.spotify.com/v1/artists/{}".format(artist_id)
    artist = make_spotify_request(endpoint, token_manager)
    return artist

def get_top_track_popularities(artist_id, token_manager, top_n=5, market="JP"):
    """Fetch an artist's top tracks and their popularity scores.

    Args:
        artist_id: Spotify artist ID.
        token_manager: Passed through to ``make_spotify_request``.
        top_n: Maximum number of tracks to keep from the response.
        market: Value of the ``market`` query parameter (defaults to "JP").

    Returns:
        A ``(tracks, popularities)`` tuple: the first ``top_n`` track objects
        and a parallel list of their ``popularity`` values. A missing or null
        popularity is reported as 0. Both lists are empty when the request
        failed or the response carries no tracks.
    """
    url = f"https://api.spotify.com/v1/artists/{artist_id}/top-tracks?market={market}"
    data = make_spotify_request(url, token_manager)
    if not data or "tracks" not in data:
        return [], []
    # `or []` guards against an explicit null "tracks" field.
    tracks = (data["tracks"] or [])[:top_n]
    return tracks, [t.get('popularity') or 0 for t in tracks]

def count_recent_releases(tracks, days=30):
    """Count tracks whose album release date is at most ``days`` days old.

    Each track is expected to look like ``{'album': {'release_date': str}}``
    where the date string is ``YYYY-MM-DD``, ``YYYY-MM`` or ``YYYY``. Tracks
    that are malformed (missing keys, null album/date, unparsable or
    unexpected-length date) are skipped rather than aborting the count.

    Note: the comparison is ``today - release_date <= days``, so a release
    date in the future is counted as well.

    Args:
        tracks: Iterable of track dicts.
        days: Size of the look-back window in days.

    Returns:
        Number of tracks inside the window.
    """
    # The length of the date string selects the strptime format.
    formats_by_length = {10: "%Y-%m-%d", 7: "%Y-%m", 4: "%Y"}
    today = datetime.now().date()
    window = timedelta(days=days)
    count = 0
    for t in tracks:
        try:
            release_date_str = t['album']['release_date']
            fmt = formats_by_length.get(len(release_date_str))
            if fmt is None:
                continue
            release_date = datetime.strptime(release_date_str, fmt).date()
        except (ValueError, KeyError, TypeError):
            # TypeError covers a null track, album or release_date.
            continue
        if today - release_date <= window:
            count += 1
    return count

In [None]:

def fetch_snapshot(df_ids, token_manager):
    """Collect one snapshot row per Spotify artist ID.

    For every value in ``df_ids['spotify_id']`` this queries the artist info
    and top tracks and records name, popularity, follower count, the sum of
    top-track popularities and the number of recent releases. When the artist
    lookup fails the row falls back to ``'N/A'`` / 0 placeholders.

    Args:
        df_ids: DataFrame with a ``spotify_id`` column.
        token_manager: Passed through to the Spotify request helpers.

    Returns:
        DataFrame with columns spotify_id, name, artist_popularity, followers,
        track_popularity_sum, new_release_count (present even when ``df_ids``
        is empty).
    """
    columns = [
        'spotify_id',
        'name',
        'artist_popularity',
        'followers',
        'track_popularity_sum',
        'new_release_count',
    ]
    snapshot = []
    total = len(df_ids)
    # enumerate() gives the true position; the frame's index labels are not
    # contiguous once rows have been filtered or de-duplicated.
    for position, sid in enumerate(df_ids['spotify_id'], start=1):
        print(f"[{position}/{total}] {sid} のデータを取得中...")
        info = get_artist_info(sid, token_manager) or {}
        tracks, pops = get_top_track_popularities(sid, token_manager)
        new_count = count_recent_releases(tracks, days=30)

        snapshot.append({
            'spotify_id': sid,
            'name': info.get('name', 'N/A'),
            'artist_popularity': info.get('popularity', 0),
            # `or {}` tolerates an explicit null "followers" object.
            'followers': (info.get('followers') or {}).get('total', 0),
            'track_popularity_sum': sum(pops) if pops else 0,
            'new_release_count': new_count
        })
    return pd.DataFrame(snapshot, columns=columns)

In [None]:

# Initial pass over every known Spotify artist ID.
df_snapshot = fetch_snapshot(df_ids, token_manager)

# Up to three completion rounds: re-fetch rows that look incomplete — either
# the artist lookup failed (name 'N/A') or followers is 0 while popularity is
# positive.
for round_num in range(3):
    na_ids = df_snapshot[
        (df_snapshot['name'] == 'N/A') |
        ((df_snapshot['followers'] == 0) & (df_snapshot['artist_popularity'] > 0))
    ]['spotify_id']

    if len(na_ids) == 0:
        print("✅ 全てのデータの取得が完了しました。")
        break

    print(f"\n🔁 補完ラウンド {round_num + 1}: {len(na_ids)} 件を再取得します。")
    time.sleep(5)
    retry_df = fetch_snapshot(df_ids[df_ids['spotify_id'].isin(na_ids)], token_manager)

    # DataFrame.update overwrites with every non-NA value from the other
    # frame, so a retry that failed again (placeholder name 'N/A', zeros)
    # would wipe values obtained earlier. Apply only rows whose artist lookup
    # succeeded this time.
    retry_ok = retry_df[retry_df['name'] != 'N/A']
    if not retry_ok.empty:
        df_snapshot = df_snapshot.set_index('spotify_id')
        df_snapshot.update(retry_ok.set_index('spotify_id'))
        df_snapshot = df_snapshot.reset_index()

df_snapshot = df_snapshot.drop_duplicates(subset='spotify_id', keep='last')

In [None]:
# Snapshot date: SNAPSHOT_DATE from the environment when set and non-empty,
# otherwise today's date in the Asia/Tokyo time zone.
snapshot_date = os.getenv("SNAPSHOT_DATE")
if not snapshot_date:
    snapshot_date = datetime.now(ZoneInfo("Asia/Tokyo")).strftime("%Y-%m-%d")
print(f"Snapshot date: {snapshot_date}")

# Attach group_id to each snapshot row. Dropping any group_id left over from
# a previous run of this cell keeps the merge idempotent; otherwise pandas
# would emit group_id_x / group_id_y and the lookups below would fail.
df_snapshot = (
    df_snapshot
    .drop(columns=["group_id"], errors="ignore")
    .merge(df_ids, on="spotify_id", how="left")
)

missing = df_snapshot[df_snapshot["group_id"].isna()]
if not missing.empty:
    print(f"Warning: {len(missing)} rows missing group_id. They will be skipped.")
    print(missing["spotify_id"].head())

# Rows without a group_id are excluded from the upsert.
df_snapshot = df_snapshot.dropna(subset=["group_id"])

def to_int(value):
    """Coerce ``value`` to ``int``, returning 0 when that is not possible.

    Unconvertible inputs include None, non-numeric strings, NaN and infinite
    floats. Finite floats are truncated toward zero by ``int()``.

    Args:
        value: Any object.

    Returns:
        ``int(value)``, or 0 when the conversion raises.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError is what int() raises for +/-infinity.
        return 0

# Build plain-dict rows for the upsert. Numeric fields go through to_int and
# a non-string name is replaced by "N/A".
records = []
for _, row in df_snapshot.iterrows():
    records.append({
        "snapshot_date": snapshot_date,
        "group_id": row["group_id"],
        "spotify_id": row["spotify_id"],
        "name": row["name"] if isinstance(row["name"], str) else "N/A",
        "artist_popularity": to_int(row["artist_popularity"]),
        "followers": to_int(row["followers"]),
        "track_popularity_sum": to_int(row["track_popularity_sum"]),
        "new_release_count": to_int(row["new_release_count"]),
    })

if not records:
    raise Exception("No records to upsert.")

# An unset *or empty* BATCH_SIZE falls back to 500 (same treatment as
# SNAPSHOT_DATE). A non-positive value is rejected: zero is an invalid range
# step and a negative step would silently skip every batch.
batch_size = int(os.getenv("BATCH_SIZE") or "500")
if batch_size < 1:
    raise ValueError(f"BATCH_SIZE must be a positive integer, got {batch_size}")

# Upsert in chunks, keyed on (snapshot_date, group_id).
for start in range(0, len(records), batch_size):
    batch = records[start:start + batch_size]
    supabase.schema("ihc").table("artist_snapshots").upsert(
        batch,
        on_conflict="snapshot_date,group_id"
    ).execute()
    print(f"Upserted {start + len(batch)} / {len(records)}")

print("Done.")
