In [1]:
import json, re, ast, os
from pathlib import Path
from typing import Optional, List

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.preprocessing import normalize as sk_normalize
from scipy import sparse
from annoy import AnnoyIndex
import joblib

In [2]:
# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
# NOTE(review): no `progress_*` call is visible in this part of the notebook — confirm it is still needed.
tqdm.pandas()

In [3]:
# Candidate audio-feature column names, in the order they are looked up
# among the dataset's numeric columns.
COMMON_AUDIO_FEATS = [
    "danceability",
    "energy",
    "valence",
    "acousticness",
    "instrumentalness",
    "liveness",
    "speechiness",
    "tempo",
    "loudness",
    "duration_ms",
]

In [4]:
def std_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalized column names.

    Each label is converted to ``str``, stripped of surrounding whitespace,
    lower-cased, and every run of whitespace inside it is replaced by a
    single underscore. The input frame is not modified.

    Non-string labels (for example integer labels produced by a header-less
    file) are stringified first, so they no longer raise ``AttributeError``
    on ``.strip()``.

    Args:
        df: Frame whose column labels should be normalized.

    Returns:
        A new DataFrame with the same data and normalized string labels.
    """
    renamed = df.copy()
    renamed.columns = [
        re.sub(r"\s+", "_", str(col).strip().lower()) for col in renamed.columns
    ]
    return renamed

In [5]:
def pick(cols, *cands) -> Optional[str]:
    """Return the first of ``cands`` that is present in ``cols``.

    Candidates are tried in the order given; ``None`` is returned when
    none of them occurs in ``cols``.
    """
    available = set(cols)
    return next((name for name in cands if name in available), None)

In [6]:
def _clean_genre_tokens(items) -> List[str]:
    """Stringify, trim and lower-case each item, dropping blank ones."""
    return [str(t).strip().lower() for t in items if str(t).strip()]


def parse_genres_col(s: pd.Series) -> List[List[str]]:
    """Turn a genre column into one list of lower-cased genre tokens per row.

    Accepted cell values:

    * ``list`` / ``tuple`` / ``numpy.ndarray`` — used as the token sequence
      directly (list-typed columns read from parquet arrive as arrays);
    * ``str`` — first tried as a Python literal; a list, tuple or set literal
      supplies the tokens, anything else falls back to splitting the raw
      text on commas;
    * missing values, empty strings and any other type — an empty list.

    Tokens are stripped and lower-cased; blank tokens are dropped.

    Args:
        s: Series holding the raw genre values.

    Returns:
        A list with ``len(s)`` entries, each a list of genre strings.
    """
    out = []
    for x in s.fillna(""):
        if isinstance(x, np.ndarray):
            # Flatten so 0-d and multi-dimensional arrays iterate as plain items.
            out.append(_clean_genre_tokens(x.ravel().tolist()))
        elif isinstance(x, (list, tuple)):
            out.append(_clean_genre_tokens(x))
        elif isinstance(x, str):
            x = x.strip()
            if not x:
                out.append([])
                continue
            try:
                val = ast.literal_eval(x)
            except Exception:
                # Deliberate best effort: plain text such as "Movie" is not a
                # Python literal and is handled by the comma split below.
                val = None
            if isinstance(val, (list, tuple)):
                out.append(_clean_genre_tokens(val))
            elif isinstance(val, (set, frozenset)):
                # Sets have no stable order; sort for a deterministic result.
                out.append(sorted(_clean_genre_tokens(val)))
            else:
                out.append(_clean_genre_tokens(x.split(",")))
        else:
            out.append([])
    return out

In [7]:
# Input dataset location and output folder for every artifact written below.
DATA_PATH = Path("data/spotify_tracks.csv")
ART_DIR = Path("artifacts")

# Create the artifact folder (and any missing parents); no error if it already exists.
os.makedirs(ART_DIR, exist_ok=True)

In [8]:
# Choose the reader from the file extension: parquet for .parquet/.pq, CSV otherwise.
_is_parquet = DATA_PATH.suffix.lower() in {".parquet", ".pq"}
_read_table = pd.read_parquet if _is_parquet else pd.read_csv

# Load the table and normalize its column names in one step.
df = std_cols(_read_table(DATA_PATH))
df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [9]:
# Column roles are fixed by hand here (names as they appear after std_cols).
track_id, track_name, artist_name = "track_id", "track_name", "artist_name"
genres_col = "genre"
playlist_id = None  # no playlist column is configured

# Echo the chosen mapping so it is visible in the cell output.
for _label, _value in (
    ("track_id   :", track_id),
    ("track_name :", track_name),
    ("artist_name:", artist_name),
    ("genres_col :", genres_col),
):
    print(_label, _value)


track_id   : track_id
track_name : track_name
artist_name: artist_name
genres_col : genre


In [10]:
# Every numeric column in the dataset.
num_cols = list(df.select_dtypes(include=[np.number]).columns)

# Known audio features that are actually present and numeric, in canonical order.
inter = [feat for feat in COMMON_AUDIO_FEATS if feat in num_cols]

# Fall back to all numeric columns when fewer than three known features exist.
audio_cols = num_cols if len(inter) < 3 else inter
assert len(audio_cols) >= 3, f"Нужно ≥3 числовых признака, найдено {len(audio_cols)} → {audio_cols}"
audio_cols


['danceability',
 'energy',
 'valence',
 'acousticness',
 'instrumentalness',
 'liveness',
 'speechiness',
 'tempo',
 'loudness',
 'duration_ms']

In [None]:
# Numeric feature block: cast to float, replace missing values with 0.0,
# then standardize with a scaler fitted on the same data.
_audio_values = df[audio_cols].astype(float).fillna(0.0)
scaler = StandardScaler().fit(_audio_values)
X_num = scaler.transform(_audio_values)

# Feature parts are collected as sparse matrices for later horizontal stacking.
X_parts = [sparse.csr_matrix(X_num)]

In [None]:
# Rebuild the part list from the numeric block so this cell is idempotent:
# re-running it must not append a second copy of the genre matrix, which
# would silently change the embedding dimension.
X_parts = [sparse.csr_matrix(X_num)]

# Multi-hot encode genres when the configured genre column exists;
# `mlb` stays None otherwise so later cells can skip saving it.
mlb = None
if genres_col in df.columns:
    genres_lists = parse_genres_col(df[genres_col])
    mlb = MultiLabelBinarizer(sparse_output=True)
    # sparse_output=True yields a scipy sparse matrix; make sure it is CSR.
    X_gen = mlb.fit_transform(genres_lists).tocsr()
    X_parts.append(X_gen)

In [None]:
# Stack all feature parts side by side as CSR, then rescale every row
# with sklearn's `normalize` defaults.
_stacked = sparse.hstack(X_parts).tocsr()
X = sk_normalize(_stacked)
X.shape

(232725, 37)

In [None]:
# Annoy settings: vector length is the feature-matrix width; tree count is fixed.
_, dim = X.shape
trees = 100

In [15]:
# --- Build and persist the Annoy index --------------------------------------
index = AnnoyIndex(dim, metric="angular")
for i in tqdm(range(X.shape[0]), desc="Building Annoy"):
    # Each sparse row is densified to a flat float32 vector before insertion.
    v = X.getrow(i).toarray().ravel().astype("float32")
    index.add_item(i, v)

index.build(trees)
index.save(str(ART_DIR / "annoy_index.ann"))

# --- Persist the fitted transformers ----------------------------------------
joblib.dump(scaler, ART_DIR / "scaler.joblib")
if mlb is not None:
    joblib.dump(mlb, ART_DIR / "mlb_genres.joblib")

# --- Persist the row_id -> track metadata map -------------------------------
# Keep only the metadata columns that exist in this dataset.
keep_cols = [c for c in [track_id, track_name, artist_name, "image_url","preview_url","album_name","popularity","year"] if c in df.columns]
if not keep_cols:
    keep_cols = [track_name]

id_map = df[keep_cols].copy()
# row_id is the positional index used as the Annoy item id above.
id_map.insert(0, "row_id", np.arange(len(id_map)))
id_map.to_parquet(ART_DIR / "id_map.parquet", index=False)

# --- Persist feature/column configuration -----------------------------------
# ensure_ascii=False writes non-ASCII text verbatim, so the files are opened
# explicitly as UTF-8 instead of relying on the platform's locale encoding.
with open(ART_DIR / "feature_cols.json", "w", encoding="utf-8") as f:
    json.dump({"audio_cols": audio_cols, "genres_col": genres_col}, f, ensure_ascii=False, indent=2)

with open(ART_DIR / "meta.json", "w", encoding="utf-8") as f:
    json.dump({
        "track_id_col": track_id,
        "track_name_col": track_name,
        "artist_name_col": artist_name,
        "playlist_id_col": playlist_id,
        "n_rows": int(df.shape[0]),
        "dim": int(dim),
        "trees": int(trees)
    }, f, ensure_ascii=False, indent=2)

print("✅ Готово! Артефакты в:", ART_DIR.resolve())

Building Annoy: 100%|██████████| 232725/232725 [00:05<00:00, 44744.96it/s]


✅ Готово! Артефакты в: /Users/akdodovkhusrav/Desktop/Zypl_ai/hw2/artifacts


In [2]:
# Read back the id map written by the build cell to confirm its columns.
# The path is derived from ART_DIR rather than a machine-specific absolute
# path, and the format matches what is written (`id_map.parquet`).
# NOTE: this rebinds `df` to the id map, as the original cell did.
df = pd.read_parquet(ART_DIR / "id_map.parquet")
print(df.columns.tolist())
print(df.head())


['row_id', 'track_id', 'track_name', 'artist_name', 'popularity']
   row_id                track_id                        track_name  \
0       0  0BRjO6ga9RKCKjfDqeFgWV       C'est beau de faire un Show   
1       1  0BjC1NfoEOOusryehmNudP  Perdu d'avance (par Gad Elmaleh)   
2       2  0CoSDzoNIKCRs124s9uTVy    Don't Let Me Be Lonely Tonight   
3       3  0Gc6TVm52BwZD07Ki6tIvf    Dis-moi Monsieur Gordon Cooper   
4       4  0IuslXpMROHdEPvSl1fTQK                         Ouverture   

         artist_name  popularity  
0     Henri Salvador           0  
1  Martin & les fées           1  
2    Joseph Williams           3  
3     Henri Salvador           0  
4       Fabien Nataf           4  
