# Music & Mental Health recommendation system preprocess
Genre-level aggregation으로 spotify dataset feature 통합 (genre=item 부족한 feature 보강)

In [1]:
# 프로젝트 루트 설정
from pathlib import Path

PROJECT_ROOT = Path(__file__).parent.parent if '__file__' in globals() else Path.cwd().parent

In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
import pickle

## mxmh dataset

- kaggle: https://www.kaggle.com/datasets/catherinerasgaitis/mxmh-survey-results

In [3]:
mxmh_raw_df = pd.read_csv(PROJECT_ROOT / "data" / "raw" / "mxmh_survey_results.csv")
mxmh_raw_df

Unnamed: 0,Timestamp,Age,Primary streaming service,Hours per day,While working,Instrumentalist,Composer,Fav genre,Exploratory,Foreign languages,...,Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects,Permissions
0,8/27/2022 19:29:02,18.0,Spotify,3.0,Yes,Yes,Yes,Latin,Yes,Yes,...,Sometimes,Very frequently,Never,Sometimes,3.0,0.0,1.0,0.0,,I understand.
1,8/27/2022 19:57:31,63.0,Pandora,1.5,Yes,No,No,Rock,Yes,No,...,Sometimes,Rarely,Very frequently,Rarely,7.0,2.0,2.0,1.0,,I understand.
2,8/27/2022 21:28:18,18.0,Spotify,4.0,No,No,No,Video game music,No,Yes,...,Never,Rarely,Rarely,Very frequently,7.0,7.0,10.0,2.0,No effect,I understand.
3,8/27/2022 21:40:40,61.0,YouTube Music,2.5,Yes,No,Yes,Jazz,Yes,Yes,...,Sometimes,Never,Never,Never,9.0,7.0,3.0,3.0,Improve,I understand.
4,8/27/2022 21:54:47,18.0,Spotify,4.0,Yes,No,No,R&B,Yes,No,...,Very frequently,Very frequently,Never,Rarely,7.0,2.0,5.0,9.0,Improve,I understand.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,10/30/2022 14:37:28,17.0,Spotify,2.0,Yes,Yes,No,Rock,Yes,Yes,...,Never,Rarely,Very frequently,Never,7.0,6.0,0.0,9.0,Improve,I understand.
732,11/1/2022 22:26:42,18.0,Spotify,1.0,Yes,Yes,No,Pop,Yes,Yes,...,Never,Never,Sometimes,Sometimes,3.0,2.0,2.0,5.0,Improve,I understand.
733,11/3/2022 23:24:38,19.0,Other streaming service,6.0,Yes,No,Yes,Rap,Yes,No,...,Sometimes,Sometimes,Rarely,Rarely,2.0,2.0,2.0,2.0,Improve,I understand.
734,11/4/2022 17:31:47,19.0,Spotify,5.0,Yes,Yes,No,Classical,No,No,...,Never,Never,Never,Sometimes,2.0,3.0,2.0,1.0,Improve,I understand.


In [4]:
mxmh_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 33 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Timestamp                     736 non-null    object 
 1   Age                           735 non-null    float64
 2   Primary streaming service     735 non-null    object 
 3   Hours per day                 736 non-null    float64
 4   While working                 733 non-null    object 
 5   Instrumentalist               732 non-null    object 
 6   Composer                      735 non-null    object 
 7   Fav genre                     736 non-null    object 
 8   Exploratory                   736 non-null    object 
 9   Foreign languages             732 non-null    object 
 10  BPM                           629 non-null    float64
 11  Frequency [Classical]         736 non-null    object 
 12  Frequency [Country]           736 non-null    object 
 13  Frequ

- 33개의 피처가 존재한다. 7개의 피처가 float형 26개의 피처가 object형으로 대부분의 피처가 문자열이다.
- main feature 
    - **Mental Health related featrues** <br/>
        Anxiety, Depression, Insomnia, OCD 
    - **User-Genre Intreaction: Frequency** <br/>
        Frequency [Classical] Frequency [Country] Frequency [EDM] Frequency [Folk] Frequency [Gospel] Frequency [Hip hop] Frequency [Jazz] Frequency [K pop] Frequency [Latin] Frequency [Lofi] Frequency [Metal] Frequency [Pop] Frequency [R&B] Frequency [Rock]
    - **제외된 장르**: Rap, Video game music (Spotify 데이터에 없음)

- main feature에는 결측치가 없으므로 별도로 Null 처리를 하지 않는다.

### genre frequency preprocessing

In [5]:
mxmh_df = mxmh_raw_df.copy()

In [6]:
frequency_cols = [col for col in mxmh_df.columns if col.startswith('Frequency [')]
all_unique_values = set()
for col in frequency_cols:
    all_unique_values.update(mxmh_df[col].dropna().unique())
print(sorted(all_unique_values))

['Never', 'Rarely', 'Sometimes', 'Very frequently']


칼럼 값이 Never, Rarely, Sometimes, Very frequently임을 확인할 수 있다. 문자열을 숫자로 매핑해야 한다.

In [7]:
frequency_mapping = {
    'Never': 0,
    'Rarely': 1,
    'Sometimes': 2,
    'Very frequently': 3
}

for col in frequency_cols:
    mxmh_df[col] = (
        mxmh_df[col]
        .astype(str)
        .str.strip()
        .map(frequency_mapping)
        .fillna(0)
        .astype(int)
    )

- 유저별 대응되는 id 가 없으므로 user_id 생성

In [8]:
mxmh_df['user_id'] = mxmh_df.index.astype(str)

- genre indexing

In [9]:
mxmh_genres = [
    'Classical', 'Country', 'EDM', 'Folk', 'Gospel', 'Hip hop',
    'Jazz', 'K pop', 'Latin', 'Lofi', 'Metal', 'Pop', 'R&B',
    'Rock'
]
# Rap과 Video game music 제거 (Spotify에 해당 장르가 없음)

각 장르 이름을 모델링에 사용할 수 있는 item id로 변환

In [10]:
genre_to_item_id = {genre: f"genre_{genre.lower().replace(' ', '_')}" for genre in mxmh_genres}
item_id_to_genre = {v: k for k, v in genre_to_item_id.items()}

문자열 item id와 숫자 index 매핑

In [11]:
genre_enc = LabelEncoder()
genre_item_ids = [genre_to_item_id[g] for g in mxmh_genres]
genre_enc.fit(genre_item_ids)
genre_to_idx = {g: genre_enc.transform([g])[0] for g in genre_item_ids}
idx_to_genre = {v: k for k, v in genre_to_idx.items()}

- fav genre 칼럼의 값을 활용하기 위해 fav_genre일경우 weight를 부여한다. 
    - 이는 이후에 조정해야할 hyperparameter 값으로 모델의 결과에 따라 weight를 drop하거나 값을 조정할 예정이다.

In [12]:
FAV_GENRE_WEIGHT = 1.5

### User-Item Interaction Matrix 생성 (COO sparse)

- mxmh 데이터셋에서 아이템은 music genre에 해당하므로, 유저-아이템 행렬(User-Genre Interaction Matrix)을 준비한다.
    - 각 유저가 14개 장르를 얼마나 자주 듣는지를 interaction 값으로 변환한다. (Rap, Video game music 제외)
    - 이를 LightFM 모델에서 학습할 수 있는 COO sparse matrix 형태로 생성한다.

In [13]:
rows, cols, vals = [], [], []

for _, row in mxmh_df.iterrows():
    user_idx = row['user_id']
    fav_genre = row['Fav genre']
    
    for genre in mxmh_genres:
        freq_value = float(row[f'Frequency [{genre}]'])
        
        if genre == fav_genre:
            freq_value = freq_value * FAV_GENRE_WEIGHT
        
        genre_item_id = genre_to_item_id[genre]
        genre_idx = genre_to_idx[genre_item_id]
        rows.append(user_idx)
        cols.append(genre_idx)
        vals.append(freq_value)

- 추후 여러 모델 비교를 위해, sparsity가 0%인 matrix이어도 호환성이 가장 용이한 coo matrix로 생성

In [14]:
interaction_matrix = coo_matrix(
    (vals, (rows, cols)),
    shape=(len(mxmh_df), len(mxmh_genres)),
    dtype='float32'
)

In [15]:
print(f"{interaction_matrix.shape}")
print(f"non-zero interactions: {interaction_matrix.nnz}")

(736, 14)
non-zero interactions: 10304


## Spotify Dataset

In [16]:
spotify_file = PROJECT_ROOT / "data" / "raw" / "spotify_tracks_dataset.csv"
spotify_raw_df = pd.read_csv(spotify_file)

In [17]:
spotify_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [18]:
spotify_raw_df.describe()

Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0
mean,56999.5,33.238535,228029.2,0.5668,0.641383,5.30914,-8.25896,0.637553,0.084652,0.31491,0.15605,0.213553,0.474068,122.147837,3.904035
std,32909.109681,22.305078,107297.7,0.173542,0.251529,3.559987,5.029337,0.480709,0.105732,0.332523,0.309555,0.190378,0.259261,29.978197,0.432621
min,0.0,0.0,0.0,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28499.75,17.0,174066.0,0.456,0.472,2.0,-10.013,0.0,0.0359,0.0169,0.0,0.098,0.26,99.21875,4.0
50%,56999.5,35.0,212906.0,0.58,0.685,5.0,-7.004,1.0,0.0489,0.169,4.2e-05,0.132,0.464,122.017,4.0
75%,85499.25,50.0,261506.0,0.695,0.854,8.0,-5.003,1.0,0.0845,0.598,0.049,0.273,0.683,140.071,4.0
max,113999.0,100.0,5237295.0,0.985,1.0,11.0,4.532,1.0,0.965,0.996,1.0,1.0,0.995,243.372,5.0


In [19]:
spotify_df = spotify_raw_df.copy()

### spotify genre mapping 
mxmh ds와 공통 키인 genre를 기반으로 매핑

In [20]:
genre_col = 'track_genre'

print(f"고유 장르 수: {spotify_df[genre_col].nunique()}")
print(f"고유 장르 목록:{sorted(spotify_df[genre_col].unique())}")

고유 장르 수: 114
고유 장르 목록:['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient', 'anime', 'black-metal', 'bluegrass', 'blues', 'brazil', 'breakbeat', 'british', 'cantopop', 'chicago-house', 'children', 'chill', 'classical', 'club', 'comedy', 'country', 'dance', 'dancehall', 'death-metal', 'deep-house', 'detroit-techno', 'disco', 'disney', 'drum-and-bass', 'dub', 'dubstep', 'edm', 'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk', 'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove', 'grunge', 'guitar', 'happy', 'hard-rock', 'hardcore', 'hardstyle', 'heavy-metal', 'hip-hop', 'honky-tonk', 'house', 'idm', 'indian', 'indie', 'indie-pop', 'industrial', 'iranian', 'j-dance', 'j-idol', 'j-pop', 'j-rock', 'jazz', 'k-pop', 'kids', 'latin', 'latino', 'malay', 'mandopop', 'metal', 'metalcore', 'minimal-techno', 'mpb', 'new-age', 'opera', 'pagode', 'party', 'piano', 'pop', 'pop-film', 'power-pop', 'progressive-house', 'psych-rock', 'punk', 'punk-rock', 'r-n-b', 'reggae'

In [21]:
if spotify_df is not None:
    spotify_to_mxmh_genre_map = {
        # Lofi
        'chill': 'Lofi',
        
        # Hip hop
        'hip-hop': 'Hip hop',
        'trip-hop': 'Hip hop',
        
        # Pop
        'pop': 'Pop',
        'indie-pop': 'Pop',
        'synth-pop': 'Pop',
        'power-pop': 'Pop',
        'pop-film': 'Pop',
        
        # K pop
        'k-pop': 'K pop',
        
        # Classical
        'classical': 'Classical',
        'opera': 'Classical',
        
        # Metal
        'metal': 'Metal',
        'heavy-metal': 'Metal',
        'metalcore': 'Metal',
        'death-metal': 'Metal',
        'black-metal': 'Metal',
        'hardcore': 'Metal',
        'grindcore': 'Metal',
        
        # Rock
        'rock': 'Rock',
        'alt-rock': 'Rock',
        'hard-rock': 'Rock',
        'punk-rock': 'Rock',
        'rock-n-roll': 'Rock',
        'rockabilly': 'Rock',
        'grunge': 'Rock',
        'psych-rock': 'Rock',
        'alternative': 'Rock',
        'indie': 'Rock',
        'punk': 'Rock',
        'emo': 'Rock',
        
        # EDM
        'edm': 'EDM',
        'electronic': 'EDM',
        'house': 'EDM',
        'techno': 'EDM',
        'trance': 'EDM',
        'dubstep': 'EDM',
        'deep-house': 'EDM',
        'detroit-techno': 'EDM',
        'minimal-techno': 'EDM',
        'progressive-house': 'EDM',
        'chicago-house': 'EDM',
        'electro': 'EDM',
        'drum-and-bass': 'EDM',
        'dance': 'EDM',
        'club': 'EDM',
        'disco': 'EDM',
        'dancehall': 'EDM',
        
        # Folk
        'folk': 'Folk',
        'acoustic': 'Folk',
        
        # Latin
        'latin': 'Latin',
        'latino': 'Latin',
        'reggaeton': 'Latin',
        'salsa': 'Latin',
        'samba': 'Latin',
        'brazil': 'Latin',
        'forro': 'Latin',
        'pagode': 'Latin',
        'sertanejo': 'Latin',
        'reggae': 'Latin',
        
        # R&B
        'r-n-b': 'R&B',
        'soul': 'R&B',
        
        # Country
        'country': 'Country',
        'honky-tonk': 'Country',
        'bluegrass': 'Country',
        
        # Jazz
        'jazz': 'Jazz',
        
        # Gospel
        'gospel': 'Gospel',
    }

In [22]:
spotify_df['genre_lower'] = spotify_df['track_genre'].str.lower().str.strip()
spotify_df['mxmh_genre'] = spotify_df['genre_lower'].map(spotify_to_mxmh_genre_map)
        
spotify_mapped = spotify_df[spotify_df['mxmh_genre'].notna()].copy()

### Item Features 생성

- temp featrue의 스케일이 달라서 정규화

In [23]:
feature_cols = [
    'valence', 'energy', 'danceability', 'acousticness',
    'instrumentalness', 'tempo'
]

In [24]:
tempo_min = spotify_mapped['tempo'].min()
tempo_max = spotify_mapped['tempo'].max()
spotify_mapped['tempo_normalized'] = (
        (spotify_mapped['tempo'] - tempo_min) / (tempo_max - tempo_min)
)

In [25]:
genre_aggregated = spotify_mapped.groupby('mxmh_genre')[feature_cols].mean().reset_index()

In [32]:
genre_features_with_spotify = []

for genre_item_id in genre_item_ids:
    genre_name = item_id_to_genre[genre_item_id]
    features = {}

    genre_key = f"genre_{genre_name.lower().replace(' ', '_')}"
    features[genre_key] = 1

    # Spotify 속성 추가
    genre_row = genre_aggregated[genre_aggregated['mxmh_genre'] == genre_name]
    
    for col in feature_cols:
        if col in genre_row.columns and pd.notna(genre_row[col].values[0]):
            features[col] = float(genre_row[col].values[0])
    
    genre_features_with_spotify.append(features)

In [33]:
pd.DataFrame(genre_features_with_spotify[:5]) # 샘플 확인

Unnamed: 0,genre_classical,valence,energy,danceability,acousticness,instrumentalness,tempo,genre_country,genre_edm,genre_folk,genre_gospel
0,1.0,0.298138,0.253441,0.347743,0.85749,0.391061,106.767175,,,,
1,,0.596065,0.498014,0.553736,0.52994,0.066496,122.36633,1.0,,,
2,,0.434287,0.742981,0.658669,0.11126,0.270693,125.698718,,1.0,,
3,,0.457313,0.490588,0.554053,0.521074,0.038005,118.740075,,,1.0,
4,,0.320641,0.576256,0.473298,0.376891,0.003307,125.625127,,,,1.0


In [None]:
item_feature_vectorizer = DictVectorizer(sparse=True)
item_features = item_feature_vectorizer.fit_transform(genre_features_with_spotify).astype('float32')
item_feature_labels = item_feature_vectorizer.get_feature_names_out()

### User Features 생성

- 프로젝트 목적에 맞게 'Anxiety', 'Depression', 'Insomnia', 'OCD'와 같은 mental health 지수에만 집중
- age, hours per use 와 같은 칼럼은 미사용

In [37]:
user_feature_dicts = []
for _, row in mxmh_df.iterrows():
    features = {}
    
    for mh in ['Anxiety', 'Depression', 'Insomnia', 'OCD']:
        if mh in mxmh_df.columns and pd.notna(row[mh]):
            features[mh] = float(row[mh])
    
    user_feature_dicts.append(features)

In [38]:
user_feature_vectorizer = DictVectorizer(sparse=True)
user_features = user_feature_vectorizer.fit_transform(user_feature_dicts).astype('float32')
user_feature_labels = user_feature_vectorizer.get_feature_names_out()

In [41]:
# mxmh 전처리 데이터 저장
mxmh_with_spotify_stg_df = {
    "interaction_matrix": interaction_matrix.tocsr(),  # CSR format for LightFM
    "item_features": item_features.tocsr(),
    "item_feature_labels": item_feature_labels,
    "user_features": user_features.tocsr(),
    "user_feature_labels": user_feature_labels,
    "user_id_to_idx": {uid: idx for idx, uid in enumerate(mxmh_df['user_id'])},
    "idx_to_user_id": {idx: uid for idx, uid in enumerate(mxmh_df['user_id'])},
    "genre_to_idx": genre_to_idx,
    "idx_to_genre": idx_to_genre,
    "genre_item_ids": genre_item_ids,
    "mxmh_genres": mxmh_genres,
}

In [43]:
with open(PROJECT_ROOT / "data" / "stg" / "mxmh_with_spotify_preprocessed.pkl", "wb") as f:
    pickle.dump(mxmh_with_spotify_stg_df, f)