# Spotify Track Popularity Prediction

## Initialization

In [None]:
from kaggle import KaggleApi
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

## Configuration

In [None]:
# Name of the binary label column that is derived from the raw data.
TARGET = "verdict"

# Single seed shared by every randomized step so that runs are repeatable.
RANDOM_STATE = 42

# Columns routed through the numerical branch of the preprocessing step.
NUMERICAL_FEATURES = [
    "danceability", "loudness", "energy", "tempo",
    "valence", "speechiness", "liveness", "acousticness",
    "instrumentalness", "duration_ms", "year",
]

# Columns routed through the categorical branch of the preprocessing step.
CATEGORICAL_FEATURES = ["genre"]

## Data Ingestion

In [None]:
# Create the Kaggle client and authenticate it before requesting any files.
api = KaggleApi()
api.authenticate()

# Fetch the track dataset into ./data; unzip=True requests extraction of the
# downloaded archive so the CSV can be read directly afterwards.
api.dataset_download_files(
    dataset="amitanshjoshi/spotify-1million-tracks",
    path="./data",
    unzip=True,
)

In [None]:
# Load the downloaded CSV into a DataFrame and preview its first rows.
spotify_tracks = pd.read_csv(filepath_or_buffer="./data/spotify_data.csv")
spotify_tracks.head()

## Data Preparation

In [None]:
# Add the popularity verdict: 1 when a track's popularity is at least 50,
# otherwise 0.
# The comparison is done on the whole column at once instead of calling a
# Python lambda per row via DataFrame.apply(axis=1); the resulting labels are
# the same (a missing popularity still compares False and maps to 0), and an
# empty frame is handled without special-casing.
spotify_tracks[TARGET] = (spotify_tracks["popularity"] >= 50).astype("int64")

In [None]:
# Model inputs: numerical columns first, then categorical ones.
feature_columns = [*NUMERICAL_FEATURES, *CATEGORICAL_FEATURES]

# Restrict the dataset to the model inputs plus the label column, then preview.
features = spotify_tracks.loc[:, [*feature_columns, TARGET]]
features.head()

In [None]:
# Split the rows into a training and a held-out partition; the seed keeps the
# split repeatable.
train_data, test_data = train_test_split(features, random_state=RANDOM_STATE)

# Separate the model inputs from the label for the training partition.
train_input, train_output = train_data[feature_columns], train_data[TARGET]

# Randomly oversample the training partition only, so the held-out rows are
# left untouched by the resampling.
oversampler = RandomOverSampler(random_state=RANDOM_STATE)
train_input_ros, train_output_ros = oversampler.fit_resample(
    train_input, train_output
)

## Modeling

In [None]:
# Numerical columns are standardized before reaching the estimator.
numerical_pipeline = Pipeline([("encoder", StandardScaler())])

# Categorical columns are one-hot encoded. handle_unknown="ignore" makes a
# category that was not present at fit time encode as all zeros at transform
# time instead of raising, so predicting on held-out rows cannot fail just
# because they contain a value the training rows did not.
categorical_pipeline = Pipeline(
    [("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each column group to its own preprocessing branch. Step names are kept
# as they were so that parameter paths built from them remain valid.
preprocessing_pipeline = ColumnTransformer(
    [
        ("numerical_preprocessor", numerical_pipeline, NUMERICAL_FEATURES),
        ("categorical_pipeline", categorical_pipeline, CATEGORICAL_FEATURES),
    ]
)

# Full model: preprocessing followed by a seeded gradient-boosted classifier.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessing_pipeline),
        ("estimator", XGBClassifier(random_state=RANDOM_STATE)),
    ]
)

In [None]:
# Fit the preprocessing steps and the classifier on the oversampled training data.
pipeline.fit(X=train_input_ros, y=train_output_ros)