In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

This is a classifier trained on my top3 among many different Kpop artists. Then from selected "rookie" groups, the classifier will select songs with high top3-probability. This will indicate similar metadata features and imply similar musical style.

First, I define helper functions to process the data from my CSV files.

In [9]:
def length_mask(df,upper=2.7e5,lower=1.5e5):
    """
    This mask filters songs based on duration (ms). My default limits are 2.5 and 4.5 minutes.
    """
    length_mask = (df['duration_ms']>upper)|(df['duration_ms']<lower)
    return length_mask

def split_table(music_table):
    """
    This function loads a music table and separates song features from labels. 
    """
    labels = music_table.select_dtypes('object')
    labels['release date'] = pd.to_datetime(labels['release date'],errors='coerce')
    music_table[['key','mode','time_signature']] = music_table[['key','mode','time_signature']].astype('object')
    features = music_table.drop(labels,1)
    mask = length_mask(features)
    labels = labels.drop(features[mask].index)
    features = features.drop(features[mask].index)

    return features, labels


def merge_tables(target_feats,target_labels,others_feats,others_labels):
    """
    This function merges both target and other tables for a one-vs-many classification
    """
    target_labels['rating']=1
    others_labels['rating']=0
    labels=pd.concat([target_labels,others_labels],0).reset_index(drop=True)
    feats =pd.concat([target_feats,others_feats],0).reset_index(drop=True)
    ind=pd.isnull(feats).any(1).nonzero()[0]
    feats = feats.drop(list(ind),0).reset_index(drop=True)
    labels = labels.drop(list(ind),0).reset_index(drop=True)
    return labels, feats

def load_data(target_filename,reference_filename):
    """
    This function loads training data from two CSV files, using the functions above
    """    
    mm_feats, mm_labels = split_table(pd.read_csv(target_filename))
    other_feats, other_labels = split_table(pd.read_csv(reference_filename))
    other_feats = other_feats.drop('0',1)
    
    sum_labels, sum_feats = merge_tables(mm_feats,mm_labels,other_feats,other_labels)
    
    y = sum_labels['rating']
    x = sum_feats
    return x,y, sum_labels


In my top3 EDA, I found danceability and acousticness to be a feature that correlated somewhat strongly with the EXID and Mamamoo targets, respectively. Here, I'll use simple preprocessing and a powerful tree-based classifier. I use SMOTE to oversample top3 songs, because otherwise there would not be enough top3 songs in the training data.

In [18]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [58]:
x,y,sum_labels = load_data('top3_2019-features.csv','total3-comparison-old-features.csv')
x_train, x_val, y_train, y_val = train_test_split(x,y)
x_smote, y_smote = SMOTE().fit_resample(x_train,y_train)
x_smote = pd.DataFrame(x_smote,columns=x_train.columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [59]:
num_processor = Pipeline([('scaler',StandardScaler()),
                          ])
cat_processor = Pipeline([('ohe',OneHotEncoder()),
                         ])
processor = ColumnTransformer([('num',num_processor,x.select_dtypes('float64').columns),
                               ('cat',cat_processor,x.select_dtypes('object').columns),
                               ])
modelA = Pipeline([('processor',processor),
                  ('classifier',GradientBoostingClassifier(learning_rate=0.5,n_estimators=1000)),
                 ])

modelB = Pipeline([('processor',processor),
                  ('classifier',KNeighborsClassifier()),
                 ])

modelC = Pipeline([('processor',processor),
                  ('classifier',RandomForestClassifier()),
                 ])

I want to do a quick round of classifier validation before making recommendations. First, we fit the model to the training data and check the classification report from the validation data.

In [60]:
modelA.fit(x_smote,y_smote)
modelB.fit(x_smote,y_smote)
modelC.fit(x_smote,y_smote)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Pipeline(memory=None,
         steps=[('processor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  Index(['acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liven...
                 RandomForestClassifier(b

In [61]:
composite_model = [modelA.predict_proba(x_val),
                   modelA.predict_proba(x_val), 
                   modelA.predict_proba(x_val), 
                   modelB.predict_proba(x_val), 
                   modelB.predict_proba(x_val), 
                   modelC.predict_proba(x_val)
                  ]
avg_model = sum(composite_model)/len(composite_model)
avg_guess = np.round(avg_model[:,1])

In [62]:
print(classification_report(y_val,modelA.predict(x_val)))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      3294
           1       0.36      0.44      0.40       109

    accuracy                           0.96      3403
   macro avg       0.67      0.71      0.69      3403
weighted avg       0.96      0.96      0.96      3403



In [63]:
print(classification_report(y_val,avg_guess))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3294
           1       0.40      0.48      0.44       109

    accuracy                           0.96      3403
   macro avg       0.69      0.73      0.71      3403
weighted avg       0.96      0.96      0.96      3403



A decent model. Now to repeat the process on the "rookies."

In [68]:

rookies = pd.read_csv('rookies-features.csv').drop('0',1).dropna(0).reset_index(drop=True)
rf, rl = split_table(rookies)

composite_model = [modelA.predict_proba(x_val),
                   modelA.predict_proba(x_val), 
                   modelA.predict_proba(x_val), 
                   modelB.predict_proba(x_val), 
                   modelB.predict_proba(x_val), 
                   modelC.predict_proba(x_val)
                  ]
avg_model = sum(composite_model)/len(composite_model)
avg_guess = np.round(avg_model[:,1])

guesses = pd.Series(avg_model[:,1],name='probability')
results = pd.concat([rl,guesses],1).sort_values('probability',ascending=False)
results = results.dropna()
fp = results[['artist name','album title','song title','probability']][:50]
#fp.to_csv('top3-rookies.csv',header=True,index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [69]:
fp

Unnamed: 0,artist name,album title,song title,probability
255,Jimin Park,Orange Marmalade OST,If I Had,0.999723
150,WJSN,WJ PLEASE?,Masquerade,0.999642
190,Jimin Park,jiminxjamie,별 (Prod. by LambC) Stars (Prod. by LambC),0.999638
1326,CHUNG HA,Blooming Blue,Cherry Kisses,0.999552
1146,TWICE,twicetagram,날 바라바라봐 LOOK AT ME,0.988022
1168,TWICE,TWICEcoaster : LANE2,KNOCK KNOCK,0.985382
1207,TWICE,FANCY YOU,STRAWBERRY,0.982781
1176,TWICE,TWICEcoaster : LANE2,ONE IN A MILLION,0.981153
401,OH MY GIRL,OH MY GIRL Japan 2nd Album,Sixteen - Japanese Version,0.96703
715,BLACKPINK,BLACKPINK IN YOUR AREA,SEE U LATER,0.95794
