In [14]:
import pandas as pd
df = pd.read_csv("../data/df_baseline.csv")
# df.loc[484, 'genre'] = 'soul'

## Load Data

In [15]:
print(df.shape)
df.head()

(1091, 18)


Unnamed: 0,label,artist,album,genre,single_count,freq_billboard,freq_genius,freq_theSource,freq_xxl,rating_AOTY,rating_meta,rating_pitch,twitter,instagram,facebook,spotify,soundcloud,youtube
0,0,Flash Bang Grenada,10 Haters,hiphop,0,0,0,0,0,,,,0,0,0,346,0,0
1,0,Aggro Santos,AggroSantos.com,hiphop,3,0,0,0,0,,,,63771,27415,596562,4675,585,7975
2,0,AKA,Altar Ego,hiphop,4,0,0,1,2,,,,38958,0,0,22298,0,3643764
3,0,Shlohmo,Bad Vibes,hiphop,0,0,0,0,67,,,7.8,91790,59340,209063,141268,283539,0
4,0,Glasses Malone,Beach Cruiser,hiphop,4,0,0,0,4,,,,68417,48745,93741,16402,0,0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1091 entries, 0 to 1090
Data columns (total 18 columns):
label             1091 non-null int64
artist            1091 non-null object
album             1091 non-null object
genre             1091 non-null object
single_count      1091 non-null int64
freq_billboard    1091 non-null int64
freq_genius       1091 non-null int64
freq_theSource    1091 non-null int64
freq_xxl          1091 non-null int64
rating_AOTY       61 non-null float64
rating_meta       324 non-null float64
rating_pitch      220 non-null float64
twitter           1091 non-null int64
instagram         1091 non-null int64
facebook          1091 non-null int64
spotify           1091 non-null int64
soundcloud        1091 non-null int64
youtube           1091 non-null int64
dtypes: float64(3), int64(12), object(3)
memory usage: 153.5+ KB


**Note**
- 온라인매체 기사의 양, 평론가 평점은 Null Value가 있기 때문에, 당장 Decision Tree를 통해 학습을 시킬 수 없어, Feature에서 제외를 한다.

## Data Preparation for Modeling

#### 장르 `hiphop`, `R&B`, `Soul`, `Funk`, `Pop`

In [19]:
df = pd.get_dummies(df, columns=['genre'])

In [20]:
df.columns

Index(['label', 'artist', 'album', 'single_count', 'freq_billboard',
       'freq_genius', 'freq_theSource', 'freq_xxl', 'rating_AOTY',
       'rating_meta', 'rating_pitch', 'twitter', 'instagram', 'facebook',
       'spotify', 'soundcloud', 'youtube', 'genre_funk', 'genre_hiphop',
       'genre_pop', 'genre_rnb', 'genre_soul'],
      dtype='object')

In [25]:
feature_names = ['single_count', 'freq_billboard',
                 'freq_genius', 'freq_theSource', 'freq_xxl',
                 'twitter', 'instagram', 'facebook',
                 'spotify', 'soundcloud', 'youtube',
                 'genre_funk', 'genre_hiphop', 'genre_pop', 'genre_rnb', 'genre_soul']
dfX = df[feature_names].copy()
dfy = df['label'].copy()

In [26]:
dfX.tail()

Unnamed: 0,single_count,freq_billboard,freq_genius,freq_theSource,freq_xxl,twitter,instagram,facebook,spotify,soundcloud,youtube,genre_funk,genre_hiphop,genre_pop,genre_rnb,genre_soul
1086,0,0,0,200,9,3708,7855,10703,15130,1043,14066,0,0,1,0,0
1087,3,2,0,0,9,600,286554,355355,329507,21954,1728617,0,0,1,0,0
1088,2,0,0,0,0,1913,23837,9187,16544,756,743,0,0,1,0,0
1089,6,2,0,0,0,11571,27192,27615,60983,6295,43228,0,0,1,0,0
1090,4,1,0,0,213,19974,27428,60138,37009,2341,22642,0,0,1,0,0


In [27]:
dfy.tail()

1086    0
1087    0
1088    0
1089    0
1090    0
Name: label, dtype: int64

In [92]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dfX, dfy, test_size=0.25, random_state=0)

In [100]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy', max_depth=30, min_samples_leaf=5).fit(X_train, y_train)

In [101]:
from sklearn.metrics import confusion_matrix

In [102]:
confusion_matrix(y_train, model.predict(X_train))

array([[623,  19],
       [ 25, 151]])

In [103]:
confusion_matrix(y_test, model.predict(X_test))

array([[202,  20],
       [ 16,  35]])

In [104]:
from sklearn.metrics import classification_report

print(classification_report(y_train, model.predict(X_train)))

             precision    recall  f1-score   support

          0       0.96      0.97      0.97       642
          1       0.89      0.86      0.87       176

avg / total       0.95      0.95      0.95       818



In [105]:
print(classification_report(y_test, model.predict(X_test)))

             precision    recall  f1-score   support

          0       0.93      0.91      0.92       222
          1       0.64      0.69      0.66        51

avg / total       0.87      0.87      0.87       273



In [106]:
for name, importance in zip(dfX.columns, model.feature_importances_):
    print(name, importance)

single_count 0.011682324030170664
freq_billboard 0.010780481350691394
freq_genius 0.011997739530850938
freq_theSource 0.06112999042819946
freq_xxl 0.054338875263576045
twitter 0.058338297627095005
instagram 0.11278980907194645
facebook 0.013946878806751481
spotify 0.12821663460621743
soundcloud 0.3732185870082315
youtube 0.027181339737873175
genre_funk 0.0
genre_hiphop 0.0
genre_pop 0.13637904253839633
genre_rnb 0.0
genre_soul 0.0
