# Loading packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel

# static

In [None]:
# Dataset locations (presumably a Google Drive mount inside Colab -- confirm
# before running elsewhere). Both CSVs live in the same directory.
DATASET_DIR = '/content/drive/MyDrive/student_cup_2021/dataset'
TRAIN_PATH = DATASET_DIR + '/train.csv'
TEST_PATH = DATASET_DIR + '/test.csv'

# function

# Loading data

In [None]:
# Read the training table from the CSV at TRAIN_PATH.
df_train = pd.read_csv(filepath_or_buffer=TRAIN_PATH)

# Print (rows, columns) as a quick check that the load produced what we expect.
print('df_train shape: {0}'.format(df_train.shape))

df_train shape: (4046, 14)


In [None]:
# Preview the first five raw rows.
df_train.head(n=5)

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region
0,0,10,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H
1,1,8,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I
2,2,3,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E
3,3,10,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C
4,4,3,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown


# Preprocessing

## Missing values (欠損値)

In [None]:
# Discard every row that has a missing value, then renumber the rows from 0.
df_train = df_train.dropna().reset_index(drop=True)

# Set the 'genre' column aside (it is the target passed to cross-validation
# later), then remove it and the 'index' column from the feature table.
df_genre = df_train['genre']
df_train = df_train.drop(columns=['index', 'genre'])

print('df_train shape: {0}'.format(df_train.shape))

df_train shape: (4036, 12)


## popularity

In [None]:
# (Disabled) One-hot encode each distinct popularity value as a 'popularity_<value>'
# indicator column and append the indicators to df_train.
# df_popularity = 'popularity_'+df_train['popularity'].astype(str)
# df_popularity = pd.get_dummies(df_popularity)

# df_train = pd.concat([df_train, df_popularity], axis=1)

# print('df_train shape: {0}'.format(df_train.shape))

## duration_ms

In [None]:
# Re-express the track duration (milliseconds) in seconds, in minutes, and on a
# log10 scale; the +1 keeps the logarithm defined for a zero duration.
df_train = df_train.assign(
    duration_s=df_train['duration_ms'] / 1000,
    duration_m=df_train['duration_ms'] / 60000,
    duration_log10=np.log10(df_train['duration_ms'] + 1),
)

print('df_train shape: {0}'.format(df_train.shape))

df_train shape: (4036, 15)


## tempo

In [None]:
# 'tempo' holds a "<low>-<high>" range string; split it into two integer
# columns and append them next to the existing features.
df_tempo = (
    df_train['tempo']
    .str.split('-', expand=True)
    .astype(int)
    .set_axis(['tempo_min', 'tempo_max'], axis='columns')
)
df_train = pd.concat((df_train, df_tempo), axis='columns')

print('df_train shape: {0}'.format(df_train.shape))

df_train shape: (4036, 17)


## tempo(range of bpm) and duration_m

In [None]:
# Lower and upper tempo bounds multiplied by the track length in minutes.
df_train['tempo_min_bpm'] = df_train['tempo_min'] * df_train['duration_m']
df_train['tempo_max_bpm'] = df_train['tempo_max'] * df_train['duration_m']

# Midpoint of the tempo range.
df_train['tempo_avg'] = (df_train['tempo_min'] + df_train['tempo_max']) / 2

# Multiply by duration_m (minutes), like the min/max columns above, so all three
# *_bpm features share one unit. This line previously used duration_ms, which
# made tempo_avg_bpm 60000x larger than its siblings.
df_train['tempo_avg_bpm'] = df_train['tempo_avg'] * df_train['duration_m']

print('df_train shape: {0}'.format(df_train.shape))

df_train shape: (4036, 21)


## One-hot-encoding & Label-encoding

In [None]:
# Preview the first five rows with the engineered duration/tempo columns.
df_train.head(n=5)

Unnamed: 0,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region,duration_s,duration_m,duration_log10,tempo_min,tempo_max,tempo_min_bpm,tempo_max_bpm,tempo_avg,tempo_avg_bpm
0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H,201.094,3.351567,5.303401,121,152,405.539567,509.438133,136.5,27449331.0
1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I,308.493,5.14155,5.489247,153,176,786.65715,904.9128,164.5,50747098.5
2,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E,197.225,3.287083,5.294964,64,76,210.373333,249.818333,70.0,13805750.0
3,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C,301.092,5.0182,5.478701,177,192,888.2214,963.4944,184.5,55551474.0
4,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown,277.348,4.622467,5.443027,97,120,448.379267,554.696,108.5,30092258.0


In [None]:
# Columns that get both encodings.
col_list = ['tempo', 'region']

for col in col_list:

  # One-hot: one indicator column per distinct value, named after the value itself.
  df_temp = pd.get_dummies(df_train[col])
  df_train = pd.concat((df_train, df_temp), axis='columns')

  # Label: overwrite the original column with integer codes.
  le = LabelEncoder().fit(df_train[col])
  df_train[col] = le.transform(df_train[col])

print('df_train shape: {0}'.format(df_train.shape))

df_train shape: (4036, 54)


## plus & times

In [None]:
# (Disabled) Build col_list from every current column of df_train except
# 'tempo' and 'region'.
# col_list = [col for col in df_train.columns]
# col_list.remove('tempo')
# col_list.remove('region')

# print('col_list len: {0}'.format(len(col_list)))

In [None]:
# (Disabled) For every ordered pair of columns in col_list (a column paired with
# itself included), add a sum feature and a product feature.
# for a in col_list:
#   for b in col_list:
#     df_train[a+'_plus_'+b] = df_train[a] + df_train[b]
#     df_train[a+'_times_'+b] = df_train[a] * df_train[b]

# print('df_train shape: {0}'.format(df_train.shape))

## Feature generation focused on columns that seem related (関係ありそうなカラムに焦点を当てて特徴量生成)

In [None]:
# (Disabled) Add sum and product features for every combination of 2 up to 7 of
# the columns listed in col_list below.
# from itertools import combinations

# col_list = ['acousticness', 'positiveness', 'danceability', 'energy', 'liveness', 'speechiness', 'instrumentalness']

# for a, b in combinations(col_list, 2):
#   df_train[a+'_plus_'+b] = df_train[a]+df_train[b]
#   df_train[a+'_times_'+b] = df_train[a]*df_train[b]

# for a, b, c in combinations(col_list, 3):
#   df_train[a+'_plus_'+b+'_plus_'+c] = df_train[a]+df_train[b]+df_train[c]
#   df_train[a+'_times_'+b+'_times_'+c] = df_train[a]*df_train[b]*df_train[c]

# for a, b, c, d in combinations(col_list, 4):
#   df_train[a+'_plus_'+b+'_plus_'+c+'_plus_'+d] = df_train[a]+df_train[b]+df_train[c]+df_train[d]
#   df_train[a+'_times_'+b+'_times_'+c+'_times_'+d] = df_train[a]*df_train[b]*df_train[c]*df_train[d]

# for a, b, c, d, e in combinations(col_list, 5):
#   df_train[a+'_plus_'+b+'_plus_'+c+'_plus_'+d+'_plus_'+e] = df_train[a]+df_train[b]+df_train[c]+df_train[d]+df_train[e]
#   df_train[a+'_times_'+b+'_times_'+c+'_times_'+d+'_times_'+e] = df_train[a]*df_train[b]*df_train[c]*df_train[d]*df_train[e]

# for a, b, c, d, e, f in combinations(col_list, 6):
#   df_train[a+'_plus_'+b+'_plus_'+c+'_plus_'+d+'_plus_'+e+'_plus_'+f] = df_train[a]+df_train[b]+df_train[c]+df_train[d]+df_train[e]+df_train[f]
#   df_train[a+'_times_'+b+'_times_'+c+'_times_'+d+'_times_'+e+'_times_'+f] = df_train[a]*df_train[b]*df_train[c]*df_train[d]*df_train[e]*df_train[f]

# for a, b, c, d, e, f, g in combinations(col_list, 7):
#   df_train[a+'_plus_'+b+'_plus_'+c+'_plus_'+d+'_plus_'+e+'_plus_'+f+'_plus_'+g] = df_train[a]+df_train[b]+df_train[c]+df_train[d]+df_train[e]+df_train[f]+df_train[g]
#   df_train[a+'_times_'+b+'_times_'+c+'_times_'+d+'_times_'+e+'_times_'+f+'_times_'+g] = df_train[a]*df_train[b]*df_train[c]*df_train[d]*df_train[e]*df_train[f]*df_train[g]

# print('df_train shape: {0}'.format(df_train.shape))

# Evaluate model

|notes|scores|
|:--:|:--:|
|LightGBM|0.563|
|LightGBM selecting features|0.556|
|knn selecting features|max=0.182, min=0.144|
|LightGBM creating other features instead of "plus & times"|0.500|
|LightGBM creating temp*bpm|0.510|
|After selecting features, LightGBM creating temp*bpm|0.516|
|LightGBM|0.529|

## k-fold cross validation

In [None]:
# Show the feature-table and target shapes side by side so their row counts can be compared.
print('df_train shape: {0}, df_genre shape: {1}'.format(*(frame.shape for frame in (df_train, df_genre))))

df_train shape: (4036, 54), df_genre shape: (4036,)


### LightGBM

In [None]:
# LightGBM classifier with its default hyper-parameters.
clf = lgb.LGBMClassifier()

# Macro-averaged F1 for each of the 10 cross-validation folds.
scores = cross_val_score(estimator=clf, X=df_train, y=df_genre, cv=10, scoring='f1_macro')

# Report the mean over the folds.
print('f1 macro (CV=10): {0:.3f}'.format(np.mean(scores)))

f1 macro (CV=10): 0.529


### k-nn

In [None]:
# (Disabled) 10-fold macro-F1 of a k-NN classifier on df_train for n_neighbors = 1..10.
# for num in range(1, 11):

#   clf = KNeighborsClassifier(n_neighbors=num)

#   scores = cross_val_score(clf, df_train, df_genre, scoring='f1_macro', cv=10)

#   print('n_neighbors='+str(num)+', f1 macro of cross validation: {0:.3f}'.format(scores.mean()))

## Selecting features & k-fold validation

In [None]:
# (Disabled) Fit SelectFromModel with clf on df_train/df_genre and store the
# transformed (reduced) feature matrix in X_train.
# print('before selecting features df_train shape: {0}'.format(df_train.shape))

# sfm = SelectFromModel(clf).fit(df_train, df_genre)
# X_train = sfm.transform(df_train)

# print('after selecting features, df_train shape: {0}'.format(X_train.shape))

### LightGBM

In [None]:
# (Disabled) 10-fold macro-F1 of a default LightGBM classifier on the selected features (X_train).
# clf = lgb.LGBMClassifier()

# scores = cross_val_score(clf, X_train, df_genre, scoring='f1_macro', cv=10)
# print('f1 macro (CV=10): {0:.3f}'.format(scores.mean()))

### knn

In [None]:
# (Disabled) 10-fold macro-F1 of a k-NN classifier on the selected features (X_train)
# for n_neighbors = 1..10.
# for num in range(1, 11):

#   clf = KNeighborsClassifier(n_neighbors=num)

#   scores = cross_val_score(clf, X_train, df_genre, scoring='f1_macro', cv=10)

#   print('n_neighbors='+str(num)+', f1 macro of cross validation: {0:.3f}'.format(scores.mean()))

# feature importance

In [None]:
# (Disabled) Fit a default LightGBM classifier on all of df_train against df_genre.
# clf = lgb.LGBMClassifier()

# clf.fit(df_train, df_genre)

In [None]:
# (Disabled) Tabulate clf.feature_importances_ per column, sorted descending, and show the top 50.
# df_feature = pd.DataFrame({'columns': df_train.columns, 'importances': clf.feature_importances_}).sort_values('importances', ascending=False)
# df_feature.head(50)

# Colab

```javascript
// Logs a heartbeat message and clicks the "#comments > span" element when it
// exists (presumably to keep the Colab session active -- confirm the selector
// still matches the current Colab UI).
function ClickConnect(){
  console.log("Working");
  // querySelector returns null when nothing matches; skip the click in that
  // case instead of throwing a TypeError on every timer tick.
  const target = document.querySelector("#comments > span");
  if (target) {
    target.click();
  }
}
setInterval(ClickConnect,500000)
```