# Music Genre Prediction

In [1]:
class Config:
    NB = '102'

    raw_data_dir = '../data/raw/'
    processed_data_dir = '../data/processed/'
    interim_dir = '../data/interim/'
    submission_dir = '../data/submission/'

    random_seed = 42
    n_folds = 5

    row_id = 'index'
    target = 'genre'

## Import libralies

In [2]:
import gc
import warnings
warnings.filterwarnings('ignore')

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(style='white', context='notebook', palette='deep')

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

plotly_template = dict(
    layout=go.Layout(
        template='plotly_dark',
        font=dict(
            family="Franklin Gothic",
            size=12
        ),
        height=500,
        width=1000,
    )
)

color_palette = {
    'Bin': ['#016CC9','#E876A3'],
    'Cat5': ['#E876A3', '#E0A224', '#63B70D', '#6BCFF6', '#13399E'],
    'Cat10': ['#E876A3', '#E0A224', '#63B70D', '#6BCFF6', '#13399E', '#E876A3', '#E0A224', '#63B70D', '#6BCFF6', '#13399E'],
}

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer

## Load data

In [5]:
df_train = pd.read_csv(Config.raw_data_dir + 'train.csv')
df_test = pd.read_csv(Config.raw_data_dir + 'test.csv')

## 欠損値補完
- objectで同じ条件になっているものの中央値で補完する

In [6]:
display(df_train.info())
display(df_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4046 entries, 0 to 4045
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             4046 non-null   int64  
 1   genre             4046 non-null   int64  
 2   popularity        4046 non-null   int64  
 3   duration_ms       4046 non-null   int64  
 4   acousticness      4046 non-null   float64
 5   positiveness      4036 non-null   float64
 6   danceability      4038 non-null   float64
 7   loudness          4046 non-null   float64
 8   energy            4046 non-null   float64
 9   liveness          4043 non-null   float64
 10  speechiness       4038 non-null   float64
 11  instrumentalness  4045 non-null   float64
 12  tempo             4046 non-null   object 
 13  region            4046 non-null   object 
dtypes: float64(8), int64(4), object(2)
memory usage: 442.7+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4046 entries, 0 to 4045
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             4046 non-null   int64  
 1   popularity        4046 non-null   int64  
 2   duration_ms       4046 non-null   int64  
 3   acousticness      4045 non-null   float64
 4   positiveness      4032 non-null   float64
 5   danceability      4035 non-null   float64
 6   loudness          4046 non-null   float64
 7   energy            4045 non-null   float64
 8   liveness          4040 non-null   float64
 9   speechiness       4035 non-null   float64
 10  instrumentalness  4044 non-null   float64
 11  tempo             4046 non-null   object 
 12  region            4046 non-null   object 
dtypes: float64(8), int64(3), object(2)
memory usage: 411.0+ KB


None

In [7]:
col_list = ['acousticness', 'positiveness', 'danceability', 'energy', 'liveness', 'speechiness', 'instrumentalness']

imputer = KNNImputer(n_neighbors=3)
imputer.fit(df_train[col_list])
for df in [df_train, df_test]:
    df[col_list] = imputer.transform(df[col_list])

In [8]:
display(df_train.info())
display(df_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4046 entries, 0 to 4045
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             4046 non-null   int64  
 1   genre             4046 non-null   int64  
 2   popularity        4046 non-null   int64  
 3   duration_ms       4046 non-null   int64  
 4   acousticness      4046 non-null   float64
 5   positiveness      4046 non-null   float64
 6   danceability      4046 non-null   float64
 7   loudness          4046 non-null   float64
 8   energy            4046 non-null   float64
 9   liveness          4046 non-null   float64
 10  speechiness       4046 non-null   float64
 11  instrumentalness  4046 non-null   float64
 12  tempo             4046 non-null   object 
 13  region            4046 non-null   object 
dtypes: float64(8), int64(4), object(2)
memory usage: 442.7+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4046 entries, 0 to 4045
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             4046 non-null   int64  
 1   popularity        4046 non-null   int64  
 2   duration_ms       4046 non-null   int64  
 3   acousticness      4046 non-null   float64
 4   positiveness      4046 non-null   float64
 5   danceability      4046 non-null   float64
 6   loudness          4046 non-null   float64
 7   energy            4046 non-null   float64
 8   liveness          4046 non-null   float64
 9   speechiness       4046 non-null   float64
 10  instrumentalness  4046 non-null   float64
 11  tempo             4046 non-null   object 
 12  region            4046 non-null   object 
dtypes: float64(8), int64(3), object(2)
memory usage: 411.0+ KB


None

## TrainとTestの統合

In [9]:
df_dataset = pd.concat(objs=[df_train, df_test], axis=0).reset_index(drop=True)
df_dataset

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region
0,0,10.0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H
1,1,8.0,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.123910,153-176,region_I
2,2,3.0,43,197225,0.496420,0.265391,0.457642,-9.255670,0.439933,0.217146,0.369057,0.166470,64-76,region_E
3,3,10.0,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C
4,4,3.0,57,277348,0.190720,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.226030,97-120,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8087,8087,,34,366668,0.117208,0.316026,0.502289,-2.909329,0.824708,0.320241,0.206592,0.198859,121-152,region_B
8088,8088,,53,240081,0.225540,0.391226,0.483655,-5.013634,0.630808,0.163884,0.205218,0.179438,121-152,region_I
8089,8089,,48,217839,0.215571,0.828567,0.689747,-8.974036,0.697981,0.318250,0.123748,0.135281,121-152,region_H
8090,8090,,45,249612,0.205074,0.730554,0.580581,-1.915320,0.828423,0.360663,0.165837,0.162821,121-152,region_B


## tempoをEncodingする

In [10]:
df_dataset['tempo_int'] = df_dataset['tempo'].str.split('-', expand=True)[1]
df_dataset = df_dataset.astype({'tempo_int': int})
df_dataset

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region,tempo_int
0,0,10.0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H,152
1,1,8.0,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.123910,153-176,region_I,176
2,2,3.0,43,197225,0.496420,0.265391,0.457642,-9.255670,0.439933,0.217146,0.369057,0.166470,64-76,region_E,76
3,3,10.0,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C,192
4,4,3.0,57,277348,0.190720,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.226030,97-120,unknown,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8087,8087,,34,366668,0.117208,0.316026,0.502289,-2.909329,0.824708,0.320241,0.206592,0.198859,121-152,region_B,152
8088,8088,,53,240081,0.225540,0.391226,0.483655,-5.013634,0.630808,0.163884,0.205218,0.179438,121-152,region_I,152
8089,8089,,48,217839,0.215571,0.828567,0.689747,-8.974036,0.697981,0.318250,0.123748,0.135281,121-152,region_H,152
8090,8090,,45,249612,0.205074,0.730554,0.580581,-1.915320,0.828423,0.360663,0.165837,0.162821,121-152,region_B,152


In [11]:
df_dataset.drop(columns=['tempo'], inplace=True)
df_dataset

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,region,tempo_int
0,0,10.0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,region_H,152
1,1,8.0,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.123910,region_I,176
2,2,3.0,43,197225,0.496420,0.265391,0.457642,-9.255670,0.439933,0.217146,0.369057,0.166470,region_E,76
3,3,10.0,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,region_C,192
4,4,3.0,57,277348,0.190720,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.226030,unknown,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8087,8087,,34,366668,0.117208,0.316026,0.502289,-2.909329,0.824708,0.320241,0.206592,0.198859,region_B,152
8088,8088,,53,240081,0.225540,0.391226,0.483655,-5.013634,0.630808,0.163884,0.205218,0.179438,region_I,152
8089,8089,,48,217839,0.215571,0.828567,0.689747,-8.974036,0.697981,0.318250,0.123748,0.135281,region_H,152
8090,8090,,45,249612,0.205074,0.730554,0.580581,-1.915320,0.828423,0.360663,0.165837,0.162821,region_B,152


## regionをone-hotEncodingする

In [12]:
col_list = ['region']
output_col_list = [i for i in sorted(df_dataset['region'].unique())]

ohe = OneHotEncoder(sparse=False)
ohe.fit(df_dataset[col_list])

df_dataset[output_col_list] = ohe.transform(df_dataset[col_list])
df_dataset.drop(columns=col_list, inplace=True)
df_dataset

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo_int,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown
0,0,10.0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,8.0,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.123910,176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,3.0,43,197225,0.496420,0.265391,0.457642,-9.255670,0.439933,0.217146,0.369057,0.166470,76,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,10.0,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,192,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,3.0,57,277348,0.190720,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.226030,120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8087,8087,,34,366668,0.117208,0.316026,0.502289,-2.909329,0.824708,0.320241,0.206592,0.198859,152,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8088,8088,,53,240081,0.225540,0.391226,0.483655,-5.013634,0.630808,0.163884,0.205218,0.179438,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8089,8089,,48,217839,0.215571,0.828567,0.689747,-8.974036,0.697981,0.318250,0.123748,0.135281,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8090,8090,,45,249612,0.205074,0.730554,0.580581,-1.915320,0.828423,0.360663,0.165837,0.162821,152,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## duration_msでflagを作成する
- 600000以上であれば全てTarget_10になる

In [13]:
df_dataset['duration_long'] = np.where(df_dataset['duration_ms'] > 600000, 1, 0)
df_dataset

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo_int,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown,duration_long
0,0,10.0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1,8.0,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.123910,176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2,3.0,43,197225,0.496420,0.265391,0.457642,-9.255670,0.439933,0.217146,0.369057,0.166470,76,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,3,10.0,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,192,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,4,3.0,57,277348,0.190720,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.226030,120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8087,8087,,34,366668,0.117208,0.316026,0.502289,-2.909329,0.824708,0.320241,0.206592,0.198859,152,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8088,8088,,53,240081,0.225540,0.391226,0.483655,-5.013634,0.630808,0.163884,0.205218,0.179438,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8089,8089,,48,217839,0.215571,0.828567,0.689747,-8.974036,0.697981,0.318250,0.123748,0.135281,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8090,8090,,45,249612,0.205074,0.730554,0.580581,-1.915320,0.828423,0.360663,0.165837,0.162821,152,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## PCAで特徴量生成

In [27]:
train_len = len(df_train)
df_train = df_dataset[:train_len]
df_test = df_dataset[train_len:]
df_test.drop(columns=[Config.target], inplace=True)

df_test

Unnamed: 0,index,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo_int,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown,duration_long
4046,4046,43,192465,0.559812,0.565035,0.522237,-12.217087,0.620510,0.389727,0.214989,0.161924,96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4047,4047,40,226536,0.134002,0.542547,0.493128,-2.612846,0.853089,0.163106,0.184183,0.155460,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4048,4048,58,215385,0.140537,0.643086,0.533189,-3.439684,0.834416,0.429304,0.167352,0.179191,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4049,4049,49,254494,0.266406,0.456733,0.410583,-10.830424,0.534261,0.151830,0.174403,0.186061,96,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4050,4050,50,230934,0.351907,0.526235,0.645625,-9.099337,0.612474,0.319161,0.157594,0.184142,120,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8087,8087,34,366668,0.117208,0.316026,0.502289,-2.909329,0.824708,0.320241,0.206592,0.198859,152,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8088,8088,53,240081,0.225540,0.391226,0.483655,-5.013634,0.630808,0.163884,0.205218,0.179438,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8089,8089,48,217839,0.215571,0.828567,0.689747,-8.974036,0.697981,0.318250,0.123748,0.135281,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8090,8090,45,249612,0.205074,0.730554,0.580581,-1.915320,0.828423,0.360663,0.165837,0.162821,152,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [23]:
col_list = [col for col in df_test.columns if col not in [Config.row_id, Config.target]]

pca = PCA()
pca.fit(df_train[col_list])

train_pca = pca.transform(df_train[col_list])
train_pca_cols = ["PCA{}".format(x + 1) for x in range(len(col_list))]
df_train_pca = pd.DataFrame(train_pca, columns=train_pca_cols)

test_pca = pca.transform(df_test[col_list])
test_pca_cols = ["PCA{}".format(x + 1) for x in range(len(col_list))]
df_test_pca = pd.DataFrame(test_pca, columns=test_pca_cols)

display(df_train_pca.head())
display(df_test_pca.head())

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14,PCA15,PCA16,PCA17,PCA18,PCA19,PCA20,PCA21,PCA22,PCA23,PCA24,PCA25,PCA26,PCA27,PCA28,PCA29,PCA30,PCA31,PCA32,PCA33
0,-41047.028616,-18.457388,30.293345,-5.422681,-0.104217,-0.234368,-0.068311,-0.002252,-0.062199,0.491445,-0.331957,-0.248435,0.86949,-0.040969,-0.168649,-0.086663,0.003225,0.008698,-0.324209,0.401266,-0.026403,-0.057022,-0.01231,-0.143558,0.084192,-0.014335,0.012365,-0.018503,0.170605,-0.007789,0.003003,0.001297,-2.488499e-15
1,66351.971231,-40.207097,-28.864875,-1.711876,0.778478,0.388915,0.003601,-0.020309,-0.078451,-0.011753,0.023422,-0.175051,0.121121,0.102128,0.022288,-0.014716,-0.08115,-0.068024,-0.000156,-0.102287,-0.007207,0.015808,0.011844,0.02152,-0.002228,-0.159161,-0.055892,-0.009195,-0.073661,-0.001253,-0.007616,0.001789,2.731804e-16
2,-44916.030091,57.856874,-1.052937,0.660017,-0.671912,0.540201,0.027787,-0.007419,-0.147084,0.126547,0.004782,-0.008503,-0.02056,0.013635,-0.010916,0.026007,-0.020794,-0.009641,0.014598,-0.058904,-0.016021,-0.002627,-0.015658,0.00205,-0.013158,-0.006481,-0.027968,-0.036385,0.172099,-0.002751,0.004051,-0.000704,1.036459e-15
3,58950.971693,-56.536491,-4.939926,-1.791299,-0.026038,-0.180815,-0.031525,0.019107,0.003455,0.177299,-0.079375,-0.181878,0.16636,0.209821,0.209278,0.006396,-0.066603,0.056852,0.859911,0.285185,-0.304252,-0.000195,-0.057497,-0.024787,-0.029264,-0.114671,-0.058884,-0.020611,0.012615,-0.006272,-0.024358,1.2e-05,-2.460744e-15
4,35206.970466,15.167407,-16.097608,-4.069444,-0.008173,-0.451815,0.768188,-0.267631,-0.021553,-0.383528,0.097777,0.094918,-0.031021,-0.109564,-0.08096,-0.052722,0.024221,-0.170375,-0.047499,0.072487,-0.007138,-0.064069,0.114143,-0.085954,0.025962,0.052613,0.005468,-0.015419,0.01142,0.010743,-0.012012,0.001413,1.248763e-16


Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14,PCA15,PCA16,PCA17,PCA18,PCA19,PCA20,PCA21,PCA22,PCA23,PCA24,PCA25,PCA26,PCA27,PCA28,PCA29,PCA30,PCA31,PCA32,PCA33
0,-49676.029781,37.827254,-1.203157,3.896051,0.756954,0.435221,0.0276,-0.009972,0.007484,-0.108471,0.014498,-0.010551,-0.028144,-0.040303,-0.042421,0.01167,0.034089,0.160576,0.006025,0.043478,-0.035205,0.018208,-0.067187,0.053,-0.024333,-0.099363,-0.030289,0.005213,-0.017307,0.000269,0.018075,-0.000743,1.147481e-15
1,-15605.028875,-17.794582,1.067705,-4.859575,-0.054965,-0.442305,0.767888,-0.271657,-0.187066,-0.177795,0.091139,-0.015145,-0.003415,0.004428,-0.042162,-0.029125,-0.057647,-0.120531,-0.025401,-0.026525,0.003523,0.031864,-0.056627,0.023175,-0.008313,-0.029586,-0.014264,-0.001368,-0.017718,-0.007041,0.007637,-9.3e-05,9.852992e-16
2,-26756.029032,-17.801624,-16.839697,-4.15,-0.016922,-0.278409,-0.053773,0.084222,0.548842,0.236511,0.718866,-0.238457,0.000449,-0.020219,-0.103764,-0.047764,-0.019516,0.176301,-0.05353,0.025637,-0.037573,0.019947,-0.008436,-0.009287,0.007428,0.015649,0.012389,-0.005611,-0.0419,0.001946,0.011182,-4.9e-05,-3.21309e-17
3,12352.970158,38.834857,-7.729714,2.510336,-0.000576,-0.226795,-0.051288,0.054015,0.32216,0.056849,-0.559449,-0.587421,-0.438442,-0.10142,-0.170519,0.014509,-0.100922,-0.137282,0.003446,-0.113302,-0.016801,0.03169,-0.068217,0.037827,-0.030516,0.040329,0.000504,-0.015224,-0.002507,-0.003999,-0.013284,-0.00251,-1.29501e-15
4,-11207.02947,14.451739,-8.728795,1.107609,-0.645761,0.548067,0.016421,-0.029261,0.028785,-0.167191,0.032402,-0.036965,0.008429,-0.016018,-0.027262,-0.01043,0.001631,0.049398,-4.7e-05,0.00588,-0.019334,-0.021789,0.045183,-0.017118,0.002036,-0.034561,-0.008988,0.006817,-0.06067,0.00109,0.013702,-0.000145,4.535917e-16


In [16]:
train_exp_df = pd.DataFrame(pca.explained_variance_ratio_, index=train_pca_cols)
train_exp_df

Unnamed: 0,0
PCA1,0.9999998
PCA2,1.273644e-07
PCA3,3.591685e-08
PCA4,2.303174e-09
PCA5,2.489359e-11
PCA6,1.90453e-11
PCA7,1.233094e-11
PCA8,1.144326e-11
PCA9,9.223502e-12
PCA10,9.13029e-12


In [28]:
df_train = pd.concat(objs=[df_train, df_train_pca[["PCA{}".format(x + 1) for x in range(5)]]], axis=1)

df_test = df_test.reset_index()
df_test = pd.concat(objs=[df_test, df_test_pca[["PCA{}".format(x + 1) for x in range(5)]]], axis=1)
df_test = df_test.set_index('level_0')
display(df_train.head())
display(df_test.head())

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo_int,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown,duration_long,PCA1,PCA2,PCA3,PCA4,PCA5
0,0,10.0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-41047.028616,-18.457388,30.293345,-5.422681,-0.104217
1,1,8.0,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,66351.971231,-40.207097,-28.864875,-1.711876,0.778478
2,2,3.0,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,76,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-44916.030091,57.856874,-1.052937,0.660017,-0.671912
3,3,10.0,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,192,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,58950.971693,-56.536491,-4.939926,-1.791299,-0.026038
4,4,3.0,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,35206.970466,15.167407,-16.097608,-4.069444,-0.008173


Unnamed: 0_level_0,index,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo_int,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown,duration_long,PCA1,PCA2,PCA3,PCA4,PCA5
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
4046,4046,43,192465,0.559812,0.565035,0.522237,-12.217087,0.62051,0.389727,0.214989,0.161924,96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-49676.029781,37.827254,-1.203157,3.896051,0.756954
4047,4047,40,226536,0.134002,0.542547,0.493128,-2.612846,0.853089,0.163106,0.184183,0.15546,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,-15605.028875,-17.794582,1.067705,-4.859575,-0.054965
4048,4048,58,215385,0.140537,0.643086,0.533189,-3.439684,0.834416,0.429304,0.167352,0.179191,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-26756.029032,-17.801624,-16.839697,-4.15,-0.016922
4049,4049,49,254494,0.266406,0.456733,0.410583,-10.830424,0.534261,0.15183,0.174403,0.186061,96,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,12352.970158,38.834857,-7.729714,2.510336,-0.000576
4050,4050,50,230934,0.351907,0.526235,0.645625,-9.099337,0.612474,0.319161,0.157594,0.184142,120,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-11207.02947,14.451739,-8.728795,1.107609,-0.645761


## Save Dataset

In [29]:
Config.NB

'102'

In [31]:
pd.to_pickle(df_train, Config.processed_data_dir + f"nb{Config.NB}_train.pkl", compression='zip')
pd.to_pickle(df_test, Config.processed_data_dir + f"nb{Config.NB}_test.pkl", compression='zip')

## 検証メモ