# MCA dimension reduction

In [3]:
import os
import numpy as np
import pandas as pd
import random
import json
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,classification_report,roc_auc_score
import prince

import gc

In [4]:
# Read each split as a (features, labels) pair; features come from the
# date-converted CSV, labels from the matching *_Y.csv file.
train_X, train_y = (
    pd.read_csv(csv_path)
    for csv_path in ('music_data/train_X_date_converted.csv', 'music_data/train_Y.csv')
)

val_X, val_y = (
    pd.read_csv(csv_path)
    for csv_path in ('music_data/valid_X_date_converted.csv', 'music_data/valid_Y.csv')
)

test_X, test_y = (
    pd.read_csv(csv_path)
    for csv_path in ('music_data/test_X_date_converted.csv', 'music_data/test_Y.csv')
)


In [5]:
# Categorical columns, grouped by the entity they describe.
song_features = ['song_id', 'genre_ids', 'artist_name', 'composer', 'lyricist', 'language']
user_features = ['msno', 'city', 'gender', 'registered_via']
ui_features = ['source_screen_name', 'source_system_tab', 'source_type']
# All categorical columns: song group, then user group, then UI group.
cat_features = [*song_features, *user_features, *ui_features]

In [11]:
def mca_transform(df, feature, n_comp, n_iter=3, random_state=42):
    """Fit a ``prince.MCA`` on selected columns and project those same rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the columns to reduce.
    feature : str or sequence of str
        Column name(s) of ``df`` passed to the MCA. A single name is treated
        as a one-element list.
    n_comp : int
        Value passed to ``prince.MCA`` as ``n_components``.
    n_iter : int, default 3
        Value passed to ``prince.MCA`` as ``n_iter``.
    random_state : int, default 42
        Value passed to ``prince.MCA`` as ``random_state``.

    Returns
    -------
    tuple
        ``(fitted, coords)``: the object returned by ``MCA.fit`` and the
        result of ``MCA.transform`` on the same columns.

    Raises
    ------
    KeyError
        If any requested column is absent from ``df``. Raised before any
        fitting work starts.

    Notes
    -----
    NOTE(review): the columns shown in this notebook are integer-coded.
    Confirm that the installed prince version one-hot encodes numeric
    columns; otherwise cast them to a categorical/string dtype first.
    """
    columns = [feature] if isinstance(feature, str) else list(feature)

    # Fail early with a readable message instead of deep inside pandas/prince.
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise KeyError(f"columns not found in df: {missing}")

    mca = prince.MCA(
        n_components=n_comp,
        n_iter=n_iter,
        copy=False,  # for full dataset (presumably to avoid an extra copy - confirm in prince docs)
        check_input=True,
        engine='auto',
        random_state=random_state,
    )
    # The columns are selected separately for fit and transform, as before,
    # so each call receives its own freshly selected frame.
    fitted = mca.fit(df[columns])
    coords = mca.transform(df[columns])
    return fitted, coords

In [10]:
# Run a garbage-collection pass; the value displayed below is what gc.collect() returned.
gc.collect()

0

# Tiny Dataset

In [4]:
# Take the first 10,000 training rows (features and labels) as independent copies.
train_tiny_X = train_X.iloc[:10000].copy()
train_tiny_y = train_y.iloc[:10000].copy()

In [5]:
# Take the first 1,000 validation rows (features and labels) as independent copies.
val_tiny_X = val_X[:1000].copy()
val_tiny_y = val_y[:1000].copy()

In [6]:
# Preview the first rows of the tiny training features.
train_tiny_X.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,msno,song_id,source_screen_name,source_system_tab,source_type,song_length,genre_ids,artist_name,composer,lyricist,language,city,bd,gender,registered_via,time,registration_init_time_int,expiration_date_int
0,0,2942719,8145,253733,16,6,8,267517.0,371,4252,8389,2681,9,0,0,2,1,0.296221,6193,6196
1,1,4875524,5224,145235,16,6,8,200620.0,371,34892,74276,26024,6,3,41,1,2,0.490781,5000,6475
2,2,6589819,5474,22231,11,0,7,213342.0,371,20609,27775,9110,9,0,0,2,1,0.663346,6150,6436
3,3,1172060,23177,70181,8,3,3,262246.0,371,44425,83027,34734,2,0,0,2,0,0.117982,5181,6498
4,4,2069395,3269,128141,12,2,2,310753.0,371,42400,81151,32836,2,0,0,2,2,0.20831,4393,6275


In [8]:
# Stack the tiny train and validation rows so the MCA is fitted over both.
# ignore_index=True gives the stacked frame a fresh 0..n-1 row index; without it
# the row labels of the two slices overlap, which makes any later label-based
# alignment (e.g. an axis=1 concat) ambiguous.
tiny_X_all = pd.concat([train_tiny_X, val_tiny_X], ignore_index=True)
tiny_X_all.shape

(11000, 20)

In [10]:
# One MCA per feature group: 6 components for song, 4 for user, 3 for UI columns.
mca_song_tiny_fit, mca_song_tiny_transform = mca_transform(
    df=tiny_X_all, feature=song_features, n_comp=6
)
mca_user_tiny_fit, mca_user_tiny_transform = mca_transform(
    df=tiny_X_all, feature=user_features, n_comp=4
)
mca_ui_tiny_fit, mca_ui_tiny_transform = mca_transform(
    df=tiny_X_all, feature=ui_features, n_comp=3
)

In [29]:
# Eigenvalues of the song, user and UI fits, one per output line.
print(
    mca_song_tiny_fit.eigenvalues_,
    mca_user_tiny_fit.eigenvalues_,
    mca_ui_tiny_fit.eigenvalues_,
    sep='\n',
)

[0.6781931102918013, 0.6716826044192571, 0.661456515227754, 0.6580465654469605, 0.6496924942356715, 0.6422578906306438]
[0.7549708709036239, 0.3840739546418269, 0.3572200452476017, 0.35549666945672825]
[0.9825064669000568, 0.9538624661367507, 0.9353457708381208]


In [11]:
# Label the MCA coordinates with a group prefix and a 1-based component number.
mca_song_tiny_transform.columns = [f'sf{k}' for k in range(1, 7)]
mca_user_tiny_transform.columns = [f'us{k}' for k in range(1, 5)]
mca_ui_tiny_transform.columns = [f'ui{k}' for k in range(1, 4)]

In [12]:
# Preview the first rows of the song-group MCA coordinates.
mca_song_tiny_transform.head()

Unnamed: 0,sf1,sf2,sf3,sf4,sf5,sf6
0,-1.264929,0.586057,1.150332,1.241411,0.121936,-0.130327
1,0.020211,0.42711,-0.045508,-0.401574,-0.110475,0.011278
2,-0.354512,-0.071367,-1.735856,-2.082567,0.794944,0.159937
3,-0.37244,0.216244,-0.160072,-0.392358,0.939935,0.274837
4,-1.052685,-0.047159,0.637116,-0.255817,0.231235,0.124874


In [13]:
# Remove the raw categorical columns now that their MCA coordinates exist.
# errors='ignore' keeps this cell re-runnable: the name is reassigned in place,
# so a second run would otherwise raise KeyError on the already-dropped columns.
tiny_X_all = tiny_X_all.drop(columns=cat_features, errors='ignore')
tiny_X_all.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,song_length,bd,time,registration_init_time_int,expiration_date_int
0,0,2942719,267517.0,0,0.296221,6193,6196
1,1,4875524,200620.0,41,0.490781,5000,6475
2,2,6589819,213342.0,0,0.663346,6150,6436
3,3,1172060,262246.0,0,0.117982,5181,6498
4,4,2069395,310753.0,0,0.20831,4393,6275


In [14]:
# Put the remaining columns and the three MCA coordinate blocks side by side.
tiny_X_all_2 = pd.concat(
    [tiny_X_all, mca_song_tiny_transform, mca_user_tiny_transform, mca_ui_tiny_transform],
    axis=1,
)
# An axis=1 concat aligns on row labels; mismatched labels would add NaN-padded
# rows instead of failing, so verify that the row count is unchanged.
assert len(tiny_X_all_2) == len(tiny_X_all), (
    "row labels of the MCA outputs do not match tiny_X_all"
)
tiny_X_all_2.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,song_length,bd,time,registration_init_time_int,expiration_date_int,sf1,sf2,sf3,sf4,sf5,sf6,us1,us2,us3,us4,ui1,ui2,ui3
0,0,2942719,267517.0,0,0.296221,6193,6196,-1.264929,0.586057,1.150332,1.241411,0.121936,-0.130327,1.086989,0.139762,0.138821,-0.136126,3.807323,-0.037351,-0.176017
1,1,4875524,200620.0,41,0.490781,5000,6475,0.020211,0.42711,-0.045508,-0.401574,-0.110475,0.011278,-0.291223,-0.418983,-0.418402,-0.182379,3.807323,-0.037351,-0.176017
2,2,6589819,213342.0,0,0.663346,6150,6436,-0.354512,-0.071367,-1.735856,-2.082567,0.794944,0.159937,0.922629,0.102376,0.373968,-0.973457,-0.208867,0.267235,1.260445
3,3,1172060,262246.0,0,0.117982,5181,6498,-0.37244,0.216244,-0.160072,-0.392358,0.939935,0.274837,0.378772,-0.24447,0.036884,0.151021,-0.309365,-0.553155,-0.836436
4,4,2069395,310753.0,0,0.20831,4393,6275,-1.052685,-0.047159,0.637116,-0.255817,0.231235,0.124874,1.200798,-0.13032,-0.204738,-0.360238,-0.311507,5.302674,-2.109578


In [20]:
# Split the stacked frame back into train and validation parts by position.
# The boundary is taken from the training labels instead of repeating the
# literal 10000, so it stays correct if the tiny sample size is changed.
n_tiny_train = len(train_tiny_y)
train_tiny_X = tiny_X_all_2.iloc[:n_tiny_train]
val_tiny_X = tiny_X_all_2.iloc[n_tiny_train:]
print(val_tiny_X.shape)

(1000, 20)


In [21]:
# Preview the first rows of the tiny training features after the MCA columns were added.
train_tiny_X.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,song_length,bd,time,registration_init_time_int,expiration_date_int,sf1,sf2,sf3,sf4,sf5,sf6,us1,us2,us3,us4,ui1,ui2,ui3
0,0,2942719,267517.0,0,0.296221,6193,6196,-1.264929,0.586057,1.150332,1.241411,0.121936,-0.130327,1.086989,0.139762,0.138821,-0.136126,3.807323,-0.037351,-0.176017
1,1,4875524,200620.0,41,0.490781,5000,6475,0.020211,0.42711,-0.045508,-0.401574,-0.110475,0.011278,-0.291223,-0.418983,-0.418402,-0.182379,3.807323,-0.037351,-0.176017
2,2,6589819,213342.0,0,0.663346,6150,6436,-0.354512,-0.071367,-1.735856,-2.082567,0.794944,0.159937,0.922629,0.102376,0.373968,-0.973457,-0.208867,0.267235,1.260445
3,3,1172060,262246.0,0,0.117982,5181,6498,-0.37244,0.216244,-0.160072,-0.392358,0.939935,0.274837,0.378772,-0.24447,0.036884,0.151021,-0.309365,-0.553155,-0.836436
4,4,2069395,310753.0,0,0.20831,4393,6275,-1.052685,-0.047159,0.637116,-0.255817,0.231235,0.124874,1.200798,-0.13032,-0.204738,-0.360238,-0.311507,5.302674,-2.109578


In [22]:
# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs('output', exist_ok=True)
# Write the tiny splits without the row index so no extra index column is stored.
train_tiny_X.to_csv('output/mca_train_tiny_X.csv', index=False)
val_tiny_X.to_csv('output/mca_val_tiny_X.csv', index=False)
train_tiny_y.to_csv('output/mca_train_tiny_y.csv', index=False)
val_tiny_y.to_csv('output/mca_val_tiny_y.csv', index=False)

In [23]:
# Read the four tiny CSV files back from the output folder into the same variable names.
train_tiny_X = pd.read_csv('output/mca_train_tiny_X.csv')
train_tiny_y = pd.read_csv('output/mca_train_tiny_y.csv')
val_tiny_X = pd.read_csv('output/mca_val_tiny_X.csv')
val_tiny_y = pd.read_csv('output/mca_val_tiny_y.csv')

In [24]:
# Preview the first rows of the tiny training labels.
train_tiny_y.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,target
0,0,2942719,0.0
1,1,4875524,0.0
2,2,6589819,0.0
3,3,1172060,1.0
4,4,2069395,0.0


In [26]:
# (rows, columns) of the tiny training features.
train_tiny_X.shape

(10000, 20)

In [21]:
# catCols = ['msno', 'song_id', 'source_screen_name', 'source_system_tab', 'source_type', 'genre_ids', 'artist_name', 
#            'composer', 'lyricist', 'language', 'city', 'gender', 'registered_via']
# numCols = ['Unnamed: 0','Unnamed: 0.1','bd', 'song_length','time','registration_init_time_int','expiration_date_int']

# Full Dataset

In [7]:
# Stack the full train and validation rows so the MCA is fitted over both.
# ignore_index=True gives the stacked frame a fresh 0..n-1 row index; without it
# the row labels of the two frames overlap, which makes any later label-based
# alignment (e.g. an axis=1 concat) ambiguous.
train_X_all = pd.concat([train_X, val_X], ignore_index=True)
print(train_X_all.shape, train_X.shape, val_X.shape)

(5901843, 20) (4426382, 20) (1475461, 20)


In [12]:
# Same three MCA fits as for the tiny subset (6 song, 4 user, 3 UI components),
# now on the stacked full train + validation frame.
# NOTE(review): the recorded output of this cell is a MemoryError, and the cells
# that follow have no execution count, so the full-dataset results were not
# produced in this run. Presumably the high-cardinality id columns are the cause
# - confirm before re-running (e.g. by fitting on a sample or on fewer columns).
mca_song_fit, mca_song_transform =  mca_transform(train_X_all,song_features,6)
mca_user_fit, mca_user_transform =  mca_transform(train_X_all,user_features,4)
mca_ui_fit, mca_ui_transform =  mca_transform(train_X_all,ui_features,3)

MemoryError: 

In [None]:
# Eigenvalues of the song, user and UI fits, one per output line.
print(
    mca_song_fit.eigenvalues_,
    mca_user_fit.eigenvalues_,
    mca_ui_fit.eigenvalues_,
    sep='\n',
)

In [None]:
# Label the MCA coordinates with a group prefix and a 1-based component number.
mca_song_transform.columns = [f'sf{k}' for k in range(1, 7)]
mca_user_transform.columns = [f'us{k}' for k in range(1, 5)]
mca_ui_transform.columns = [f'ui{k}' for k in range(1, 4)]

In [None]:
# Preview the first rows of the song-group MCA coordinates.
mca_song_transform.head()

In [None]:
# Remove the raw categorical columns now that their MCA coordinates exist.
# errors='ignore' keeps this cell re-runnable: the name is reassigned in place,
# so a second run would otherwise raise KeyError on the already-dropped columns.
train_X_all = train_X_all.drop(columns=cat_features, errors='ignore')
train_X_all.head()

In [None]:
# Put the remaining columns and the three MCA coordinate blocks side by side.
n_rows_before = len(train_X_all)
train_X_all = pd.concat(
    [train_X_all, mca_song_transform, mca_user_transform, mca_ui_transform],
    axis=1,
)
# An axis=1 concat aligns on row labels; mismatched labels would add NaN-padded
# rows instead of failing, so verify that the row count is unchanged.
assert len(train_X_all) == n_rows_before, (
    "row labels of the MCA outputs do not match train_X_all"
)
train_X_all.head()

In [None]:
# Split the stacked frame back by position: the leading rows become the training
# features, everything after them the validation features.
train_X = train_X_all.iloc[:train_X.shape[0]]
# train_X has already been reassigned here; its row count is what marks the boundary.
val_X = train_X_all.iloc[train_X.shape[0]:]
print(train_X.shape, val_X.shape)

In [None]:
# Preview the first rows of the full training features.
train_X.head()

In [None]:
# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs('output', exist_ok=True)
# Write the full splits without the row index so no extra index column is stored.
train_X.to_csv('output/mca_train_X.csv', index=False)
val_X.to_csv('output/mca_val_X.csv', index=False)
train_y.to_csv('output/mca_train_y.csv', index=False)
val_y.to_csv('output/mca_val_y.csv', index=False)