In [None]:
%matplotlib inline

In [None]:
# import default scientific libraries
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

In [None]:
# import pandas for easy data management, keras for deep learning and scikit for feature generation
import pandas as pd
import keras
import keras.models as km
import keras.layers as kl
import keras.optimizers as ko
import mca
from sklearn import metrics, model_selection, manifold, decomposition

In [None]:
# Load the ratings table and derive numeric user / item columns from the Id column.
# The regex expects Ids of the form 'r<digits>_c<digits>'; the first captured group
# is stored as User and the second as Item.
# The builtin `int` is used instead of `np.int`: that alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
ratings = pd.read_csv('data/data_train.csv', dtype={'Prediction': int})
pos = ratings.Id.str.extract('r([0-9]+)_c([0-9]+)', expand=True)
ratings['User'] = pos[0].astype(int)
ratings['Item'] = pos[1].astype(int)
ratings.head()

In [None]:
# Split the ratings into a training set (90%) and a validation set (10%).
# random_state is fixed so that a kernel restart reproduces the same split;
# otherwise every re-run would overwrite the saved CSVs and the derived
# features with data from a different random partition.
train_x, test_x = model_selection.train_test_split(
    ratings,
    test_size=0.1,
    random_state=42,
)

In [None]:
# save the train and validation splits (train_x, test_x) as CSV files

In [None]:
# write the training split to disk, leaving out the DataFrame index column
train_x.to_csv(path_or_buf='train_x.csv', index=False)

In [None]:
# write the validation split to disk, leaving out the DataFrame index column
test_x.to_csv(path_or_buf='test_x.csv', index=False)

In [None]:
# Build a dense user x item rating matrix from the training split for feature
# engineering. User / Item numbers are shifted by one to become 0-based indices;
# (user, item) pairs that are absent from train_x end up as 0.
# The shape is taken from the full `ratings` table. Without an explicit shape,
# scipy infers it from the largest indices present in train_x, so a split that
# happens to leave out the highest-numbered user or item would silently yield a
# smaller matrix.
ratings_mat = sp.sparse.csr_matrix(
    (train_x.Prediction.values, (train_x.User.values - 1, train_x.Item.values - 1)),
    shape=(ratings.User.max(), ratings.Item.max()),
).toarray()

In [None]:
# Compute a 128-dimensional spectral embedding of the rows of the rating matrix.
# random_state is fixed so the embedding can be reproduced between runs.
# NOTE(review): ratings_mat is built as user x item, so fit_transform returns one
# row per user even though the result is named `items` — confirm this matches
# how the saved array is indexed downstream.
items = manifold.SpectralEmbedding(
    n_components=128,
    n_jobs=16,
    random_state=42,
).fit_transform(ratings_mat)

In [None]:
# Compute a 128-dimensional spectral embedding of the rows of the transposed
# rating matrix. random_state is fixed so the embedding can be reproduced
# between runs.
# NOTE(review): ratings_mat is built as user x item, so its transpose has one row
# per item and the result named `users` holds one row per item — confirm this
# matches how the saved array is indexed downstream.
users = manifold.SpectralEmbedding(
    n_components=128,
    n_jobs=16,
    random_state=42,
).fit_transform(ratings_mat.T)

In [None]:
# save feature set 1: the spectral-embedding representations (items, users)

In [None]:
# persist the spectral-embedding array `items` (np.save appends the .npy suffix)
np.save(file='items', arr=items)

In [None]:
# persist the spectral-embedding array `users` (np.save appends the .npy suffix)
np.save(file='users', arr=users)

In [None]:
# Reduce the rows of the rating matrix to 128 latent factors with factor analysis.
# NOTE(review): ratings_mat is built as user x item, so the rows transformed here
# are users although the result is named `items2` — confirm against downstream use.
items2 = (
    decomposition
    .FactorAnalysis(n_components=128, iterated_power=5)
    .fit_transform(ratings_mat)
)

In [None]:
# Reduce the rows of the transposed rating matrix to 128 latent factors with
# factor analysis.
# NOTE(review): ratings_mat is built as user x item, so the rows of its transpose
# are items although the result is named `users2` — confirm against downstream use.
users2 = (
    decomposition
    .FactorAnalysis(n_components=128, iterated_power=5)
    .fit_transform(ratings_mat.T)
)

In [None]:
# save feature set 2: the factor-analysis representations (items2, users2)

In [None]:
# persist the factor-analysis array `items2` (np.save appends the .npy suffix)
np.save(file='items2', arr=items2)

In [None]:
# persist the factor-analysis array `users2` (np.save appends the .npy suffix)
np.save(file='users2', arr=users2)

In [None]:
# Derive row factor scores of the rating matrix with multiple correspondence
# analysis (MCA).
# NOTE(review): ratings_mat is built as user x item, so the rows scored here are
# users although the result is named `items3` — confirm against downstream use.
# The positional 1 passed to fs_r is presumably the explained-variance fraction
# (i.e. keep all factors) — TODO confirm against the mca package documentation.
items3 = mca.MCA(
    pd.DataFrame(ratings_mat)
).fs_r(1)

In [None]:
# Derive row factor scores of the transposed rating matrix with multiple
# correspondence analysis (MCA).
# NOTE(review): ratings_mat is built as user x item, so the rows of its transpose
# are items although the result is named `users3` — confirm against downstream use.
# The positional 1 passed to fs_r is presumably the explained-variance fraction
# (i.e. keep all factors) — TODO confirm against the mca package documentation.
users3 = mca.MCA(
    pd.DataFrame(ratings_mat.T)
).fs_r(1)

In [None]:
# save feature set 3: the MCA representations (items3, users3)

In [None]:
# persist the MCA factor scores `items3` (np.save appends the .npy suffix)
np.save(file='items3', arr=items3)

In [None]:
# persist the MCA factor scores `users3` (np.save appends the .npy suffix)
np.save(file='users3', arr=users3)