#### Data Preparation and PCA

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
%store -r condensed

In [3]:
def create_dummy(df, column, split):
    f_set = set()
    for feature in df[column]:
        feature_list = feature.split(split)
        for f in feature_list:
            f_set.add(f)
    f_lst = list(f_set)
    for item in f_lst:
        df[item] = df[column].apply(lambda x: 1 if item in x else 0)
    new_df = df.drop(columns = column)
    return new_df

In [4]:
preprocessed = condensed.copy()

In [5]:
preprocessed.drop(columns = "title", inplace=True)

In [6]:
preprocessed = create_dummy(preprocessed, "genresb", '|')

In [8]:
preprocessed = create_dummy(preprocessed, "directors", ',')

MemoryError: Unable to allocate 3.50 GiB for an array with shape (4661, 100760) and data type int64

In [None]:
features = preprocessed.drop(columns = ["userId", "movieId", "rating"])

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
scaler = StandardScaler()
features_transformed = scaler.fit_transform(features)

In [None]:
pca = PCA(n_components=21)
principalComponents = pca.fit_transform(features_transformed)

In [None]:
pca.explained_variance_ratio_.sum()

In [None]:
components_df = pd.DataFrame(principalComponents, columns = ["PC" + str(num) for num in range(1, 22)])

In [None]:
from lightfm.data import Dataset
from lightfm import LightFM

In [None]:
consolidated = pd.concat([user_item.reset_index(), components_df], axis=1)

In [None]:
consolidated.head()

In [None]:
item_df = consolidated.groupby("movieId").mean().drop(columns = ["index", "userId", "rating"], axis = 1).reset_index()

In [None]:
def binning(df, features, bins):
    new_df = pd.DataFrame([])
    for feature in features:
        fdf = pd.cut(x = df[feature], bins=bins)
        new = pd.concat([new_df, fdf], axis=1)
        new_df = new
    return new_df

In [None]:
binned_pc = binning(item_df, ["PC"+str(num) for num in range(1, 22)], 6)

In [None]:
binned_items = pd.concat([item_df["movieId"], binned_pc], axis=1)

In [None]:
features_df = pd.get_dummies(binned_items).set_index("movieId")

#### Modeling with LightFM

In [None]:
item_features=list(features_df.columns)
item_features[:5]

In [None]:
data_fm = Dataset()
data_fm.fit(user_item.userId.unique(), user_item.movieId.unique(), item_features = item_features)

In [None]:
interactions_matrix, weights_matrix = data_fm.build_interactions([tuple(i) for i in user_item.values])

print(repr(interactions_matrix))

In [None]:
feature_dict = dict(list(features_df.groupby(features_df.index)))

In [None]:
l = []
for k, v in feature_dict.items():
    check = v.columns[(v == 1).any()]
    if len(check) > 0:
        l.append((k, check.to_list()))

In [None]:
features_tuples = l

In [None]:
features_matrix = data_fm.build_item_features(features_tuples)

print(repr(features_matrix))

In [None]:
model = LightFM(loss='warp')
model.fit(interactions_matrix, item_features=features_matrix)

In [None]:
score = auc_score(model, interactions_matrix, 
        item_features=features_matrix).mean()