<a href="https://colab.research.google.com/github/tunaemirkandemir/movielens_lightfm/blob/main/movielens_lightfm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
!pip install lightfm



In [101]:
import os
import sys
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k

In [39]:
# Select MovieLens data size
MOVIELENS_DATA_SIZE = '100k'

# default number of recommendations
K = 10
# percentage of data used for testing
TEST_PERCENTAGE = 0.25
# model learning rate
LEARNING_RATE = 0.25
# no of latent factors
NO_COMPONENTS = 20
# no of epochs to fit model
NO_EPOCHS = 20
# no of threads to fit model
NO_THREADS = 32
# regularisation for both user and item features
ITEM_ALPHA = 1e-6
USER_ALPHA = 1e-6

# seed for pseudonumber generations
SEED = 42

In [40]:
data =pd.read_csv('/content/u.data',sep='\t',header=None)

In [42]:
headers =  ["userID", "movie id", "rating","timestemp"]
data.columns = headers

In [44]:
data = data.drop(columns={'timestemp'})

In [53]:
data

Unnamed: 0,userID,movie id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [45]:
item_data = pd.read_csv('/content/u.item',sep='|',header = None,encoding='latin-1')

In [46]:
item_data = item_data.drop(columns={1,2,3,4})

In [47]:
item_data.head()

Unnamed: 0,0,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [48]:
item_data.columns = ['movie id', 'unknown', 'Action',
                'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
                'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

In [49]:
item_data

Unnamed: 0,movie id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,1679,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,1680,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,1681,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [67]:
def process_data(df1, df2):
    for index, row in df1.iterrows():
        new_string_values = []
        for col, value in row.iloc[1:-1].items():  # Exclude the 'movie id' column
            if value == 1:
                new_string_values.append(str(col))

        new_col_string = '|'.join(new_string_values)


        for movie_id, genre in df2['movie id'].items():
            if movie_id == row['movie id']:
                if 'genre' in df2:
                    df2.at[movie_id, 'genre'] = new_col_string
                else:
                    df2['genre'] = ''  # Create 'genre' column if it doesn't exist
                    df2.at[movie_id, 'genre'] = new_col_string


In [55]:
data2 = data.head(1000)
item_data2 = item_data.copy()

In [60]:
data2.insert(3,'genre','' )

In [61]:
data2

Unnamed: 0,userID,movie id,rating,genre
0,196,242,3,
1,186,302,3,
2,22,377,1,
3,244,51,2,
4,166,346,1,
...,...,...,...,...
995,194,77,3,
996,208,663,5,
997,109,178,3,
998,230,172,4,


In [69]:
process_data(item_data2,data2)

In [78]:
data2 = data2.drop(0)

In [79]:
data2

Unnamed: 0,userID,movie id,rating,genre
1,186,302,3,Animation|Children's|Comedy
2,22,377,1,Action|Adventure|Thriller
3,244,51,2,Thriller
4,166,346,1,Action|Comedy|Drama
5,298,474,4,Crime|Drama|Thriller
...,...,...,...,...
995,194,77,3,Comedy
996,208,663,5,Children's|Comedy
997,109,178,3,Comedy
998,230,172,4,Comedy


In [128]:
data2['userID'].unique().shape

(249,)

In [80]:

columns = ['userID','age','gender','occupation','zipcode']
user_data = pd.read_csv('/content/u.user', sep='|', header=None, names=columns)



In [81]:
user_data

Unnamed: 0,userID,age,gender,occupation,zipcode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [82]:
# merging user feature with existing data
new_data = data2.merge(user_data[['userID','occupation']], left_on='userID', right_on='userID')
# quick look at the merged data
new_data.sample(5)

Unnamed: 0,userID,movie id,rating,genre,occupation
570,82,56,3,Comedy|Romance,programmer
457,11,558,3,Drama,other
710,125,478,4,Action|Thriller,lawyer
84,224,583,1,Documentary,educator
116,291,686,5,Comedy,student


In [83]:
all_jobs = sorted(list(set(new_data['occupation'])))

In [84]:
all_jobs

['administrator',
 'artist',
 'doctor',
 'educator',
 'engineer',
 'entertainment',
 'executive',
 'healthcare',
 'homemaker',
 'lawyer',
 'librarian',
 'marketing',
 'none',
 'other',
 'programmer',
 'retired',
 'salesman',
 'scientist',
 'student',
 'technician',
 'writer']

In [85]:
movie_genre = [x.split('|') for x in data2['genre']]
all_movie_genre = sorted(list(set(itertools.chain.from_iterable(movie_genre))))
# quick look at the all the genres within the data
all_movie_genre

['Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western',
 'unknown']

In [119]:
dataset = Dataset()
dataset.fit(data2['userID'],
            data2['itemID'],
            item_features=all_movie_genre,
            user_features=all_jobs)

In [169]:
len(dataset.mapping())

4

In [170]:
user_mappings = dataset.mapping()[0]
item_mappings = dataset.mapping()[2]

len(user_mappings), len(item_mappings)

(249, 551)

In [175]:
# Have a look at the mappings

print(list(user_mappings.items())[:5]) # first 5 mappings
print(list(item_mappings.items())[:5])

[(186, 0), (22, 1), (244, 2), (166, 3), (298, 4)]
[(302, 0), (377, 1), (51, 2), (346, 3), (474, 4)]


In [176]:
inv_user_mappings = {v:k for k, v in user_mappings.items()}
inv_item_mappings = {v:k for k, v in item_mappings.items()}
print(list(inv_user_mappings.items())[:5])
print(list(inv_item_mappings.items())[:5])

[(0, 186), (1, 22), (2, 244), (3, 166), (4, 298)]
[(0, 302), (1, 377), (2, 51), (3, 346), (4, 474)]


In [118]:
data2 = data2.rename(columns={"movie id": "itemID"})

In [120]:
item_features = dataset.build_item_features((x, y) for x,y in zip(data2.itemID, movie_genre))
user_features = dataset.build_user_features((x, [y]) for x,y in zip(new_data.userID, new_data['occupation']))

In [121]:
item_features
user_features

<249x270 sparse matrix of type '<class 'numpy.float32'>'
	with 498 stored elements in Compressed Sparse Row format>

In [122]:
interactions2, weights2 = dataset.build_interactions(data2.iloc[:, 0:3].values)

train_interactions2, test_interactions2 = cross_validation.random_train_test_split(
    interactions2,
    test_percentage=TEST_PERCENTAGE

)

In [141]:
model = LightFM(loss='warp',
                 no_components=NO_COMPONENTS,
                 learning_rate=LEARNING_RATE,
                 item_alpha=ITEM_ALPHA,
                 user_alpha=USER_ALPHA,
                k=K
                )

In [142]:
model.fit(interactions=train_interactions2,
           user_features=user_features,
           item_features=item_features,
           epochs=NO_EPOCHS
           )

<lightfm.lightfm.LightFM at 0x7ec076f22650>

In [130]:
item_data

Unnamed: 0,movie id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,1679,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,1680,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,1681,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [154]:
train_interactions2

<249x551 sparse matrix of type '<class 'numpy.float32'>'
	with 749 stored elements in COOrdinate format>

In [157]:
n_users , n_items = train_interactions2.shape

scoring_user_ids = np.concatenate([np.full((n_items, ), i) for i in range(n_users)]) # repeat user ID for number of prods
scoring_item_ids = np.concatenate([np.arange(n_items) for i in range(n_users)]) # repeat entire range of item IDs x number of user


scores = model.predict(user_ids = scoring_user_ids,
                       item_ids = scoring_item_ids)

scores = scores.reshape(-1, n_items) # get 1 row per user
recommendations = pd.DataFrame(scores)
recommendations.shape

# Have a look at the predicted scores for the first 5 users and first 5 items
recommendations.iloc[:5,:5]

Unnamed: 0,0,1,2,3,4
0,-0.81847,-4.700586,-4.944959,-7.383132,-3.840201
1,-2.306543,-3.672745,-2.753914,-2.459066,-1.602107
2,-3.559994,-6.737756,-3.654866,-2.783844,-2.198302
3,4.192063,-6.130074,-11.849838,0.785175,1.504939
4,-5.491751,-5.742009,-6.396718,-5.964223,-2.903533


In [160]:
scores.shape

(249, 551)

In [168]:
# Load latent representations to try computing predictions manually
item_biases, item_embeddings = model.get_item_representations()
user_biases, user_embeddings = model.get_user_representations()

#Combine item_embeddings with biases for dot product
manual_scores = ((user_embeddings @ item_embeddings.T + item_biases).T + user_biases).T
manual_scores.shape
manual_scores = manual_scores[:249, :551]
# They match apart from some tiny rounding!
np.allclose(manual_scores, scores, rtol=0, atol=1e-5)

True

In [179]:
# Top 10 predictions for every user

top_10 = np.argsort(-scores, axis=1) [::, :K]

# Get the previous purchases for every user
previous = np.array(train_interactions2.todense())

# Get the previous purchases and the top predictions for user 2
user = user_mappings.get(2)

print("Previous purchases:", *[inv_item_mappings.get(key) for key in np.array(range(previous.shape[1]))[previous[user]>0]], sep="\n")
print("Top 10 recommendations:", *sorted(zip([inv_item_mappings.get(key) for key in top_10[user]], range(K)), key = lambda x: x[1]), sep="\n")

Previous purchases:
292
251
Top 10 recommendations:
(198, 0)
(1, 1)
(292, 2)
(367, 3)
(522, 4)
(153, 5)
(749, 6)
(507, 7)
(430, 8)
(323, 9)
