# Requirements

In [157]:
%%capture
%pip install numpy -q
%pip install pandas -q
%pip install matplotlib -q
%pip install networkx -q
%pip install torch -q
%pip install torch_geometric -q
%pip install tqdm -q
%pip install scipy -q
%pip install scikit-learn -q
%pip install gensim -q
%pip install nltk -q

In [1]:
# Standard library imports
import random
import time

# Third-party imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
import torch_geometric
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import degree

from tqdm.notebook import tqdm
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

# Data exploration

### Unzip dataset

In [2]:
import zipfile

file = 'data/raw/ml-100k.zip'
target_dir = 'data/interim/'

with zipfile.ZipFile(file, 'r') as zip_ref:
    zip_ref.extractall(target_dir)

print(f"Successfully extracted to {target_dir}")

Successfully extracted to data/interim/


### Load dataset

In [34]:
import os
import pandas as pd

def load_data():
    ml_100k_folder = 'data/interim/ml-100k/'

    user_file = 'u.user'
    item_file = 'u.item'
    data_file = 'u.data'
    genre_file = 'u.genre'
    info_file = 'u.info'
    occupation_file = 'u.occupation'

    # column names
    user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
    data_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

    # Load data into Pandas DataFrames
    users = pd.read_csv(os.path.join(ml_100k_folder, user_file), sep='|', names=user_cols)
    data = pd.read_csv(os.path.join(ml_100k_folder, data_file), sep='\t', names=data_cols)

    genre = pd.read_csv(os.path.join(ml_100k_folder, genre_file), sep='|', header=None, names=['genre_id', 'genre'])
    info = pd.read_csv(os.path.join(ml_100k_folder, info_file), sep=' ', header=None, names=['info'])
    occupation = pd.read_csv(os.path.join(ml_100k_folder, occupation_file), header=None, names=['occupation'])

    item_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url', *[g[0] for g in genre.values]]
    items = pd.read_csv(os.path.join(ml_100k_folder, item_file), sep='|', names=item_cols, encoding='latin-1')

    return users, items, data, genre, info, occupation


users_df, items_df, ratings_df, genre_df, info_df, occupation_df = load_data()

# Print the first few rows of each DataFrame to verify the data loading
print("Users DataFrame:")
print(users_df.head())

print("\nItems DataFrame:")
print(items_df.head())

print("\nRatings DataFrame:")
print(ratings_df.head())

print("\nGenre DataFrame:")
print(genre_df)

print("\nInfo DataFrame:")
print(info_df)

print("\nOccupation DataFrame:")
print(occupation_df)

Users DataFrame:
   user_id  age gender  occupation zip_code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213

Items DataFrame:
   movie_id              title release_date  video_release_date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            imdb_url  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  htt

In [35]:
items_df[items_df['video_release_date'].notna()]

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western


In [82]:
users = pd.merge(ratings_df, users_df, on='user_id')
users_items = pd.merge(users, items_df, on='movie_id')
l = users['zip_code'].apply(lambda x: x[0]).unique()
l.sort()
l, len(l)

(array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'E', 'K', 'L',
        'M', 'N', 'R', 'T', 'V', 'Y'], dtype=object),
 19)

In [114]:
users_items['timestamp'] = pd.to_datetime(users_items['timestamp'], unit='s')
users_items['year'] = users_items['timestamp'].dt.year
users_items['month'] = users_items['timestamp'].dt.month

In [142]:
users_items['zip_code'].unique()

array(['55105', '94086', '98101', '94702', '75240', '21218', 'E2A4H',
       '84103', '29206', '85251', 'V3N4P', '78155', '22932', '37212',
       '85711', '92037', '55106', '95032', '20008', '16803', '53703',
       '63119', '20784', '20910', '30068', '95628', '90630', '07039',
       '15237', '92103', '73132', '22306', '94043', '06371', '93555',
       '17604', '60201', '53115', '42459', '42141', '10960', '55436',
       '85710', '01002', '06405', '27514', '91344', '46005', '08904',
       '70802', '48197', '98027', '60135', '92626', '61755', '48103',
       '55414', '48076', '20817', '48446', '60630', '98102', '75218',
       '94708', '93101', '94618', '50112', '94305', '04102', '12603',
       '92705', '95821', '02324', '22902', '80303', '14627', '53188',
       '01915', '97520', 'M4J2K', '84116', '11201', '30067', '94306',
       '12065', '02903', '60657', '60626', '22906', '77042', '90405',
       '19146', '38115', '28814', '11238', '40243', '16506', '92660',
       '22206', '857

In [262]:
users_items[['user_id', 'movie_id']].duplicated().sum()

0

In [145]:
users_items['rating'].value_counts()

rating
4    34174
3    27145
5    21201
2    11370
1     6110
Name: count, dtype: int64

In [118]:
users_items['year'].unique()

array([1997, 1998], dtype=int32)

### Conclusions

- All video_release_date values are None


# Basic data preprocessing

### Encoding


In [304]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

preprocessed = users_items.drop(columns=['video_release_date', 'imdb_url'])
preprocessed = pd.get_dummies(preprocessed, columns=['occupation'], prefix='ocp', drop_first=True)
preprocessed = pd.get_dummies(preprocessed, columns=['gender'], prefix='gender', drop_first=True)
preprocessed['age'] = scaler.fit_transform(preprocessed[['age']])
preprocessed['year'] = scaler.fit_transform(preprocessed[['year']])
preprocessed['month'] = scaler.fit_transform(preprocessed[['month']])
preprocessed['zipcode'] = preprocessed['zip_code'].apply(lambda z: z[0])
preprocessed = pd.get_dummies(preprocessed, columns=['zipcode'], prefix='zipcode', drop_first=True)
preprocessed['date'] = pd.to_datetime(preprocessed['release_date'], format='%d-%b-%Y')
preprocessed['date'] = (preprocessed['date'].max() - preprocessed['date']).dt.days
preprocessed['date'] = scaler.fit_transform(preprocessed[['date']])
preprocessed['date'] = preprocessed['date'].fillna(preprocessed['date'].mean())

preprocessed.drop(columns=['timestamp', 'zip_code', 'user_id', 'movie_id', 'release_date'], inplace=True)
# preprocessed[preprocessed['release_date'].isna()]
preprocessed[preprocessed['title'] == 'unknown']
# preprocessed

Unnamed: 0,rating,age,title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,zipcode_E,zipcode_K,zipcode_L,zipcode_M,zipcode_N,zipcode_R,zipcode_T,zipcode_V,zipcode_Y,date
99312,3,0.257576,unknown,1,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,0.139337
99313,4,0.393939,unknown,1,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,0.139337
99314,5,0.19697,unknown,1,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,0.139337
99315,4,0.257576,unknown,1,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,0.139337
99316,3,0.333333,unknown,1,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,0.139337
99317,4,0.469697,unknown,1,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,0.139337
99318,4,0.287879,unknown,1,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,0.139337
99319,3,0.19697,unknown,1,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,0.139337
99320,1,0.409091,unknown,1,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,0.139337


### Title tokenization

In [305]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

tokenized_titles = [word_tokenize(title.lower()) for title in preprocessed['title']]
word2vec_model = Word2Vec(sentences=tokenized_titles, vector_size=50, window=5, min_count=1)

# Get the vector representation of a title
vector_representation = word2vec_model.wv

preprocessed['title'] = preprocessed['title'].apply(lambda t: vector_representation[word_tokenize(t.lower())].mean(axis=0))
preprocessed = pd.concat([preprocessed, preprocessed['title'].apply(pd.Series)], axis=1)
preprocessed = preprocessed.drop(columns=['title'])


[nltk_data] Downloading package punkt to /home/anaconda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [306]:
print(f"Columns: {list(preprocessed.columns)}")
preprocessed.head()

Columns: ['rating', 'age', 'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'year', 'month', 'ocp_artist', 'ocp_doctor', 'ocp_educator', 'ocp_engineer', 'ocp_entertainment', 'ocp_executive', 'ocp_healthcare', 'ocp_homemaker', 'ocp_lawyer', 'ocp_librarian', 'ocp_marketing', 'ocp_none', 'ocp_other', 'ocp_programmer', 'ocp_retired', 'ocp_salesman', 'ocp_scientist', 'ocp_student', 'ocp_technician', 'ocp_writer', 'gender_M', 'zipcode_1', 'zipcode_2', 'zipcode_3', 'zipcode_4', 'zipcode_5', 'zipcode_6', 'zipcode_7', 'zipcode_8', 'zipcode_9', 'zipcode_E', 'zipcode_K', 'zipcode_L', 'zipcode_M', 'zipcode_N', 'zipcode_R', 'zipcode_T', 'zipcode_V', 'zipcode_Y', 'date', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45

Unnamed: 0,rating,age,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,40,41,42,43,44,45,46,47,48,49
0,3,0.636364,0,0,0,0,0,1,0,0,...,-0.382257,-0.288519,0.441967,-0.232965,-0.56895,-0.60156,-0.373678,-0.046179,-0.127178,-0.032442
1,5,0.242424,0,0,0,0,0,1,0,0,...,-0.382257,-0.288519,0.441967,-0.232965,-0.56895,-0.60156,-0.373678,-0.046179,-0.127178,-0.032442
2,4,0.530303,0,0,0,0,0,1,0,0,...,-0.382257,-0.288519,0.441967,-0.232965,-0.56895,-0.60156,-0.373678,-0.046179,-0.127178,-0.032442
3,4,0.80303,0,0,0,0,0,1,0,0,...,-0.382257,-0.288519,0.441967,-0.232965,-0.56895,-0.60156,-0.373678,-0.046179,-0.127178,-0.032442
4,3,0.363636,0,0,0,0,0,1,0,0,...,-0.382257,-0.288519,0.441967,-0.232965,-0.56895,-0.60156,-0.373678,-0.046179,-0.127178,-0.032442


In [312]:
preprocessed[['year', 'month', 'gender_M', 'date']].head()


Unnamed: 0,year,month,gender_M,date
0,0.0,1.0,True,0.022706
1,1.0,0.090909,True,0.022706
2,0.0,1.0,True,0.022706
3,1.0,0.181818,True,0.022706
4,0.0,0.818182,True,0.022706


# Test Random forest

In [351]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

X = preprocessed[preprocessed.duplicated() == False].drop('rating', axis=1)
y = preprocessed[preprocessed.duplicated() == False]['rating']

X.columns = X.columns.astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
print(f'Mean Absolute Error: {mae}')


Mean Absolute Error: 0.8094942685204449


In [352]:
# users_items.iloc[75721]

index = X_test.index
good = 0
users = set()
for i,p in zip(index, predictions):
    ui = users_items.iloc[i]
    u = ui['user_id']
    t = ui['title']
    r = ui['rating']
    users.add(u)

    if (r >= 3 and p >= 2.5 or r < 3 and p < 2.5): 
        good +=1
        print(f'User: {u}. Film: {t}. Rating: {r}. Prediction: {p}')

print(good/len(predictions))
print(f'Number of users: {len(users)}. Number of predictions: {len(predictions)}')


User: 87. Film: GoodFellas (1990). Rating: 4. Prediction: 3.72
User: 218. Film: Monty Python's Life of Brian (1979). Rating: 4. Prediction: 3.34
User: 474. Film: Farewell to Arms, A (1932). Rating: 4. Prediction: 4.11
User: 315. Film: Get Shorty (1995). Rating: 4. Prediction: 3.17
User: 605. Film: Lone Star (1996). Rating: 3. Prediction: 4.345
User: 641. Film: To Kill a Mockingbird (1962). Rating: 4. Prediction: 4.34
User: 284. Film: Boogie Nights (1997). Rating: 4. Prediction: 3.83
User: 782. Film: Boogie Nights (1997). Rating: 3. Prediction: 3.74
User: 532. Film: Black Sheep (1996). Rating: 3. Prediction: 2.58
User: 437. Film: Pump Up the Volume (1990). Rating: 4. Prediction: 2.97
User: 487. Film: Hunt for Red October, The (1990). Rating: 5. Prediction: 4.04
User: 289. Film: Time to Kill, A (1996). Rating: 3. Prediction: 3.4666666666666663
User: 87. Film: Singin' in the Rain (1952). Rating: 4. Prediction: 3.72
User: 70. Film: Fugitive, The (1993). Rating: 4. Prediction: 4.19
User: 42