# Requirements

In [198]:
%%capture
%pip install numpy -q
%pip install pandas -q
%pip install matplotlib -q
%pip install networkx -q
%pip install torch -q
%pip install torch_geometric -q
%pip install tqdm -q
%pip install scipy -q
%pip install scikit-learn -q
%pip install gensim -q
%pip install nltk -q

In [199]:
# Standard library imports
import random
import time

# Third-party imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
import torch_geometric
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import degree

from tqdm.notebook import tqdm
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

# Data exploration

### Unzip dataset

In [200]:
import zipfile

# Location of the raw archive and the folder it is unpacked into
file = 'data/raw/ml-100k.zip'
target_dir = 'data/interim/'

# The context manager closes the archive once every member has been extracted
with zipfile.ZipFile(file) as archive:
    archive.extractall(path=target_dir)

print(f"Successfully extracted to {target_dir}")

Successfully extracted to data/interim/


### Load dataset

In [201]:
import os
import pandas as pd

def load_data(data_file, data_dir='data/interim/ml-100k/'):
    """Load the users, items and ratings tables of the MovieLens-100k dataset.

    Args:
        data_file: Name of the tab-separated ratings file inside ``data_dir``
            (for example ``'u.data'``).
        data_dir: Folder containing ``u.user``, ``u.item``, ``u.genre`` and the
            ratings file. Defaults to the folder the notebook extracts into.

    Returns:
        A tuple ``(users, items, data)`` of pandas DataFrames.
    """
    user_file = 'u.user'
    item_file = 'u.item'
    genre_file = 'u.genre'

    # column names
    user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
    data_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

    # zip_code is read as text explicitly: leading zeros must survive and later
    # cells index into the code as a string.
    users = pd.read_csv(
        os.path.join(data_dir, user_file),
        sep='|',
        names=user_cols,
        dtype={'zip_code': str},
    )
    data = pd.read_csv(os.path.join(data_dir, data_file), sep='\t', names=data_cols)

    # Each u.genre line is "<genre name>|<genre id>", so the name is the first field.
    genre = pd.read_csv(
        os.path.join(data_dir, genre_file),
        sep='|',
        header=None,
        names=['genre', 'genre_id'],
    )
    genre_names = genre['genre'].tolist()

    # The item file has five descriptive fields followed by one flag column per genre.
    item_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url', *genre_names]
    items = pd.read_csv(
        os.path.join(data_dir, item_file),
        sep='|',
        names=item_cols,
        encoding='latin-1',
    )

    return users, items, data

# Load the full ratings file together with the user and item tables
users_df, items_df, ratings_df = load_data('u.data')

# Print the first few rows of each DataFrame to verify the data loading
print("Users DataFrame:")
print(users_df.head())

print("\nRatings DataFrame:")
# Last expression of the cell, so the notebook renders it as the cell output
ratings_df.head()

Users DataFrame:
   user_id  age gender  occupation zip_code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213

Ratings DataFrame:


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [202]:
print("\nItems DataFrame:")
items_df.head()


Items DataFrame:


Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%20(1995),0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(1995),0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995),0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Check null values

In [203]:
items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_id            1682 non-null   int64  
 1   title               1682 non-null   object 
 2   release_date        1681 non-null   object 
 3   video_release_date  0 non-null      float64
 4   imdb_url            1679 non-null   object 
 5   unknown             1682 non-null   int64  
 6   Action              1682 non-null   int64  
 7   Adventure           1682 non-null   int64  
 8   Animation           1682 non-null   int64  
 9   Children's          1682 non-null   int64  
 10  Comedy              1682 non-null   int64  
 11  Crime               1682 non-null   int64  
 12  Documentary         1682 non-null   int64  
 13  Drama               1682 non-null   int64  
 14  Fantasy             1682 non-null   int64  
 15  Film-Noir           1682 non-null   int64  
 16  Horror

In [204]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     943 non-null    int64 
 1   age         943 non-null    int64 
 2   gender      943 non-null    object
 3   occupation  943 non-null    object
 4   zip_code    943 non-null    object
dtypes: int64(2), object(3)
memory usage: 37.0+ KB


In [205]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   movie_id   100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


#### Check duplicates in data

In [206]:
ratings_df[['user_id', 'movie_id']].duplicated().sum()

0

#### Check zip codes variety

In [207]:
sorted(users_df['zip_code'].unique())

['00000',
 '01002',
 '01040',
 '01080',
 '01331',
 '01375',
 '01581',
 '01602',
 '01701',
 '01720',
 '01754',
 '01810',
 '01824',
 '01913',
 '01915',
 '01940',
 '01945',
 '01960',
 '01970',
 '02110',
 '02113',
 '02125',
 '02136',
 '02138',
 '02139',
 '02140',
 '02143',
 '02146',
 '02154',
 '02159',
 '02176',
 '02215',
 '02320',
 '02324',
 '02341',
 '02859',
 '02903',
 '02918',
 '03052',
 '03060',
 '03062',
 '03261',
 '03755',
 '03869',
 '04102',
 '04988',
 '05001',
 '05146',
 '05201',
 '05452',
 '05464',
 '05779',
 '06059',
 '06260',
 '06333',
 '06355',
 '06365',
 '06371',
 '06405',
 '06437',
 '06472',
 '06492',
 '06512',
 '06513',
 '06518',
 '06779',
 '06811',
 '06906',
 '06927',
 '07029',
 '07030',
 '07039',
 '07102',
 '07204',
 '07310',
 '07733',
 '08034',
 '08043',
 '08052',
 '08105',
 '08360',
 '08403',
 '08534',
 '08610',
 '08816',
 '08832',
 '08904',
 '09645',
 '10003',
 '10010',
 '10011',
 '10016',
 '10018',
 '10019',
 '10021',
 '10022',
 '10025',
 '10309',
 '10314',
 '10522',


### Preparation after exploration

### Keep only the first character of zip_code, which identifies the region

In [208]:
# Keep only the leading character of every zip code; the vectorized .str accessor
# replaces the per-row Python lambda.
users_df['zip_code'] = users_df['zip_code'].str[0]

#### Extract year and month from timestamp

In [209]:
# The raw timestamp is in seconds since the Unix epoch
rated_at = pd.to_datetime(ratings_df['timestamp'], unit='s')

# Add the calendar components as new columns, then discard the raw timestamp
ratings_df = (
    ratings_df
    .assign(
        year=rated_at.dt.year,
        month=rated_at.dt.month,
        day=rated_at.dt.day,
    )
    .drop(columns=['timestamp'])
)

#### Extract number of days from release_date

In [210]:
# Parse release dates such as "01-Jan-1995"
release_dates = pd.to_datetime(items_df['release_date'], format='%d-%b-%Y')

# Age of each movie in days, measured against the most recent release in the dataset
days_since_release = (release_dates.max() - release_dates).dt.days

## Fill missing values with mean
items_df = items_df.assign(
    release_date=release_dates,
    date=days_since_release,
    days_ago=days_since_release.fillna(days_since_release.mean()),
)

#### Drop video_release_date, imdb_url

In [211]:
items_df = items_df.drop(columns=['video_release_date', 'imdb_url', 'release_date', 'date'])

### Preprocess title - remove punctuation, parenthesized text (e.g. the release year) and words shorter than 3 characters


In [212]:
import re
def preprocess_title(title):
    """Normalize a movie title to lowercase words of three or more characters.

    Parenthesized parts (such as the release year) are removed first. If that
    leaves nothing, the original title is used instead. If the remaining text
    contains no word of at least three characters, the lowercased, stripped
    text is returned so that every result has the same casing and no
    surrounding whitespace.

    Args:
        title: Raw title string.

    Returns:
        The cleaned title string.
    """
    without_parens = re.sub(r'\([^)]*\)', '', title).strip()
    if not without_parens:
        # The whole title was parenthesized; fall back to the raw text
        without_parens = title.strip()

    words = re.findall(r'\b\w{3,}\b', without_parens.lower())
    if words:
        return " ".join(words)

    # Only short words remain (e.g. "M"): keep them, normalized like every other title
    return without_parens.lower()
    

In [213]:
items_df['title'] = items_df['title'].apply(preprocess_title)

## After small preprocessing

In [214]:
print(items_df.columns)
items_df.head()

Index(['movie_id', 'title', 'unknown', 'Action', 'Adventure', 'Animation',
       'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Thriller', 'War', 'Western', 'days_ago'],
      dtype='object')


Unnamed: 0,movie_id,title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,days_ago
0,1,toy story,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1391.0
1,2,goldeneye,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1391.0
2,3,four rooms,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1391.0
3,4,get shorty,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1391.0
4,5,copycat,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1391.0


In [215]:
users_df.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,8
1,2,53,F,other,9
2,3,23,M,writer,3
3,4,24,M,technician,4
4,5,33,F,other,1


In [216]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,year,month,day
0,196,242,3,1997,12,4
1,186,302,3,1998,4,4
2,22,377,1,1997,11,7
3,244,51,2,1997,11,27
4,166,346,1,1998,2,2


# Visualization of the data distributions

# Conclusions

- All video release dates are null
- IMDb URLs are not useful as features: nobody decides whether they like a movie just by looking at its URL
- The dataset is imbalanced towards good ratings (4-5)

# Strong preprocessing - scaling, encoding and tokenization

### Tokenize titles, embed them with Word2Vec and expand each averaged vector into columns

In [217]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Split every cleaned title into word tokens; the token lists are the Word2Vec training corpus
tokenized_titles = [word_tokenize(title) for title in items_df['title']]

# A fixed seed and a single worker thread keep the learned vectors stable between runs.
# NOTE(review): gensim documents that full determinism across interpreter launches
# also needs PYTHONHASHSEED to be set - confirm if exact reproducibility matters.
vectors = Word2Vec(
    sentences=tokenized_titles,
    vector_size=50,
    window=5,
    min_count=1,  # keep every word so each title token has a vector
    seed=123,
    workers=1,
).wv

def tokenize(title):
    """Return the mean Word2Vec vector of the words in ``title``.

    The title is split with ``word_tokenize``, every token is looked up in the
    module-level ``vectors``, and the token vectors are averaged along axis 0.
    """
    tokens = word_tokenize(title)
    token_vectors = vectors[tokens]
    return token_vectors.mean(axis=0)

[nltk_data] Downloading package punkt to /home/anaconda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [218]:
# One averaged embedding vector per movie title
title_embeddings = items_df['title'].apply(tokenize)

# Spread each vector over its own columns (labelled 0, 1, 2, ...)
embedding_columns = title_embeddings.apply(pd.Series)

# Replace the text column with the numeric embedding columns
items_df = pd.concat([items_df.drop(columns=['title']), embedding_columns], axis=1)

### Scaling

In [220]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Each listed column is rescaled on its own, so the scaler is re-fitted per column.
# Rating - rating, year, month, day
# Users - age
# Movie - released days ago
columns_to_scale = (
    (ratings_df, ('rating', 'year', 'month', 'day')),
    (users_df, ('age',)),
    (items_df, ('days_ago',)),
)

for frame, columns in columns_to_scale:
    for column in columns:
        frame[column] = scaler.fit_transform(frame[[column]])

### One-hot Encoding

In [221]:
# User - occupation, gender, zipcode
# Encode one column at a time so the dummy columns are appended in this order;
# drop_first removes one level per feature.
categorical_columns = (
    ('occupation', 'ocp'),
    ('gender', 'gender'),
    ('zip_code', 'zipcode'),
)

for column, prefix in categorical_columns:
    users_df = pd.get_dummies(users_df, columns=[column], prefix=prefix, drop_first=True)

# Result after all the preprocessing

In [191]:
# Rating distribution
ratings_df['rating'].value_counts().sort_values()

rating
0.00     6110
0.25    11370
1.00    21201
0.50    27145
0.75    34174
Name: count, dtype: int64

In [187]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,year,month,day
0,196,242,0.5,0.0,1.0,0.1
1,186,302,0.5,1.0,0.272727,0.1
2,22,377,0.0,0.0,0.909091,0.2
3,244,51,0.25,0.0,0.909091,0.866667
4,166,346,0.0,1.0,0.090909,0.033333


In [222]:
users_df.head()

Unnamed: 0,user_id,age,ocp_artist,ocp_doctor,ocp_educator,ocp_engineer,ocp_entertainment,ocp_executive,ocp_healthcare,ocp_homemaker,...,zipcode_9,zipcode_E,zipcode_K,zipcode_L,zipcode_M,zipcode_N,zipcode_R,zipcode_T,zipcode_V,zipcode_Y
0,1,0.257576,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,0.69697,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,3,0.242424,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,4,0.257576,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,5,0.393939,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Merge into one dataset

In [244]:
# Attach the user features to every rating ...
users = ratings_df.merge(users_df, on='user_id')
# ... then attach the features of the rated movie
users_items = users.merge(items_df, on='movie_id')

In [245]:
print(list(users_items.columns))
users_items.head()

['user_id', 'movie_id', 'rating', 'year', 'month', 'day', 'age', 'ocp_artist', 'ocp_doctor', 'ocp_educator', 'ocp_engineer', 'ocp_entertainment', 'ocp_executive', 'ocp_healthcare', 'ocp_homemaker', 'ocp_lawyer', 'ocp_librarian', 'ocp_marketing', 'ocp_none', 'ocp_other', 'ocp_programmer', 'ocp_retired', 'ocp_salesman', 'ocp_scientist', 'ocp_student', 'ocp_technician', 'ocp_writer', 'gender_M', 'zipcode_1', 'zipcode_2', 'zipcode_3', 'zipcode_4', 'zipcode_5', 'zipcode_6', 'zipcode_7', 'zipcode_8', 'zipcode_9', 'zipcode_E', 'zipcode_K', 'zipcode_L', 'zipcode_M', 'zipcode_N', 'zipcode_R', 'zipcode_T', 'zipcode_V', 'zipcode_Y', 'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'days_ago', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 3

Unnamed: 0,user_id,movie_id,rating,year,month,day,age,ocp_artist,ocp_doctor,ocp_educator,...,40,41,42,43,44,45,46,47,48,49
0,196,242,0.5,0.0,1.0,0.1,0.636364,False,False,False,...,-0.011781,-0.001448,0.012002,-0.0089,-0.010015,-0.000837,-0.012694,0.016361,0.003422,0.01643
1,305,242,1.0,1.0,0.090909,0.0,0.242424,False,False,False,...,-0.011781,-0.001448,0.012002,-0.0089,-0.010015,-0.000837,-0.012694,0.016361,0.003422,0.01643
2,6,242,0.75,0.0,1.0,0.9,0.530303,False,False,False,...,-0.011781,-0.001448,0.012002,-0.0089,-0.010015,-0.000837,-0.012694,0.016361,0.003422,0.01643
3,234,242,0.75,1.0,0.181818,0.866667,0.80303,False,False,False,...,-0.011781,-0.001448,0.012002,-0.0089,-0.010015,-0.000837,-0.012694,0.016361,0.003422,0.01643
4,63,242,0.5,0.0,0.818182,0.0,0.363636,False,False,False,...,-0.011781,-0.001448,0.012002,-0.0089,-0.010015,-0.000837,-0.012694,0.016361,0.003422,0.01643


# Test Random forest

In [249]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# The identifiers are dropped so the model only sees user, movie and rating-date features
dataset = users_items.drop(columns=['user_id', 'movie_id'])

# Remove exact duplicate rows once (first occurrence kept, index labels preserved)
dataset_unique = dataset.drop_duplicates()
X = dataset_unique.drop(columns='rating')
y = dataset_unique['rating']

# The embedding columns have integer labels; scikit-learn expects uniform string names
X.columns = X.columns.astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# A fixed random_state makes the reported metrics reproducible; n_jobs=-1 trains trees in parallel
model = RandomForestRegressor(random_state=123, n_jobs=-1)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
rmse = mean_squared_error(y_true=y_test, y_pred=predictions)**0.5

print(f'RMSE: {rmse:.4f}. MAE: {mae:.4f}')

Mean Absolute Error: 0.19938093268886503


In [254]:
index = X_test.index
PREVIEW_ROWS = 20  # number of individual predictions printed below

# X_test.index holds index *labels* inherited from users_items, so rows are
# selected with label-based .loc (in one vectorized lookup) rather than .iloc.
test_rows = users_items.loc[index, ['user_id', 'movie_id', 'rating']]
users = set(test_rows['user_id'])

# A prediction is "good" when it lies on the same side of 0.5 as the scaled rating
same_side = (test_rows['rating'].to_numpy() > 0.5) == (predictions > 0.5)
good = int(same_side.sum())

# Print a short preview only, instead of one line per test row
preview = test_rows.head(PREVIEW_ROWS).itertuples(index=False, name=None)
for (u, m, r), p in zip(preview, predictions):
    print(f'User: {u}. Film: {m}. Rating: {r}. Prediction: {p}')

print(good/len(predictions))
print(f'Number of users: {len(users)}. Number of predictions: {len(predictions)}')


User: 62. Film: 155. Rating: 0.0. Prediction: 0.47
User: 347. Film: 323. Rating: 0.0. Prediction: 0.34
User: 654. Film: 95. Rating: 0.75. Prediction: 0.52
User: 548. Film: 1013. Rating: 0.5. Prediction: 0.01
User: 542. Film: 23. Rating: 1.0. Prediction: 0.77
User: 431. Film: 988. Rating: 0.25. Prediction: 0.06
User: 535. Film: 8. Rating: 0.75. Prediction: 0.82
User: 10. Film: 203. Rating: 0.75. Prediction: 0.82
User: 389. Film: 174. Rating: 0.75. Prediction: 0.61
User: 92. Film: 651. Rating: 0.75. Prediction: 0.78
User: 44. Film: 185. Rating: 0.75. Prediction: 0.8
User: 773. Film: 531. Rating: 1.0. Prediction: 0.9
User: 552. Film: 248. Rating: 0.75. Prediction: 0.36
User: 181. Film: 546. Rating: 0.25. Prediction: 0.32
User: 236. Film: 133. Rating: 1.0. Prediction: 0.49
User: 887. Film: 432. Rating: 1.0. Prediction: 0.65
User: 805. Film: 153. Rating: 0.75. Prediction: 0.78
User: 49. Film: 1009. Rating: 0.5. Prediction: 0.37
User: 842. Film: 749. Rating: 0.75. Prediction: 0.24
User: 346.