### Install Required Libraries

In [1]:
#!pip install pandas numpy scikit-learn lightfm tqdm matplotlib seaborn scipy

### Import Libraries

In [2]:
import pandas as pd
import numpy as np
import os
import re
import csv
import ast
import pickle
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
pd.set_option('display.max_columns', None)

### Helper

In [3]:
def create_folder(path):
    """Creates a path within the working directory
    Input: path (str) -> specify the working directory
    Output -> path will be created if not existed
    """
    try: 
        os.mkdir(path) 
    except OSError as error: 
        print(error)  

In [4]:
def extract_year(movie_year):
    """Using regular expression, we can extract the released years for each movie.
    Input: movie_year (str) -> Movie title with years
    Output: released_year (int) -> Released years from the given movie."""
    
    
    # create a pattern to extract movie year
    # if 4 digits within the parenthesis, then extract it
    pattern = re.findall(r'\(([0-9]{4})\)', movie_year)
    
    if pattern:
        global released_year
        released_year = int(pattern[0])
    else:
        pass
    
    return released_year

In [5]:
def extract_movie(val):
    """Using regular expression, we can extract the released years for each movie.
    Input:  val (str) -> movie title
    Output: final_title (str) -> The transformed movie title."""
    
    title = str(val)
    
    title_only = re.sub(r'\(([0-9]{4})\)', '', title)
    final_title = title_only.strip()
    
    return final_title

In [6]:
def change_timestamp(utc_timestamp):
    """Change the date format
    Input: utc_timestamp (int) -> timestamp before transformed
    Output: new_date (str) -> datetime information of reviews after transform"""
    
    date_format = "%Y-%m-%d %H:%M:%S"
    new_date = datetime.utcfromtimestamp(utc_timestamp).strftime(date_format)
    new_date = str(new_date)
    
    return new_date

- assets

In [7]:
asset_path = '../assets'
data_path = '../data'

In [8]:
create_folder(asset_path)
create_folder(data_path)

[Errno 17] File exists: '../assets'
[Errno 17] File exists: '../data'


### Import data

In [9]:
movie_path = '../data/movies.csv'
rating_path = '../data/ratings.csv'

In [10]:
movie = pd.read_csv(movie_path)
rating = pd.read_csv(rating_path)

### Quick check

In [11]:
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [12]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


- check unique values

In [13]:
print(f"In the movie dataset, there are {movie.movieId.nunique()} unique movies.")

In the movie dataset, there are 62423 unique movies.


In [14]:
print(f"In the rating dataset, there are {rating.userId.nunique()} unique users.")
print(f"In the rating dataset, {rating.movieId.nunique()} movies were rated by various users.")

In the rating dataset, there are 162541 unique users.
In the rating dataset, 59047 movies were rated by various users.


### Data Transformation

#### Movie

In [15]:
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [16]:
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [17]:
movie.columns

Index(['movieId', 'title', 'genres'], dtype='object')

- extract movie names

In [18]:
movie.loc[:, 'clean_movie_title'] = movie.title.apply(extract_movie)

- extract years

In [19]:
movie.loc[:, 'released_year'] = movie.title.apply(extract_year)

- genre

In [20]:
movie.loc[:, 'genre_counts'] = movie.genres.apply(lambda row: len(row.split('|')))

find the number of gernes for each movie

transform some values since we will work on one hot coding

In [21]:
movie.loc[:, 'new_genres'] = movie.genres.apply(lambda row: 'Missing' if 'no genres listed' in str(row) else row)

In [22]:
movie.loc[:, 'new_genres'] = movie.new_genres.apply(lambda row: row.replace('|', ','))

- grouping movie released years

In [23]:
movie.loc[:, 'groupped_released_year']= movie.released_year.apply(lambda year: str(year)[:3] + '0s')

- one hot coding

In [24]:
target = list(movie.new_genres)

In [25]:
cv = CountVectorizer()
genre_to_cv = cv.fit_transform(target)

In [26]:
genre_df = pd.DataFrame(genre_to_cv.toarray(), columns = cv.get_feature_names_out())

In [27]:
genre_df.loc[:, 'movieId'] = list(movie.movieId)

- combine the movie dataset together

In [28]:
final_movie = pd.merge(movie, genre_df, left_on = ['movieId'], right_on = ['movieId'])

#### Rating

In [29]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [30]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


- convert the timestamp

In [31]:
rating.loc[:, 'review_datetime'] = rating.timestamp.apply(change_timestamp)

In [32]:
rating.review_datetime = pd.to_datetime(rating.review_datetime)

convert it to the datetime object

In [33]:
rating.loc[:, 'review_year'] = rating.review_datetime.dt.year
rating.loc[:, 'review_quarter'] = rating.review_datetime.dt.quarter
rating.loc[:, 'review_month'] = rating.review_datetime.dt.month
rating.loc[:, 'review_day'] = rating.review_datetime.dt.day
rating.loc[:, 'review_dayname'] = rating.review_datetime.dt.day_name()

get date information from the datetime object

### Join the dataset

join the datasets together to get user-item information

convert movieId as string to join together

- convert the date type from int to str

In [34]:
final_movie.loc[:, 'movieId'] = final_movie.movieId.apply(lambda ID: str(ID))
rating.loc[:, 'movieId'] = rating.movieId.apply(lambda ID: str(ID))

- select features from both datasets

In [35]:
final_movie.columns

Index(['movieId', 'title', 'genres', 'clean_movie_title', 'released_year',
       'genre_counts', 'new_genres', 'groupped_released_year', 'action',
       'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary',
       'drama', 'fantasy', 'fi', 'film', 'horror', 'imax', 'missing',
       'musical', 'mystery', 'noir', 'romance', 'sci', 'thriller', 'war',
       'western'],
      dtype='object')

In [36]:
rating.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'review_datetime',
       'review_year', 'review_quarter', 'review_month', 'review_day',
       'review_dayname'],
      dtype='object')

In [37]:
movie_features = ['movieId', 'clean_movie_title', 'released_year', 'genre_counts', 'new_genres'\
                  ,'groupped_released_year', 'action', 'adventure', 'animation', 'children', 'comedy'\
                  ,'crime', 'documentary', 'drama', 'fantasy', 'fi', 'film', 'horror', 'imax'\
                  ,'missing', 'musical', 'mystery', 'noir', 'romance', 'sci', 'thriller', 'war', 'western']

In [38]:
rating_features = ['userId', 'movieId', 'rating', 'review_datetime', 'review_year', 'review_quarter'\
                   ,'review_month', 'review_day', 'review_dayname']

In [39]:
movie_selected = final_movie[movie_features]
rating_selected = rating[rating_features]

### Export the dataset

In [40]:
movie_pkl = '../assets/movie.pkl'
rating_pkl = '../assets/rating.pkl'

In [41]:
with open(movie_pkl, 'wb') as m:
    pickle.dump(movie_selected, m)

In [42]:
with open(rating_pkl, 'wb') as r:
    pickle.dump(rating_selected, r)