### Install Required Libraries

In [287]:
#!pip install pandas numpy scikit-learn lightfm tqdm

### Import Libraries

In [288]:
import pandas as pd
import numpy as np
import os
import re
import csv
import ast
import pickle
from sklearn.feature_extraction.text import CountVectorizer
pd.set_option('display.max_columns', None)

### Helper

In [289]:
def create_folder(path):
    """Creates a path within the working directory
    Input: path (str) -> specify the working directory
    Output -> path will be created if not existed
    """
    try: 
        os.mkdir(path) 
    except OSError as error: 
        print(error)  

In [290]:
def parse_genre(data):
    
    """parsing genre from list to string
    Input:  data -> dataframe (the original dataframe)
    Output: data -> dataframe (after the parsing, which contains genres as string rather than lists)"""
    
    # get the total rows
    rows = len(data)
    
    # create a list for genre
    genre_list = []
        
    for row in range(rows):
        to_dict = ast.literal_eval(data['genres'][row])
        
        dict_len = len(to_dict)
        sub_list = []
        if dict_len > 0:
            for i in range(dict_len):
                target = to_dict[i]
                genre = target['name']
                sub_list.append(genre)
        else:
            genre = 'genre_not_specified'
            sub_list.append(genre)
        genre_list.append(sub_list)
    data.loc[:, 'parsed_genre'] = genre_list
    
    return data

- assets

In [291]:
asset_path = '../assets'
data_path = '../data'

In [292]:
create_folder(asset_path)
create_folder(data_path)

[Errno 17] File exists: '../assets'
[Errno 17] File exists: '../data'


### Import data

In [293]:
movie_path = '../data/movies_metadata.csv'
rating_path = '../data/ratings.csv'

In [294]:
movie = pd.read_csv(movie_path)
rating = pd.read_csv(rating_path)

  movie = pd.read_csv(movie_path)


### Quick check

In [295]:
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [296]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 794.2 MB


- check unique values

In [297]:
print(f"In the movie dataset, there are {movie.id.nunique()} unique movies.")

In the movie dataset, there are 45436 unique movies.


In [298]:
print(f"In the rating dataset, there are {rating.userId.nunique()} unique users.")
print(f"In the rating dataset, {rating.movieId.nunique()} movies were rated by various users.")

In the rating dataset, there are 270896 unique users.
In the rating dataset, 45115 movies were rated by various users.


### Data Transformation

#### Movie

Select dataframes so that we can create item-user info.

In [299]:
movie.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

- selecting features

In [300]:
new_movie = movie[['adult', 'budget', 'genres', 'id', 'original_title', 'overview', 'popularity', 'revenue', 'runtime']]

In [301]:
new_movie.head()

Unnamed: 0,adult,budget,genres,id,original_title,overview,popularity,revenue,runtime
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,373554033.0,81.0
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,262797249.0,104.0
2,False,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,0.0,101.0
3,False,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,81452156.0,127.0
4,False,0,"[{'id': 35, 'name': 'Comedy'}]",11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,76578911.0,106.0


re selecting columns based on our needs

- dropping rows

In [302]:
new_movie.adult.value_counts()

False                                                                                                                             45454
True                                                                                                                                  9
 - Written by Ørnås                                                                                                                   1
 Rune Balot goes to a casino connected to the October corporation to try to wrap up her case once and for all.                        1
 Avalanche Sharks tells the story of a bikini contest that turns into a horrifying affair when it is hit by a shark avalanche.        1
Name: adult, dtype: int64

In [303]:
new_movie = new_movie[new_movie.adult.apply(lambda row: row in ['True', 'False'])]

- dedup

only select unique movies

In [304]:
new_movie.drop_duplicates(subset = ['id'], inplace = True)

In [305]:
new_movie = new_movie.reset_index().drop(columns = 'index')

- parsing genre

In [306]:
parsed_movie = parse_genre(new_movie)

In [307]:
parsed_movie.loc[:, 'genre_unlisted'] = parsed_movie.parsed_genre.apply(lambda genre: ", ".join(genre))

In [308]:
parsed_movie.head()

Unnamed: 0,adult,budget,genres,id,original_title,overview,popularity,revenue,runtime,parsed_genre,genre_unlisted
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,373554033.0,81.0,"[Animation, Comedy, Family]","Animation, Comedy, Family"
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,262797249.0,104.0,"[Adventure, Fantasy, Family]","Adventure, Fantasy, Family"
2,False,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,0.0,101.0,"[Romance, Comedy]","Romance, Comedy"
3,False,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,81452156.0,127.0,"[Comedy, Drama, Romance]","Comedy, Drama, Romance"
4,False,0,"[{'id': 35, 'name': 'Comedy'}]",11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,76578911.0,106.0,[Comedy],Comedy


- one hot coding for genre

In [309]:
target = list(parsed_movie.genre_unlisted)

In [310]:
cv = CountVectorizer()
genre_to_cv = cv.fit_transform(target)

In [311]:
genre_df = pd.DataFrame(genre_to_cv.toarray(), columns = cv.get_feature_names_out())

In [312]:
genre_df.loc[:, 'id'] = list(parsed_movie.id)

#### Rating

- filter rows

### Join the dataset

join the datasets together to get user-item information

convert movieId as string to join together

In [313]:
rating.loc[:, 'movieId'] = rating.movieId.apply(lambda ID: str(ID))

genre one hot code and movie info

In [314]:
new_genre = pd.merge(left = new_movie[['adult', 'budget', 'genres', 'id', 'original_title', 'overview','popularity', 'revenue', 'runtime']],
                     right = genre_df,
                     left_on = ['id'],
                     right_on = ['id'])

In [315]:
joined_df = pd.merge(left = new_genre,
         right = rating[['userId', 'movieId', 'rating']],
         left_on = ['id'],
         right_on = ['movieId'],
         how = 'left')

- check

In [316]:
joined_df.id.nunique() == new_movie.id.nunique()

True

number of unique id should match when joining the datsets

### Export the dataset

In [317]:
movie_pkl = '../assets/movie.pkl'
rating_pkl = '../assets/rating.pkl'
joined_df_pkl = '../assets/df.pkl'

In [318]:
with open(movie_pkl, 'wb') as m:
    pickle.dump(new_genre, m)

In [319]:
with open(rating_pkl, 'wb') as r:
    pickle.dump(rating, r)

In [320]:
with open(joined_df_pkl, 'wb') as r:
    pickle.dump(joined_df, r)