## Setup

In [1]:
# Imports
import numpy as np
import pandas as pd
import collections
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import altair as alt
import sklearn
import sklearn.manifold
import tensorflow as tf
from tensorflow.keras import layers, models
from IPython import display

# Notebook-wide display configuration:
# - show at most 10 rows when a DataFrame is rendered
# - render floats with three decimal places
# - lift Altair's row limit on the default data transformer
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.3f}".format)
alt.data_transformers.enable("default", max_rows=None)


def mask(df, col, function):
  """Return the rows of ``df`` selected by ``function(df[col])``.

  Args:
    df: DataFrame to filter.
    col: Label of the column handed to ``function``.
    function: Callable that receives ``df[col]`` and returns an indexer
      (typically a boolean Series) accepted by ``df[...]``.

  Returns:
    The result of indexing ``df`` with the value produced by ``function``.
  """
  selector = function(df[col])
  return df[selector]


def flatten_cols(df):
  """Flatten multi-level column labels of ``df`` into single strings, in place.

  Each tuple label (as produced by a MultiIndex) becomes its levels joined by
  a single space, with surrounding whitespace stripped, e.g.
  ``("rating", "mean") -> "rating mean"`` and ``("user_id", "") -> "user_id"``.
  Levels are converted with ``str`` first so numeric levels do not raise.
  Labels that are not tuples (already-flat columns) are left untouched;
  joining them would split a string label into space-separated characters.

  Args:
    df: DataFrame whose ``columns`` attribute is rewritten.

  Returns:
    The same ``df`` object, to allow method chaining.
  """
  def _flatten(label):
    # Only tuple labels are multi-level; anything else is already flat.
    if isinstance(label, tuple):
      return " ".join(str(level) for level in label).strip()
    return label

  df.columns = [_flatten(label) for label in df.columns.values]
  return df


# Attach the two helpers defined above as DataFrame attributes so they can be
# called as methods, e.g. df.mask(col, function) and df.flatten_cols().
# NOTE(review): pandas already provides a built-in DataFrame.mask (conditional
# value replacement); this assignment replaces it for the whole session —
# confirm nothing downstream relies on the original method.
pd.DataFrame.mask = mask
pd.DataFrame.flatten_cols = flatten_cols

# Module-level flag, set to False here. Presumably read by later cells to decide
# whether to run the recommender for a custom user's ratings — TODO confirm.
USER_RATINGS = False

In [2]:
# Load the three MovieLens 100K tables. None of the files is read with a header
# row, so column names are supplied explicitly for each one.

# Users: pipe-separated.
users_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pd.read_csv(
    "movielens_datasets/movielens_100K_ratings/u.user",
    names=users_cols,
    sep="|",
    encoding="latin-1",
)

# Ratings: tab-separated.
ratings_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv(
    "movielens_datasets/movielens_100K_ratings/u.data",
    names=ratings_cols,
    sep="\t",
    encoding="latin-1",
)

# Movies: pipe-separated; five descriptive columns followed by one column per genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children",
    "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
    "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
    "Sci-Fi", "Thriller", "War", "Western",
]
movies_cols = [
    "movie_id",
    "title",
    "release_date",
    "video_release_date",
    "imdb_url",
    *genre_cols,
]
movies = pd.read_csv(
    "movielens_datasets/movielens_100K_ratings/u.item",
    names=movies_cols,
    sep="|",
    encoding="latin-1",
)

In [3]:
# Show data type of columns for each dataframe.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
print("Users Dataset Info:")
users.info()
print("Ratings Dataset Info:")
ratings.info()

Users Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     943 non-null    int64 
 1   age         943 non-null    int64 
 2   gender      943 non-null    object
 3   occupation  943 non-null    object
 4   zip_code    943 non-null    object
dtypes: int64(2), object(3)
memory usage: 37.0+ KB
None
Ratings Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   movie_id   100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB
None


In [4]:
# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print() (that would add a stray "None" line to the output).
print("Movies Dataset Info:")
movies.info()

Movies Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_id            1682 non-null   int64  
 1   title               1682 non-null   object 
 2   release_date        1681 non-null   object 
 3   video_release_date  0 non-null      float64
 4   imdb_url            1679 non-null   object 
 5   genre_unknown       1682 non-null   int64  
 6   Action              1682 non-null   int64  
 7   Adventure           1682 non-null   int64  
 8   Animation           1682 non-null   int64  
 9   Children            1682 non-null   int64  
 10  Comedy              1682 non-null   int64  
 11  Crime               1682 non-null   int64  
 12  Documentary         1682 non-null   int64  
 13  Drama               1682 non-null   int64  
 14  Fantasy             1682 non-null   int64  
 15  Film-Noir           1682 non-null 

In [5]:
# Data Preprocessing
def id_shift(x):
  """Turn a 1-based numeric id into its 0-based value, rendered as a string."""
  return str(x - 1)


# Re-base every id column to start at 0 and store the ids as strings
users["user_id"] = users["user_id"].map(id_shift)
ratings["movie_id"] = ratings["movie_id"].map(id_shift)
ratings["user_id"] = ratings["user_id"].map(id_shift)
movies["movie_id"] = movies["movie_id"].map(id_shift)

# Ratings are handled as floating-point values from here on
ratings["rating"] = ratings["rating"].astype(float)

# Year column: the text after the last "-" of the stringified release date
movies["year"] = movies["release_date"].map(
    lambda release_date: str(release_date).split("-")[-1]
)

In [9]:
# Column-wise sum of the genre indicator columns, as a {genre: count} dict
genre_count = movies.loc[:, genre_cols].sum(axis=0).to_dict()
print("Genre occurences for movies dataframe:", genre_count)


# Some movies have multiple genres, so two summary columns are added to the
# movies dataframe:
# 'all_genres' is a combination of the movie's multiple genres
# 'genre' is a randomly selected genre from the movie's multiple genres
def mark_genres(movies, genres, rng=None):
  """Add 'genre' and 'all_genres' columns to ``movies`` in place.

  For every row, the active genres are those columns in ``genres`` whose value
  equals 1. 'all_genres' is the active genre names joined with "-", and
  'genre' is one active genre picked at random. Rows with no active genre get
  "Other" in both columns.

  Args:
    movies: DataFrame containing one indicator column per entry of ``genres``.
      It is modified in place.
    genres: Sequence of indicator column names, in the order used for joining.
    rng: Optional random source exposing ``choice`` (e.g. the result of
      ``np.random.default_rng(seed)``) for reproducible picks. When None, the
      global ``np.random`` state is used.

  Returns:
    None.
  """
  chooser = np.random if rng is None else rng

  picked = []
  combined = []
  # Walk the indicator columns row by row; active genres are computed once per
  # row and shared by both output columns.
  for flags in zip(*[movies[genre] for genre in genres]):
    active = [genre for genre, flag in zip(genres, flags) if flag == 1]
    if active:
      picked.append(chooser.choice(active))
      combined.append("-".join(active))
    else:
      picked.append("Other")
      combined.append("Other")

  movies["genre"] = picked
  movies["all_genres"] = combined


# Populate the 'genre' and 'all_genres' columns, then preview the first rows
mark_genres(movies, genre_cols)
print(movies.loc[:, ["title", "genre", "all_genres"]].head())

Genre occurences for movies dataframe: {'genre_unknown': 2, 'Action': 251, 'Adventure': 135, 'Animation': 42, 'Children': 122, 'Comedy': 505, 'Crime': 109, 'Documentary': 50, 'Drama': 725, 'Fantasy': 22, 'Film-Noir': 24, 'Horror': 92, 'Musical': 56, 'Mystery': 61, 'Romance': 247, 'Sci-Fi': 101, 'Thriller': 251, 'War': 71, 'Western': 27}
               title      genre                 all_genres
0   Toy Story (1995)  Animation  Animation-Children-Comedy
1   GoldenEye (1995)     Action  Action-Adventure-Thriller
2  Four Rooms (1995)   Thriller                   Thriller
3  Get Shorty (1995)     Action        Action-Comedy-Drama
4     Copycat (1995)   Thriller       Crime-Drama-Thriller


In [None]:
# Combine the three dataframes into a single one containing all information:
#   inner call - join ratings with movies on their common column movie_id
#   outer call - join that result with users on their common column user_id
movielens = pd.merge(
    pd.merge(ratings, movies, on="movie_id"),
    users,
    on="user_id",
)


# Dataframe split function for training and test sets
def dataframe_split(df, test_split=0.1, random_state=None):
    """Randomly partition ``df`` into disjoint train and test dataframes.

    Rows are selected by position rather than by index label, so the two
    parts always add up to ``len(df)`` even when ``df`` has duplicate index
    labels (label-based exclusion would drop every row sharing a label with
    a sampled row).

    Args:
        df: DataFrame to split.
        test_split: Fraction of rows placed in the test set.
        random_state: Optional seed or random state forwarded to pandas'
            ``sample`` for a reproducible split. None keeps the default
            unseeded behavior.

    Returns:
        A ``(train, test)`` tuple. ``test`` holds the sampled rows in sampled
        order; ``train`` holds the remaining rows in their original order.
    """
    # Sample row positions without replacement.
    positions = (
        pd.Series(np.arange(len(df)))
        .sample(frac=test_split, replace=False, random_state=random_state)
        .to_numpy()
    )
    in_test = np.zeros(len(df), dtype=bool)
    in_test[positions] = True

    test = df.iloc[positions]
    train = df.iloc[~in_test]
    return train, test