# Exploratory Data Analysis (EDA)

## Import Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### MovieLens Dataset

#### Movies

In [17]:
movie_df = pd.read_csv("../../data-raw/movielens-rating/movies-9k.csv")

In [18]:
movie_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


#### Ratings

In [19]:
rating_df = pd.read_csv("../../data-raw/movielens-rating/ratings-100k.csv")

In [20]:
rating_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


#### Split Genres Into Multiple Rows

In [21]:
movie_df = movie_df.assign(genres=movie_df['genres'].str.split('|'))

In [22]:
movie_df = movie_df.explode('genres', ignore_index=True)

#### Merge Movies DataFrame With Ratings DataFrame

In [70]:
movie_rating_df = pd.merge(movie_df, rating_df, on='movieId')

#### Split Title Into Title And Year

In [71]:
movie_rating_df[['title', 'year']] = movie_rating_df.title.str.extract("^(?P<title>.*) (?P<year>\(\d*\))$", expand=True)

#### Remove Brackets From Year

In [72]:
movie_rating_df['year'] = movie_rating_df['year'].str.replace('[\(\)]', '', regex=True)

#### Drop Any NaN Row

In [74]:
movie_rating_df[movie_rating_df['year'].isnull()]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,year
208207,27008,,Comedy,599,1.0,1519333508,
208208,27008,,Crime,599,1.0,1519333508,
208209,27008,,Horror,599,1.0,1519333508,
216811,40697,,Sci-Fi,210,4.0,1527266191,
216812,40697,,Sci-Fi,528,0.5,1391736467,
...,...,...,...,...,...,...,...
273518,171495,,(no genres listed),599,4.0,1519116912,
273519,171631,,(no genres listed),111,1.0,1517440909,
273522,171749,,(no genres listed),105,5.0,1526207365,
273569,171891,,(no genres listed),111,3.5,1517440199,


In [75]:
movie_rating_df = movie_rating_df.dropna()

#### Convert Year Data Type As Integer

In [67]:
movie_rating_df['year'] = movie_rating_df['year'].astype('int32')

#### Feature Data Type

In [76]:
movie_rating_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 274412 entries, 0 to 274479
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    274412 non-null  int64  
 1   title      274412 non-null  object 
 2   genres     274412 non-null  object 
 3   userId     274412 non-null  int64  
 4   rating     274412 non-null  float64
 5   timestamp  274412 non-null  int64  
 6   year       274412 non-null  object 
dtypes: float64(1), int64(3), object(3)
memory usage: 16.7+ MB


#### Central Tendency for Movie Rating

In [77]:
movie_rating_df['rating'].describe().apply("{0:.5f}".format)

count    274412.00000
mean          3.51785
std           1.03463
min           0.50000
25%           3.00000
50%           3.50000
75%           4.00000
max           5.00000
Name: rating, dtype: object

#### Total Movies

In [78]:
len(movie_rating_df['title'].unique())

9422

#### Total User Who Gave Ratings

In [79]:
len(movie_rating_df['userId'].unique())

610