In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tests as t
import seaborn as sns

In [2]:
# Read in the datasets. Both files use '::' as the field separator; a separator
# longer than one character requires the python parsing engine.
movies = pd.read_csv('./data/original_movies.dat',
                     delimiter='::',
                     header=None,
                     names=['movie_id', 'movie', 'genre'],
                     dtype={'movie_id': object}, engine='python')

reviews = pd.read_csv('./data/original_ratings.dat',
                      delimiter='::',
                      header=None,
                      names=['user_id', 'movie_id', 'rating', 'timestamp'],
                      dtype={'movie_id': object, 'user_id': object, 'timestamp': object},
                      engine='python')

# Reduce the size of the reviews dataset. `.loc` slicing is label-based and
# inclusive on both ends, so this keeps labels 0 through 100000 (100001 rows).
# Take an explicit copy so that columns added to `reviews` later are written to
# an independent frame rather than to a slice of the full table.
reviews = reviews.loc[:100000, :].copy()

#### 1. Take a Look At The Data 

Take a look at the data and use your findings to fill in the dictionary below with the correct responses to show your understanding of the data.

In [3]:
# Number of review rows that remain once exact duplicates are removed
len(reviews.drop_duplicates())

100001

In [4]:
# Preview the first rows of the raw movies table
movies.head()

Unnamed: 0,movie_id,movie,genre
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
1,10,La sortie des usines Lumière (1895),Documentary|Short
2,12,The Arrival of a Train (1896),Documentary|Short
3,25,The Oxford and Cambridge University Boat Race ...,
4,91,Le manoir du diable (1896),Short|Horror


In [5]:
# Use this cell to find the following information:
# number of movies (distinct movie ids)
n_movies = len(movies['movie_id'].unique())
# number of ratings (review rows after dropping exact duplicates)
n_ratings = len(reviews.drop_duplicates())
# number of different genres: each `genre` value is a pipe-delimited combination
# such as 'Documentary|Short', so split it into individual labels before counting;
# counting the raw strings would count combinations instead of genres.
# Assumes `genre` still holds the raw strings read from the file.
n_genres = movies['genre'].str.split('|').explode().dropna().nunique()
# number of unique users
n_users = len(reviews['user_id'].unique())
# number of missing ratings
n_missing_rating = reviews['rating'].isna().sum()
# the `average`, `min`, and `max` ratings given
rating_avg = reviews['rating'].mean()
rating_min = reviews['rating'].min()
rating_max = reviews['rating'].max()

In [6]:
# Match each candidate value below to the statement it answers.
# The letters are the candidate values; several are distractors that no statement uses.
a = 8022
b = 10
c = 7
d = 35479
e = 15
f = 0
g = 4
h = 100001
i = 28

# Each key is a statement about the data and each value is the letter holding
# its answer, e.g. `d` (35479) is the number of movies.
# The key strings are passed to the checker as written, so their wording is left untouched.

dict_sol1 = {
'The number of movies in the dataset': d, 
'The number of ratings in the dataset': h,
'The number of different genres': i, 
'The number of unique users in the dataset': a, 
'The number missing ratings in the reviews dataset': f, 
'The average rating given across all ratings': c,
'The minimum rating given across all ratings': f,
'The maximum rating given across all ratings': b
}

# Validate the answers with the course-provided checker
t.q1_check(dict_sol1)

That looks good to me!


#### 2. Data Cleaning

Next, we need to pull some additional relevant information out of the existing columns. 

For each of the datasets, there are a couple of cleaning steps we need to take care of:

#### Movies
* Pull the date from the title and create new column
* Dummy the date column with 1's and 0's for each century of a movie (1800's, 1900's, and 2000's)
* Dummy the genre column with 1's and 0's for each genre

#### Reviews
* Create a date out of the timestamp

You can check your results against the header of my solution by running the cell below with the **show_clean_dataframes** function.

In [7]:
# Show the raw movies table again as a reference point for the cleaning steps
movies.head()

Unnamed: 0,movie_id,movie,genre
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
1,10,La sortie des usines Lumière (1895),Documentary|Short
2,12,The Arrival of a Train (1896),Documentary|Short
3,25,The Oxford and Cambridge University Boat Race ...,
4,91,Le manoir du diable (1896),Short|Horror


In [8]:
# Movies
# Pull the release year from the "(YYYY)" that ends each title. Anchoring the
# pattern at the end of the string means a title containing an earlier
# parenthesis is still parsed correctly. A title without a trailing four-digit
# year yields NaN and makes the integer cast raise, so bad rows are not silently kept.
movies['year'] = (
    movies['movie']
    .str.extract(r'\((\d{4})\)\s*$', expand=False)
    .astype(int)
)

# One 0/1 indicator column per century; `between` is inclusive on both ends.
movies['1800s'] = movies['year'].between(1800, 1899).astype(int)
movies['1900s'] = movies['year'].between(1900, 1999).astype(int)
movies['2000s'] = (movies['year'] >= 2000).astype(int)

In [9]:
# Turn each pipe-delimited genre string into a list of genre labels. Only values
# that are still strings are split, so re-running this cell leaves lists that
# were already split (and missing values) unchanged instead of turning them into NaN.
movies['genre'] = movies['genre'].apply(
    lambda value: value.split('|') if isinstance(value, str) else value
)

# Every distinct genre label that occurs in at least one movie
genres = movies['genre'].explode().dropna().unique()

# One 0/1 indicator column per genre. Plain list membership is used instead of
# building a numpy array for every row; rows with a missing genre get 0 everywhere.
for genre in genres:
    movies[genre] = movies['genre'].apply(
        lambda labels: int(isinstance(labels, list) and genre in labels)
    )
    

  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)


In [10]:
# Spot-check the year, century and genre indicator columns on the first two rows
movies.head(2)

Unnamed: 0,movie_id,movie,genre,year,1800s,1900s,2000s,Documentary,Short,Horror,...,Thriller,Animation,Music,Musical,Film-Noir,Adult,Talk-Show,News,Reality-TV,Game-Show
0,8,Edison Kinetoscopic Record of a Sneeze (1894),"[Documentary, Short]",1894,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,10,La sortie des usines Lumière (1895),"[Documentary, Short]",1895,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Reviews: preview the raw ratings before converting the timestamp column
reviews.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,114508,8,1381006850
1,2,208092,5,1586466072
2,2,358273,9,1579057827
3,2,10039344,5,1578603053
4,2,6751668,9,1578955697


In [12]:
# Check the timestamp dtype; read_csv was told to load this column as object
reviews['timestamp'].dtype

dtype('O')

In [13]:
# The timestamps were loaded as strings (object dtype) holding seconds since the
# epoch. Cast them to integers before converting: parsing strings together with
# `unit` is deprecated in recent pandas releases. `.dt.date` keeps only the
# calendar date, the same result the per-row `x.date()` call produced.
reviews['date'] = pd.to_datetime(reviews['timestamp'].astype('int64'), unit='s').dt.date

In [14]:
# Confirm the new date column was added
reviews.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,date
0,1,114508,8,1381006850,2013-10-05
1,2,208092,5,1586466072,2020-04-09
2,2,358273,9,1579057827,2020-01-15
3,2,10039344,5,1578603053,2020-01-09
4,2,6751668,9,1578955697,2020-01-13


In [16]:
# Review dtypes and non-null counts of the cleaned reviews table
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100001 entries, 0 to 100000
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    100001 non-null  object
 1   movie_id   100001 non-null  object
 2   rating     100001 non-null  int64 
 3   timestamp  100001 non-null  object
 4   date       100001 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


In [17]:
# Fetch the reference cleaned frames from the course helper for comparison.
# NOTE(review): the printed reference keeps the full timestamp in `date` and names
# the movie columns `date`, `1800's`, `1900's`, `2000's`, whereas the frames built
# in this notebook use a date-only `date` and `year`, `1800s`, `1900s`, `2000s`.
# Confirm whether downstream work expects the reference naming.
reviews_new, movies_new = t.show_clean_dataframes()

   Unnamed: 0  user_id  movie_id  rating   timestamp                 date
0           0        1    114508       8  1381006850  2013-10-05 21:00:50
1           1        2    208092       5  1586466072  2020-04-09 21:01:12
2           2        2    358273       9  1579057827  2020-01-15 03:10:27
3           3        2  10039344       5  1578603053  2020-01-09 20:50:53
4           4        2   6751668       9  1578955697  2020-01-13 22:48:17
   Unnamed: 0  movie_id                                              movie  \
0           0         8      Edison Kinetoscopic Record of a Sneeze (1894)   
1           1        10                La sortie des usines Lumière (1895)   
2           2        12                      The Arrival of a Train (1896)   
3           3        25  The Oxford and Cambridge University Boat Race ...   
4           4        91                         Le manoir du diable (1896)   

               genre  date  1800's  1900's  2000's  Documentary  Adventure  \
0  Docume

In [18]:
# Write both cleaned tables to disk, reviews first and then movies.
# to_csv is called with its defaults, so the index is written as well.
for frame, out_path in (
    (reviews, 'data/reviews_out.csv'),
    (movies, 'data/movies_out.csv'),
):
    frame.to_csv(out_path)