In [1]:
import pandas as pd
import numpy as np
from pandasql import sqldf

## Importing Data

In [2]:
# Read each service's raw catalogue and tag every row with the service it came from
def _read_service(csv_path, service_name):
    """Load one raw CSV and append a constant `service` column holding `service_name`."""
    raw = pd.read_csv(csv_path)
    return raw.assign(service=service_name)


amazon = _read_service("data/raw/amazon_prime_titles.csv", "amazon")
disney = _read_service("data/raw/disney_plus_titles.csv", "disney")
hbo = _read_service("data/raw/HBO_MAX_Content.csv", "hbo")
hulu = _read_service("data/raw/hulu_titles.csv", "hulu")
netflix = _read_service("data/raw/netflix_titles.csv", "netflix")

## Cleaning

The only columns needed from the raw data are title, release year, type, and rating (plus the `service` label added at import). All other columns can be removed, as any other necessary data is obtained through APIs or other datasets.

In [3]:
# Trim every catalogue down to the five shared columns.
# The queries are kept as named strings so the per-service differences are easy to spot.
amazon_query = "SELECT title, release_year, type, rating, service FROM amazon"
disney_query = "SELECT title, release_year, type, rating, service FROM disney"
# HBO calls the column `year`; alias it so it lines up with the other services
hbo_query = "SELECT title, year as release_year, type, rating, service FROM hbo"
hulu_query = "SELECT title, release_year, type, rating, service FROM hulu"
netflix_query = "SELECT title, release_year, type, rating, service FROM netflix"

amazon = sqldf(amazon_query)
disney = sqldf(disney_query)
hbo = sqldf(hbo_query)
hulu = sqldf(hulu_query)
netflix = sqldf(netflix_query)

In [4]:
# Sanity check: every service should now expose the same column layout
for frame in (amazon, disney, hbo, hulu, netflix):
    print(frame.columns.tolist())

['title', 'release_year', 'type', 'rating', 'service']
['title', 'release_year', 'type', 'rating', 'service']
['title', 'release_year', 'type', 'rating', 'service']
['title', 'release_year', 'type', 'rating', 'service']
['title', 'release_year', 'type', 'rating', 'service']


In [5]:
# Report row counts, dtypes and non-null counts for each service.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" after each report.
for frame in [amazon, disney, hbo, hulu, netflix]:
    frame.info()
    # Blank line to separate consecutive reports
    print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9668 entries, 0 to 9667
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         9668 non-null   object
 1   release_year  9668 non-null   int64 
 2   type          9668 non-null   object
 3   rating        9331 non-null   object
 4   service       9668 non-null   object
dtypes: int64(1), object(4)
memory usage: 377.8+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450 entries, 0 to 1449
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         1450 non-null   object
 1   release_year  1450 non-null   int64 
 2   type          1450 non-null   object
 3   rating        1447 non-null   object
 4   service       1450 non-null   object
dtypes: int64(1), object(4)
memory usage: 56.8+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2087 entries, 0 to 2086
Data column

In [6]:
# Summary statistics for the numeric columns of each service,
# with a blank line separating consecutive reports
for frame in (amazon, disney, hbo, hulu, netflix):
    summary = frame.describe()
    print(summary, end="\n\n")

       release_year
count   9668.000000
mean    2008.341849
std       18.922482
min     1920.000000
25%     2007.000000
50%     2016.000000
75%     2019.000000
max     2021.000000

       release_year
count   1450.000000
mean    2003.091724
std       21.860162
min     1928.000000
25%     1999.000000
50%     2011.000000
75%     2018.000000
max     2021.000000

       release_year
count   2087.000000
mean    1997.639195
std       22.898744
min     1915.000000
25%     1989.000000
50%     2006.000000
75%     2015.000000
max     2020.000000

       release_year
count   3073.000000
mean    2012.567524
std       10.844069
min     1923.000000
25%     2010.000000
50%     2016.000000
75%     2019.000000
max     2021.000000

       release_year
count   8807.000000
mean    2014.180198
std        8.819312
min     1925.000000
25%     2013.000000
50%     2017.000000
75%     2019.000000
max     2021.000000



Most of the necessary data is present and non-null. Release years are all integers with minimum and maximum values that make sense. Ratings need to be cleaned up. There is also a fair number of null ratings, but this is not critical, as rating distributions can still be observed. Types are mostly consistent, but HBO stores the Movie type as null and the TV Show type as `TV`. This needs to be fixed.

Incorrect data will be filtered out when the data is merged with results from the TMDb API.

## Cleaning Type

In [7]:
# Inspect the distinct content-type labels used by each service
for frame in (amazon, disney, hbo, hulu, netflix):
    print(frame["type"].unique())

['Movie' 'TV Show']
['Movie' 'TV Show']
['TV' None]
['Movie' 'TV Show']
['Movie' 'TV Show']


In [8]:
# Normalise a raw content-type label to one of "Movie" / "TV Show"
def get_type(t):
    """Return the standardised content type for a raw label.

    "TV" becomes "TV Show"; "TV Show" and "Movie" are returned unchanged;
    every other value (including None) is treated as "Movie".
    """
    if t == "TV":
        return "TV Show"
    already_standard = (t == "TV Show" or t == "Movie")
    return t if already_standard else "Movie"

In [9]:
# Overwrite each frame's `type` column with its standardised label
for frame in (amazon, disney, hbo, hulu, netflix):
    standardised_types = frame["type"].apply(get_type)
    frame["type"] = standardised_types

## Cleaning Ratings

In [10]:
# Inspect the distinct rating labels used by each service
for frame in (amazon, disney, hbo, hulu, netflix):
    print(frame["rating"].unique())

[None '13+' 'ALL' '18+' 'R' 'TV-Y' 'TV-Y7' 'NR' '16+' 'TV-PG' '7+' 'TV-14'
 'TV-NR' 'TV-G' 'PG-13' 'TV-MA' 'G' 'PG' 'NC-17' 'UNRATED' '16' 'AGES_16_'
 'AGES_18_' 'ALL_AGES' 'NOT_RATE']
['TV-G' 'PG' 'TV-PG' None 'PG-13' 'TV-14' 'G' 'TV-Y7' 'TV-Y' 'TV-Y7-FV']
['R/TV-MA' 'PG-13/TV-14' 'PG/TV-PG' None 'G/TV-G']
['TV-MA' None 'PG-13' 'R' 'TV-14' 'PG' 'TV-PG' 'NOT RATED' 'G' 'TV-G'
 '2 Seasons' 'TV-Y' '93 min' '4 Seasons' 'TV-Y7' '136 min' '91 min'
 '85 min' '98 min' '89 min' '94 min' '86 min' '3 Seasons' '121 min'
 '88 min' '101 min' '1 Season' '83 min' '100 min' '95 min' '92 min'
 '96 min' '109 min' '99 min' '75 min' '87 min' '67 min' '104 min'
 '107 min' '84 min' '103 min' '105 min' '119 min' '114 min' '82 min'
 '90 min' '130 min' '110 min' '80 min' '6 Seasons' '97 min' '111 min'
 '81 min' '49 min' '45 min' '41 min' '73 min' '40 min' '36 min' '39 min'
 '34 min' '47 min' '65 min' '37 min' '78 min' '102 min' '129 min'
 '115 min' '112 min' 'NR' '61 min' '106 min' '76 min' '77 min' '79 min'
 

In [11]:
# Lookup table translating service-specific rating labels into the shared set
# (G, PG, PG-13, R, NC-17, NR). Built once here instead of on every call, since
# get_rating is applied to every row of every catalogue.
_RATING_CONVERSION = {
    "13+": "PG-13", "16": "R", "16+": "R", "18+": "NC-17", "7+": "G",
    "AGES_16_": "R", "AGES_18_": "NC-17", "ALL": "G", "ALL_AGES": "G",
    "NOT RATED": "NR", "NOT_RATE": "NR", "TV-14": "PG-13", "TV-G": "G",
    "TV-MA": "R", "TV-PG": "PG", "TV-Y": "G", "TV-Y7": "PG", "TV-Y7-FV": "PG",
    "UNRATED": "NR", "UR": "NR", "TV-NR": "NR", "R/TV-MA": "R",
    "PG-13/TV-14": "PG-13", "PG/TV-PG": "PG", "G/TV-G": "G",
}

# Ratings that already belong to the shared set and can pass through unchanged
_CANONICAL_RATINGS = frozenset(_RATING_CONVERSION.values())


# Get a new rating that conforms to other data
def get_rating(rating):
    """Translate a raw rating label into the shared rating set.

    Parameters
    ----------
    rating : hashable
        Raw rating value from a service catalogue (typically a str or None).

    Returns
    -------
    str or None
        The converted rating when `rating` is a known service-specific label,
        `rating` itself when it is already one of the shared ratings, and
        None for anything else (missing values, durations such as "93 min", ...).
    """
    # Known service-specific label: convert it
    if rating in _RATING_CONVERSION:
        return _RATING_CONVERSION[rating]
    # Already a member of the shared rating set: keep it
    if rating in _CANONICAL_RATINGS:
        return rating
    # Neither convertible nor canonical: the value must be thrown out
    return None

In [12]:
# Overwrite each frame's `rating` column with its standardised rating
for frame in (amazon, disney, hbo, hulu, netflix):
    standardised_ratings = frame["rating"].apply(get_rating)
    frame["rating"] = standardised_ratings

## Saving Clean Data

In [13]:
# Persist every cleaned catalogue as a CSV (row index omitted)
clean_outputs = {
    "data/amazon_clean.csv": amazon,
    "data/disney_clean.csv": disney,
    "data/hbo_clean.csv": hbo,
    "data/hulu_clean.csv": hulu,
    "data/netflix_clean.csv": netflix,
}
for destination, frame in clean_outputs.items():
    frame.to_csv(destination, index=False)