In [1]:
import pandas as pd
import numpy as np
from pandasql import sqldf

## Importing Data

In [2]:
# Add column with service name to each DataFrame
amazon = pd.read_csv("data/raw/amazon_prime_titles.csv").assign(service="amazon")
disney = pd.read_csv("data/raw/disney_plus_titles.csv").assign(service="disney")
hulu = pd.read_csv("data/raw/hulu_titles.csv").assign(service="hulu")
netflix = pd.read_csv("data/raw/netflix_titles.csv").assign(service="netflix")

## Cleaning

In [3]:
# All columns are the same across these datasets
for i in [amazon, disney, hulu, netflix]:
    print(list(i.columns))

['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description', 'service']
['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description', 'service']
['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description', 'service']
['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description', 'service']


In [4]:
amazon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9668 entries, 0 to 9667
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       9668 non-null   object
 1   type          9668 non-null   object
 2   title         9668 non-null   object
 3   director      7586 non-null   object
 4   cast          8435 non-null   object
 5   country       672 non-null    object
 6   date_added    155 non-null    object
 7   release_year  9668 non-null   int64 
 8   rating        9331 non-null   object
 9   duration      9668 non-null   object
 10  listed_in     9668 non-null   object
 11  description   9668 non-null   object
 12  service       9668 non-null   object
dtypes: int64(1), object(12)
memory usage: 982.0+ KB


In [5]:
disney.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450 entries, 0 to 1449
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       1450 non-null   object
 1   type          1450 non-null   object
 2   title         1450 non-null   object
 3   director      977 non-null    object
 4   cast          1260 non-null   object
 5   country       1231 non-null   object
 6   date_added    1447 non-null   object
 7   release_year  1450 non-null   int64 
 8   rating        1447 non-null   object
 9   duration      1450 non-null   object
 10  listed_in     1450 non-null   object
 11  description   1450 non-null   object
 12  service       1450 non-null   object
dtypes: int64(1), object(12)
memory usage: 147.4+ KB


In [6]:
hulu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3073 entries, 0 to 3072
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   show_id       3073 non-null   object 
 1   type          3073 non-null   object 
 2   title         3073 non-null   object 
 3   director      3 non-null      object 
 4   cast          0 non-null      float64
 5   country       1620 non-null   object 
 6   date_added    3045 non-null   object 
 7   release_year  3073 non-null   int64  
 8   rating        2553 non-null   object 
 9   duration      2594 non-null   object 
 10  listed_in     3073 non-null   object 
 11  description   3069 non-null   object 
 12  service       3073 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 312.2+ KB


In [7]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
 12  service       8807 non-null   object
dtypes: int64(1), object(12)
memory usage: 894.6+ KB


Columns that will be dropped are: show_id, director, and cast.

Show_id is not a unique identifier when dataframes are combined, and directors and cast members are not present or consistent enough to justify presence. May be brought in by using other data.

In [8]:
for i in [amazon, disney, hulu, netflix]:
    try:
        i.drop(["show_id", "director", "cast"], axis="columns", inplace=True)
    except:
        pass

In [9]:
pd.set_option('display.max_rows', None)

In [10]:
amazon.head(1000)

Unnamed: 0,type,title,country,date_added,release_year,rating,duration,listed_in,description,service
0,Movie,The Grand Seduction,Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...,amazon
1,Movie,Take Care Good Night,India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...,amazon
2,Movie,Secrets of Deception,United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...,amazon
3,Movie,Pink: Staying True,United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ...",amazon
4,Movie,Monster Maker,United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...,amazon
5,Movie,Living With Dinosaurs,United Kingdom,"March 30, 2021",1989,,52 min,"Fantasy, Kids",The story unfolds in a an English seaside town...,amazon
6,Movie,Hired Gun,United States,"March 30, 2021",2017,,98 min,"Documentary, Special Interest","They are the ""First Call, A-list"" musicians, j...",amazon
7,Movie,Grease Live!,United States,"March 30, 2021",2016,,131 min,Comedy,"This honest, uncompromising comedy chronicles ...",amazon
8,Movie,Global Meltdown,Canada,"March 30, 2021",2017,,87 min,"Action, Science Fiction, Suspense",A helicopter pilot and an environmental scient...,amazon
9,Movie,David's Mother,United States,"April 1, 2021",1994,,92 min,Drama,Sally Goodson is a devoted mother to her autis...,amazon


In [11]:
disney.head(1000)

Unnamed: 0,type,title,country,date_added,release_year,rating,duration,listed_in,description,service
0,Movie,Duck the Halls: A Mickey Mouse Christmas Special,,"November 26, 2021",2016,TV-G,23 min,"Animation, Family",Join Mickey and the gang as they duck the halls!,disney
1,Movie,Ernest Saves Christmas,,"November 26, 2021",1988,PG,91 min,Comedy,Santa Claus passes his magic bag to a new St. ...,disney
2,Movie,Ice Age: A Mammoth Christmas,United States,"November 26, 2021",2011,TV-G,23 min,"Animation, Comedy, Family",Sid the Sloth is on Santa's naughty list.,disney
3,Movie,The Queen Family Singalong,,"November 26, 2021",2021,TV-PG,41 min,Musical,"This is real life, not just fantasy!",disney
4,TV Show,The Beatles: Get Back,,"November 25, 2021",2021,,1 Season,"Docuseries, Historical, Music",A three-part documentary from Peter Jackson ca...,disney
5,Movie,Becoming Cousteau,United States,"November 24, 2021",2021,PG-13,94 min,"Biographical, Documentary",An inside look at the legendary life of advent...,disney
6,TV Show,Hawkeye,,"November 24, 2021",2021,TV-14,1 Season,"Action-Adventure, Superhero",Clint Barton/Hawkeye must team up with skilled...,disney
7,TV Show,Port Protection Alaska,United States,"November 24, 2021",2015,TV-14,2 Seasons,"Docuseries, Reality, Survival",Residents of Port Protection must combat volat...,disney
8,TV Show,Secrets of the Zoo: Tampa,United States,"November 24, 2021",2019,TV-PG,2 Seasons,"Animals & Nature, Docuseries, Family",A day in the life at ZooTampa is anything but ...,disney
9,Movie,A Muppets Christmas: Letters To Santa,United States,"November 19, 2021",2008,G,45 min,"Comedy, Family, Musical",Celebrate the holiday season with all your fav...,disney


In [12]:
hulu.head(1000)

Unnamed: 0,type,title,country,date_added,release_year,rating,duration,listed_in,description,service
0,Movie,Ricky Velez: Here's Everything,,"October 24, 2021",2021,TV-MA,,"Comedy, Stand Up",​Comedian Ricky Velez bares it all with his ho...,hulu
1,Movie,Silent Night,,"October 23, 2021",2020,,94 min,"Crime, Drama, Thriller","Mark, a low end South London hitman recently r...",hulu
2,Movie,The Marksman,,"October 23, 2021",2021,PG-13,108 min,"Action, Thriller",A hardened Arizona rancher tries to protect an...,hulu
3,Movie,Gaia,,"October 22, 2021",2021,R,97 min,Horror,A forest ranger and two survivalists with a cu...,hulu
4,Movie,Settlers,,"October 22, 2021",2021,,104 min,"Science Fiction, Thriller",Mankind's earliest settlers on the Martian fro...,hulu
5,TV Show,The Halloween Candy Magic Pet,,"October 22, 2021",2021,,1 Season,"Family, Kids",Join Mila and Morphle on a mystery-filled Hall...,hulu
6,Movie,The Evil Next Door,,"October 21, 2021",2020,,88 min,"Horror, Thriller","New to her role as a stepmom, a young woman mo...",hulu
7,TV Show,The Next Thing You Eat,,"October 21, 2021",2021,,1 Season,"Cooking & Food, Documentaries, Lifestyle & Cul...",With the unique insights and experience of Ugl...,hulu
8,TV Show,Queens,,"October 20, 2021",2021,TV-14,1 Season,"Drama, Music",Four women in their 40s reunite for a chance t...,hulu
9,TV Show,The Bachelorette,United States,"October 20, 2021",2003,TV-14,3 Seasons,"Reality, Romance",ABC's romance reality show lets one lucky lady...,hulu


In [13]:
netflix.head(1000)

Unnamed: 0,type,title,country,date_added,release_year,rating,duration,listed_in,description,service
0,Movie,Dick Johnson Is Dead,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",netflix
1,TV Show,Blood & Water,South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",netflix
2,TV Show,Ganglands,,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,netflix
3,TV Show,Jailbirds New Orleans,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",netflix
4,TV Show,Kota Factory,India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,netflix
5,TV Show,Midnight Mass,,"September 24, 2021",2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...,netflix
6,Movie,My Little Pony: A New Generation,,"September 24, 2021",2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...,netflix
7,Movie,Sankofa,"United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s...",netflix
8,TV Show,The Great British Baking Show,United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...,netflix
9,Movie,The Starling,United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...,netflix


## Checking Column Consistency Across DataFrames

In [14]:
# Type consistency
for i in [amazon,disney,hulu,netflix]:
    print(i.type.unique())

['Movie' 'TV Show']
['Movie' 'TV Show']
['Movie' 'TV Show']
['Movie' 'TV Show']


In [15]:
# Looking at different country formats
diff_countries = []
for i in [amazon,disney,hulu,netflix]:
    for countries in i.country.unique():
        try:
            for country in countries.split(","):
                if(country not in diff_countries):
                    diff_countries.append(country)
        except AttributeError:
            # These are NaN values
            pass
sorted(diff_countries)

['',
 ' Afghanistan',
 ' Albania',
 ' Algeria',
 ' Angola',
 ' Argentina',
 ' Armenia',
 ' Australia',
 ' Austria',
 ' Azerbaijan',
 ' Bahamas',
 ' Bangladesh',
 ' Belgium',
 ' Bermuda',
 ' Botswana',
 ' Brazil',
 ' Bulgaria',
 ' Burkina Faso',
 ' Cambodia',
 ' Canada',
 ' Cayman Islands',
 ' Chile',
 ' China',
 ' Colombia',
 ' Costa Rica',
 ' Croatia',
 ' Cuba',
 ' Czech Republic',
 ' Denmark',
 ' Dominican Republic',
 ' East Germany',
 ' Ecuador',
 ' Egypt',
 ' Ethiopia',
 ' Finland',
 ' France',
 ' French Polynesia',
 ' Georgia',
 ' Germany',
 ' Ghana',
 ' Greece',
 ' Guatemala',
 ' Hong Kong',
 ' Hungary',
 ' Iceland',
 ' India',
 ' Indonesia',
 ' Iran',
 ' Iraq',
 ' Ireland',
 ' Israel',
 ' Italy',
 ' Japan',
 ' Jordan',
 ' Kazakhstan',
 ' Kenya',
 ' Kuwait',
 ' Latvia',
 ' Lebanon',
 ' Liechtenstein',
 ' Lithuania',
 ' Luxembourg',
 ' Malawi',
 ' Malaysia',
 ' Malta',
 ' Mexico',
 ' Monaco',
 ' Mongolia',
 ' Montenegro',
 ' Morocco',
 ' Namibia',
 ' Nepal',
 ' Netherlands',
 ' Ne

In [16]:
# Remaking country strings so that they are consistent
for i in [amazon,disney,hulu,netflix]:
    new_countries = []
    for countries in i.country:
            try:
                new_str = ""
                for country in countries.split(","):
                    if(country != ""):
                        new_str += country.strip() + ","
                new_countries.append(new_str[:-1])
            except AttributeError:
                # These are NaN values
                new_countries.append(countries)
    i["country"] = new_countries

In [17]:
# Date added consistency
for i in [amazon,disney,hulu,netflix]:
    print(i.date_added.unique())

['March 30, 2021' 'April 1, 2021' 'April 4, 2021' 'April 10, 2021'
 'April 17, 2021' 'April 24, 2021' 'May 2, 2021' 'June 3, 2021' nan
 'April 13, 2021' 'April 20, 2021' 'April 26, 2021' 'April 27, 2021'
 'April 30, 2021' 'May 1, 2021' 'May 9, 2021' 'May 11, 2021'
 'May 12, 2021' 'May 13, 2021' 'May 18, 2021' 'May 19, 2021'
 'May 22, 2021' 'May 24, 2021' 'May 29, 2021' 'June 4, 2021'
 'June 11, 2021' 'June 12, 2021' 'June 15, 2021' 'June 18, 2021'
 'June 20, 2021' 'June 21, 2021' 'June 22, 2021' 'June 23, 2021'
 'June 24, 2021' 'June 25, 2021' 'June 26, 2021' 'June 29, 2021'
 'July 1, 2021' 'July 3, 2021' 'July 8, 2021' 'July 12, 2021'
 'July 16, 2021' 'July 17, 2021' 'July 20, 2021' 'July 23, 2021'
 'July 24, 2021' 'July 26, 2021' 'July 27, 2021' 'August 1, 2021'
 'August 4, 2021' 'August 6, 2021' 'August 14, 2021' 'August 15, 2021'
 'August 16, 2021' 'August 17, 2021' 'August 18, 2021' 'August 21, 2021'
 'August 26, 2021' 'August 27, 2021' 'August 29, 2021' 'September 2, 2021'
 'Sept

In [18]:
# Release year consistency
for i in [amazon,disney,hulu,netflix]:
    print(i.release_year.unique())

[2014 2018 2017 1989 2016 1994 2020 2019 2008 2001 1941 1991 2005 2015
 2011 2013 1949 2007 2002 1955 1959 1983 2009 2012 2010 1986 1988 1920
 1936 1992 2021 1993 2006 1948 1946 1944 1935 1985 1937 1970 1945 1939
 1996 1997 1974 1938 1978 2004 1943 1975 1960 1934 1940 1961 2003 2000
 1967 1995 1951 1932 1999 1963 1969 1952 1947 1929 1990 1925 1968 1987
 1942 1979 1980 1981 1976 1966 1973 1956 1972 1950 1953 1982 1977 1933
 1958 1984 1998 1924 1922 1926 1954 1930 1971 1965 1931 1923 1962 1964
 1957 1927]
[2016 1988 2011 2021 2015 2019 2008 2020 2007 2013 2018 2014 2012 2006
 2010 1996 2009 2017 1993 1994 1998 1989 1997 2005 2000 2004 1987 1985
 1967 1973 1991 1956 1995 1984 1974 1959 2003 1976 2001 1990 1992 1952
 1955 1977 1957 1999 1948 1964 1969 1942 1950 1951 1953 1949 1940 1946
 1954 1936 1944 1935 1939 1975 1978 2002 1971 1961 1962 1981 1932 1938
 1941 1986 1947 1937 1966 1943 1934 1980 1960 1983 1972 1982 1979 1928
 1965 1970 1963 1933 1945 1968]
[2021 2020 2003 2011 2015 2012 20

In [19]:
# Rating consistency
for i in [amazon,disney,hulu,netflix]:
    print(i.rating.unique())

[nan '13+' 'ALL' '18+' 'R' 'TV-Y' 'TV-Y7' 'NR' '16+' 'TV-PG' '7+' 'TV-14'
 'TV-NR' 'TV-G' 'PG-13' 'TV-MA' 'G' 'PG' 'NC-17' 'UNRATED' '16' 'AGES_16_'
 'AGES_18_' 'ALL_AGES' 'NOT_RATE']
['TV-G' 'PG' 'TV-PG' nan 'PG-13' 'TV-14' 'G' 'TV-Y7' 'TV-Y' 'TV-Y7-FV']
['TV-MA' nan 'PG-13' 'R' 'TV-14' 'PG' 'TV-PG' 'NOT RATED' 'G' 'TV-G'
 '2 Seasons' 'TV-Y' '93 min' '4 Seasons' 'TV-Y7' '136 min' '91 min'
 '85 min' '98 min' '89 min' '94 min' '86 min' '3 Seasons' '121 min'
 '88 min' '101 min' '1 Season' '83 min' '100 min' '95 min' '92 min'
 '96 min' '109 min' '99 min' '75 min' '87 min' '67 min' '104 min'
 '107 min' '84 min' '103 min' '105 min' '119 min' '114 min' '82 min'
 '90 min' '130 min' '110 min' '80 min' '6 Seasons' '97 min' '111 min'
 '81 min' '49 min' '45 min' '41 min' '73 min' '40 min' '36 min' '39 min'
 '34 min' '47 min' '65 min' '37 min' '78 min' '102 min' '129 min'
 '115 min' '112 min' 'NR' '61 min' '106 min' '76 min' '77 min' '79 min'
 '157 min' '28 min' '64 min' '7 min' '5 min' '6 min' '1

In [20]:
# If duration is misplaced, move it and set rating to NaN
for i in [amazon, disney, hulu, netflix]:
    for index, row in i.iterrows():
        if("Season" in str(row["rating"]) or "min" in str(row["rating"])):
            row["duration"] = row["rating"]
            row["rating"] = np.nan
            i.loc[index] = row

In [21]:
# Duration consistency
for i in [amazon,disney,hulu,netflix]:
    print(i.duration.unique())

['113 min' '110 min' '74 min' '69 min' '45 min' '52 min' '98 min'
 '131 min' '87 min' '92 min' '88 min' '93 min' '94 min' '46 min' '96 min'
 '1 Season' '104 min' '62 min' '50 min' '3 Seasons' '2 Seasons' '86 min'
 '36 min' '37 min' '103 min' '9 min' '18 min' '14 min' '20 min' '19 min'
 '22 min' '60 min' '6 min' '54 min' '5 min' '84 min' '126 min' '125 min'
 '109 min' '89 min' '85 min' '56 min' '40 min' '111 min' '33 min' '34 min'
 '95 min' '99 min' '78 min' '4 Seasons' '77 min' '55 min' '53 min'
 '115 min' '58 min' '49 min' '135 min' '91 min' '64 min' '59 min' '48 min'
 '122 min' '90 min' '102 min' '65 min' '114 min' '136 min' '70 min'
 '138 min' '100 min' '480 min' '4 min' '30 min' '152 min' '68 min'
 '57 min' '7 Seasons' '31 min' '151 min' '149 min' '9 Seasons' '141 min'
 '121 min' '79 min' '140 min' '51 min' '106 min' '75 min' '27 min'
 '107 min' '108 min' '38 min' '157 min' '43 min' '118 min' '139 min'
 '6 Seasons' '112 min' '15 min' '72 min' '5 Seasons' '116 min' '142 min'
 '71 mi

In [22]:
# Turn durations into integers. Show durations are in number of seasons and movie durations in minutes
# Also drop rows where duration is nan
for i in [amazon, disney, hulu, netflix]:
    new_durations = []
    for dur in i.duration:
        try:
            new_durations.append(int(dur.split(" ")[0]))
        except:
            try:
                new_durations.append(int(dur))
            except:
                new_durations.append(0)
    i["duration"] = new_durations
    
# Iteratively, this did not work, so manual restriction of duration is done here
amazon = sqldf("SELECT * FROM amazon WHERE duration > 0")
disney = sqldf("SELECT * FROM disney WHERE duration > 0")
hulu = sqldf("SELECT * FROM hulu WHERE duration > 0")
netflix = sqldf("SELECT * FROM netflix WHERE duration > 0")

## Fixing Genre Disaster

Genres are very inconsistent across different services. Here, genre strings are being rebuilt to be consistent

In [23]:
# Rewriting similar genres so that formats are consistent across services
def get_genre(genre, content_type):
    # Dictionary for replacing inconsistent movie genres
    movie_rewrites = {"Arts": ["Arts/Entertainment/Culture"], "Entertainment": [None], "and Culture": [None], 
                     "Action & Adventure": ["Action", "Adventure"], "Action-Adventure": ["Action", "Adventure"], 
                     "Anime Features": ["Anime"], "Classic Movies": ["Classics"], "Comedies": ["Comedy"], "Documentaries": ["Documentary"],
                     "Dramas": ["Drama"], "Faith & Spirituality": ["Faith and Spirituality"], "Children & Family Movies": ["Family"], 
                     "Fitness": ["Health and Wellness"], "Historical": ["History"], "Horror Movies": ["Horror"], 
                     "International Movies": ["International"], "LGBTQ": ["LGBTQ+"], "LGBTQ Movies": ["LGBTQ+"],
                     "Lifestyle & Culture": ["Lifestyle"], "Music & Musicals": ["Music"], 
                     "Musical": ["Music"], "Concert Film": ["Music Videos and Concerts"], "Romantic Comedy": ["Romance", "Comedy"],
                     "Romantic Movies": ["Romance"], "Sci-Fi & Fantasy": ["Science Fiction", "Fantasy"], "Sitcom": ["Comedy"], 
                     "Sports Movies": ["Sports"], "Stand-Up Comedy": ["Stand Up"], "Talk Show and Variety": ["Variety"], 
                     "Thrillers": ["Thriller"], "Travel": ["Lifestyle"], "Cartoons": ["Animation", "Cartoons"], "Suspense": ["Thriller"],
                     "Movies": [None], "": [None]}

    # Dictionary for replacing inconsistent TV genres
    tv_rewrites = {"Arts": ["Arts/Entertainment/Culture"], "Entertainment": [None], "and Culture": [None],
                  "Action-Adventure": ["Action", "Adventure"], "Anime Series": ["Anime"], "British TV Shows": ["International", "British"],
                  "Classic & Cult TV": ["Classics"], "Crime TV Shows": ["Crime"], "Documentaries": ["Documentary"], 
                  "Docuseries": ["Documentary"], "Game Show / Competition": ["Game Show"], "Game Shows": ["Game Show"], 
                  "Fitness": ["Health & Wellness"], "Historical": ["History"], "International TV Shows": ["International"],
                  "Kids' TV": ["Kids"], "Korean TV Shows": ["International", "Korean"], "LGBTQ": ["LGBTQ+"], 
                  "Lifestyle & Culture": ["Lifestyle"], "Musical": ["Music"], "Reality TV": ["Reality"], "Romantic TV Shows": ["Romance"],
                  "Science & Nature TV": ["Science and Nature"], "Animals & Nature": ["Science and Nature"], "Series": [None],
                  "Sitcom": ["Sitcom", "Comedy"], "Sketch Comedy": ["Sketch Comedy", "Comedy"], "Soap Opera / Melodrama": ["Soap Opera"],
                  "Spanish-Language TV Shows": ["International", "Spanish"], "Stand-Up Comedy & Talk Shows": ["Stand Up", "Talk Show and Variety"],
                  "Latino": ["International", "Latino"], "TV Action & Adventure": ["Action", "Adventure"], "TV Comedies": ["Comedy"],
                  "TV Dramas": ["Drama"], "TV Horror": ["Horror"], "TV Mysteries": ["Mystery"], 
                  "TV Sci-Fi & Fantasy": ["Science Fiction", "Fantasy"], "TV Shows": [None], "TV Thrillers": ["Thriller"], 
                  "Stand Up": ["Talk Show and Variety"], "Talk Show": ["Talk Show and Variety"], "Teen TV Shows": ["Teen"],
                  "Variety": ["Talk Show and Variety"], "Travel": ["Lifestyle"], "Cartoons": ["Animation", "Cartoons"], "Suspense": ["Thriller"],
                  "Movies": [None], "": [None]}

    if(content_type == "Movie" and genre in movie_rewrites):
        return movie_rewrites.get(genre)
    elif(content_type == "TV Show" and genre in tv_rewrites):
        return tv_rewrites.get(genre)
    elif("&" in genre):
        return [genre.replace("&", "and")]
    else:
        return [genre]

In [24]:
# Set new genre strings
for i in [amazon, disney, hulu, netflix]:
    new_genres = []
    for index,row in i.iterrows():
        new_str = ""
        for genre in row["listed_in"].split(", "):
            gs = get_genre(genre, row["type"])
            for g in gs:
                if(g != None and str(g) not in new_str):
                    new_str += g + ","
        new_genres.append(new_str[:-1])
    i["listed_in"] = new_genres
    
# Iteratively, this did not work, so manual restriction of duration is done here
amazon = sqldf("SELECT * FROM amazon WHERE listed_in != \"\"")
disney = sqldf("SELECT * FROM disney WHERE listed_in != \"\"")
hulu = sqldf("SELECT * FROM hulu WHERE listed_in != \"\"")
netflix = sqldf("SELECT * FROM netflix WHERE listed_in != \"\"")

In [25]:
unique_genres = []
for i in [amazon, disney, hulu, netflix]:
    for genres in i.listed_in:
        for g in genres.split(","):
            if(g not in unique_genres):
                unique_genres.append(g)
sorted(unique_genres)

['Action',
 'Adult Animation',
 'Adventure',
 'Animals and Nature',
 'Animation',
 'Anime',
 'Anthology',
 'Arthouse',
 'Arts/Entertainment/Culture',
 'Biographical',
 'Black Stories',
 'British',
 'Buddy',
 'Cartoons',
 'Classics',
 'Comedy',
 'Coming of Age',
 'Cooking and Food',
 'Crime',
 'Cult Movies',
 'Dance',
 'Disaster',
 'Documentary',
 'Drama',
 'Faith and Spirituality',
 'Family',
 'Fantasy',
 'Game Show',
 'Health & Wellness',
 'Health and Wellness',
 'History',
 'Horror',
 'Independent Movies',
 'International',
 'Kids',
 'Korean',
 'LGBTQ+',
 'Late Night',
 'Latino',
 'Lifestyle',
 'Medical',
 'Military and War',
 'Music',
 'Music Videos and Concerts',
 'Mystery',
 'News',
 'Parody',
 'Police/Cop',
 'Reality',
 'Romance',
 'Science Fiction',
 'Science and Nature',
 'Science and Technology',
 'Sitcom',
 'Sketch Comedy',
 'Soap Opera',
 'Spanish',
 'Special Interest',
 'Sports',
 'Spy/Espionage',
 'Stand Up',
 'Superhero',
 'Survival',
 'Talk Show and Variety',
 'Teen',
 '

## Saving Clean Data

In [26]:
# Write all clean datasets to .csv files
amazon.to_csv("data/amazon_clean.csv", index=False)
disney.to_csv("data/disney_clean.csv", index=False)
hulu.to_csv("data/hulu_clean.csv", index=False)
netflix.to_csv("data/netflix_clean.csv", index=False)