In [1]:
# For Basic Operations
import pandas as pd
import numpy as np

## Creating 2018 movie dataset from Wikipedia

In [2]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2018"
# pd.read_html : Read HTML tables into a list of DataFrame objects.
# returns: dfs, A list of DataFrames

df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]

In [3]:
# combining the lists, ignoring the index values
df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)

# printing the data
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.,Ref.
0,JANUARY,5,Insidious: The Last Key,Universal Pictures / Blumhouse Productions / S...,Adam Robitel (director); Leigh Whannell (scree...,[2],
1,JANUARY,5,The Strange Ones,Vertical Entertainment,Lauren Wolkstein (director); Christopher Radcl...,[3],
2,JANUARY,5,Stratton,Momentum Pictures,"Simon West (director); Duncan Falconer, Warren...",[4],
3,JANUARY,10,Sweet Country,Samuel Goldwyn Films,"Warwick Thornton (director); David Tranter, St...",[5],
4,JANUARY,12,The Commuter,Lionsgate / StudioCanal / The Picture Company,Jaume Collet-Serra (director); Byron Willinger...,[6],
...,...,...,...,...,...,...,...
267,DECEMBER,25,Holmes & Watson,Columbia Pictures / Gary Sanchez Productions,Etan Cohen (director/screenplay); Will Ferrell...,,[164]
268,DECEMBER,25,Vice,Annapurna Pictures / Plan B Entertainment,Adam McKay (director/screenplay); Christian Ba...,,[137]
269,DECEMBER,25,On the Basis of Sex,Focus Features,Mimi Leder (director); Daniel Stiepleman (scre...,,[228]
270,DECEMBER,25,Destroyer,Annapurna Pictures,"Karyn Kusama (director); Phil Hay, Matt Manfre...",,[260]


In [4]:
# pip install tmdbv3api

In [5]:
# will now use tmdb data to fetch genre of the Movies
from tmdbv3api import TMDb
import json
import requests
tmdb = TMDb()
tmdb.api_key = '75f49154067c35aa93fcb726bdcc2adb'

In [6]:
from tmdbv3api import Movie
tmdb_movie = Movie()

# function to get genre of the movie
def get_genre(x):
    genres = []
    result = tmdb_movie.search(x)
    if not result:
        return np.NaN
    else:
        movie_id = result[0].id
        response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,tmdb.api_key))
        data_json = response.json()
        if data_json['genres']:
            genre_str = " " 
            for i in range(0,len(data_json['genres'])):
                genres.append(data_json['genres'][i]['name'])
            return genre_str.join(genres)
        else:
            np.NaN

In [7]:
# creating 'genres' column
df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))

# printing the data
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.,Ref.,genres
0,JANUARY,5,Insidious: The Last Key,Universal Pictures / Blumhouse Productions / S...,Adam Robitel (director); Leigh Whannell (scree...,[2],,Horror Mystery Thriller
1,JANUARY,5,The Strange Ones,Vertical Entertainment,Lauren Wolkstein (director); Christopher Radcl...,[3],,Thriller Drama
2,JANUARY,5,Stratton,Momentum Pictures,"Simon West (director); Duncan Falconer, Warren...",[4],,Action Thriller
3,JANUARY,10,Sweet Country,Samuel Goldwyn Films,"Warwick Thornton (director); David Tranter, St...",[5],,Drama History Western
4,JANUARY,12,The Commuter,Lionsgate / StudioCanal / The Picture Company,Jaume Collet-Serra (director); Byron Willinger...,[6],,Action Thriller Mystery
...,...,...,...,...,...,...,...,...
267,DECEMBER,25,Holmes & Watson,Columbia Pictures / Gary Sanchez Productions,Etan Cohen (director/screenplay); Will Ferrell...,,[164],Mystery Adventure Comedy Crime
268,DECEMBER,25,Vice,Annapurna Pictures / Plan B Entertainment,Adam McKay (director/screenplay); Christian Ba...,,[137],Thriller Science Fiction Action Adventure
269,DECEMBER,25,On the Basis of Sex,Focus Features,Mimi Leder (director); Daniel Stiepleman (scre...,,[228],Drama History
270,DECEMBER,25,Destroyer,Annapurna Pictures,"Karyn Kusama (director); Phil Hay, Matt Manfre...",,[260],Thriller Crime Drama Action


In [8]:
# creating new data using certain columns of df
df_2018 = df[['Title','Cast and crew','genres']]

df_2018

Unnamed: 0,Title,Cast and crew,genres
0,Insidious: The Last Key,Adam Robitel (director); Leigh Whannell (scree...,Horror Mystery Thriller
1,The Strange Ones,Lauren Wolkstein (director); Christopher Radcl...,Thriller Drama
2,Stratton,"Simon West (director); Duncan Falconer, Warren...",Action Thriller
3,Sweet Country,"Warwick Thornton (director); David Tranter, St...",Drama History Western
4,The Commuter,Jaume Collet-Serra (director); Byron Willinger...,Action Thriller Mystery
...,...,...,...
267,Holmes & Watson,Etan Cohen (director/screenplay); Will Ferrell...,Mystery Adventure Comedy Crime
268,Vice,Adam McKay (director/screenplay); Christian Ba...,Thriller Science Fiction Action Adventure
269,On the Basis of Sex,Mimi Leder (director); Daniel Stiepleman (scre...,Drama History
270,Destroyer,"Karyn Kusama (director); Phil Hay, Matt Manfre...",Thriller Crime Drama Action


In [9]:
# function to get director name
def get_director_name(x):
    if " (director)" in x:
        # splitting using keywords given in brackets in Cast and crew
        return x.split(" (director)")[0]
    elif " (directors)" in x:
        return x.split(" (directors)")[0]
    else:
        return x.split(" (director/screenplay)")[0]

In [42]:
# function to get actor1 name
def get_actor1_name(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 1:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[0])

In [11]:
# function to get actor2 name
def get_actor2_name(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 2:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[1])

In [12]:
def get_actor3_name(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 3:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[2])

In [13]:
# creating respective columns using lambda function
df_2018['director_name'] = df_2018['Cast and crew'].map(lambda x: get_director_name(x))

df_2018['actor1_name'] = df_2018['Cast and crew'].map(lambda x: get_actor1_name(x))

df_2018['actor2_name'] = df_2018['Cast and crew'].map(lambda x: get_actor2_name(x))

df_2018['actor3_name'] = df_2018['Cast and crew'].map(lambda x: get_actor3_name(x))


#printing the data
df_2018

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2018['director_name'] = df_2018['Cast and crew'].map(lambda x: get_director_name(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2018['actor1_name'] = df_2018['Cast and crew'].map(lambda x: get_actor1_name(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2018['actor2_name'] = df_2018[

Unnamed: 0,Title,Cast and crew,genres,director_name,actor1_name,actor2_name,actor3_name
0,Insidious: The Last Key,Adam Robitel (director); Leigh Whannell (scree...,Horror Mystery Thriller,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell
1,The Strange Ones,Lauren Wolkstein (director); Christopher Radcl...,Thriller Drama,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson,Emily Althaus
2,Stratton,"Simon West (director); Duncan Falconer, Warren...",Action Thriller,Simon West,Dominic Cooper,Austin Stowell,Gemma Chan
3,Sweet Country,"Warwick Thornton (director); David Tranter, St...",Drama History Western,Warwick Thornton,Bryan Brown,Sam Neill,
4,The Commuter,Jaume Collet-Serra (director); Byron Willinger...,Action Thriller Mystery,Jaume Collet-Serra,Liam Neeson,Vera Farmiga,Patrick Wilson
...,...,...,...,...,...,...,...
267,Holmes & Watson,Etan Cohen (director/screenplay); Will Ferrell...,Mystery Adventure Comedy Crime,Etan Cohen,Will Ferrell,John C. Reilly,Rebecca Hall
268,Vice,Adam McKay (director/screenplay); Christian Ba...,Thriller Science Fiction Action Adventure,Adam McKay,Christian Bale,Amy Adams,Steve Carell
269,On the Basis of Sex,Mimi Leder (director); Daniel Stiepleman (scre...,Drama History,Mimi Leder,Felicity Jones,Armie Hammer,Justin Theroux
270,Destroyer,"Karyn Kusama (director); Phil Hay, Matt Manfre...",Thriller Crime Drama Action,Karyn Kusama,Nicole Kidman,Sebastian Stan,Toby Kebbell


In [14]:
# renaming Title of df_2018
df_2018 = df_2018.rename(columns={'Title':'title'})

# new data df18
df18 = df_2018.loc[:,['title','genres','director_name','actor1_name','actor2_name','actor3_name']]
# printing the data
df18

Unnamed: 0,title,genres,director_name,actor1_name,actor2_name,actor3_name
0,Insidious: The Last Key,Horror Mystery Thriller,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell
1,The Strange Ones,Thriller Drama,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson,Emily Althaus
2,Stratton,Action Thriller,Simon West,Dominic Cooper,Austin Stowell,Gemma Chan
3,Sweet Country,Drama History Western,Warwick Thornton,Bryan Brown,Sam Neill,
4,The Commuter,Action Thriller Mystery,Jaume Collet-Serra,Liam Neeson,Vera Farmiga,Patrick Wilson
...,...,...,...,...,...,...
267,Holmes & Watson,Mystery Adventure Comedy Crime,Etan Cohen,Will Ferrell,John C. Reilly,Rebecca Hall
268,Vice,Thriller Science Fiction Action Adventure,Adam McKay,Christian Bale,Amy Adams,Steve Carell
269,On the Basis of Sex,Drama History,Mimi Leder,Felicity Jones,Armie Hammer,Justin Theroux
270,Destroyer,Thriller Crime Drama Action,Karyn Kusama,Nicole Kidman,Sebastian Stan,Toby Kebbell


In [15]:
# if names of actor2 and actor3 are unavailable, replace nan with 'Unknown'
df18['actor2_name'] = df18['actor2_name'].replace(np.nan, 'Unknown')
df18['actor3_name'] = df18['actor3_name'].replace(np.nan, 'Unknown')

In [16]:
# converting movie title in lowercase
df18['title'] = df18['title'].str.lower()

In [17]:
# creating a column with combined information
df18['comb'] = df18['director_name'] + ' ' + df18['actor1_name'] + ' ' + df18['actor2_name'] + ' ' + df18['actor3_name'] + ' ' + df18['genres']

In [18]:
# printing 2018 movies data df18
df18

Unnamed: 0,title,genres,director_name,actor1_name,actor2_name,actor3_name,comb
0,insidious: the last key,Horror Mystery Thriller,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell,Adam Robitel Lin Shaye Angus Sampson Leigh Wha...
1,the strange ones,Thriller Drama,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson,Emily Althaus,Lauren Wolkstein Alex Pettyfer James Freedson-...
2,stratton,Action Thriller,Simon West,Dominic Cooper,Austin Stowell,Gemma Chan,Simon West Dominic Cooper Austin Stowell Gemma...
3,sweet country,Drama History Western,Warwick Thornton,Bryan Brown,Sam Neill,Unknown,Warwick Thornton Bryan Brown Sam Neill Unknown...
4,the commuter,Action Thriller Mystery,Jaume Collet-Serra,Liam Neeson,Vera Farmiga,Patrick Wilson,Jaume Collet-Serra Liam Neeson Vera Farmiga Pa...
...,...,...,...,...,...,...,...
267,holmes & watson,Mystery Adventure Comedy Crime,Etan Cohen,Will Ferrell,John C. Reilly,Rebecca Hall,Etan Cohen Will Ferrell John C. Reilly Rebecca...
268,vice,Thriller Science Fiction Action Adventure,Adam McKay,Christian Bale,Amy Adams,Steve Carell,Adam McKay Christian Bale Amy Adams Steve Care...
269,on the basis of sex,Drama History,Mimi Leder,Felicity Jones,Armie Hammer,Justin Theroux,Mimi Leder Felicity Jones Armie Hammer Justin ...
270,destroyer,Thriller Crime Drama Action,Karyn Kusama,Nicole Kidman,Sebastian Stan,Toby Kebbell,Karyn Kusama Nicole Kidman Sebastian Stan Toby...


## Creating 2019 movie dataset from Wikipedia

In [19]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2019"
df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]

In [20]:
# combining the lists, ignoring the index values
df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)

# printing the data
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,JANUARY,4,Escape Room,Columbia Pictures / Original Film,"Adam Robitel (director); Bragi F. Schut, Maria...",[2]
1,JANUARY,4,Rust Creek,IFC Films,Jen McGowan (director); Julie Lipson (screenpl...,[3]
2,JANUARY,4,American Hangman,Hangman Justice Productions,Wilson Coneybeare (director/screenplay); Donal...,[4]
3,JANUARY,11,A Dog's Way Home,Columbia Pictures,Charles Martin Smith (director); W. Bruce Came...,[5]
4,JANUARY,11,The Upside,STX Entertainment,Neil Burger (director); Jon Hartmere (screenpl...,[6]
...,...,...,...,...,...,...
237,DECEMBER,25,Spies in Disguise,20th Century Fox / Blue Sky Studios / Chernin ...,"Nick Bruno, Troy Quane (directors); Brad Copel...",[132]
238,DECEMBER,25,Little Women,Columbia Pictures / Regency Enterprises,Greta Gerwig (director/screenplay); Saoirse Ro...,[221]
239,DECEMBER,25,1917,Universal Pictures / DreamWorks Pictures,Sam Mendes (director/screenplay); Krysty Wilso...,[222]
240,DECEMBER,25,Just Mercy,Warner Bros. Pictures / Participant Media,"Destin Daniel Cretton (director/screenplay), A...",[223]


In [21]:
# creating 'genres' column
df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))

# creating new data using certain columns of df
df_2019 = df[['Title','Cast and crew','genres']]

# printing the data
df_2019

Unnamed: 0,Title,Cast and crew,genres
0,Escape Room,"Adam Robitel (director); Bragi F. Schut, Maria...",Horror Thriller Mystery
1,Rust Creek,Jen McGowan (director); Julie Lipson (screenpl...,Thriller Drama
2,American Hangman,Wilson Coneybeare (director/screenplay); Donal...,Thriller
3,A Dog's Way Home,Charles Martin Smith (director); W. Bruce Came...,Drama Adventure Family
4,The Upside,Neil Burger (director); Jon Hartmere (screenpl...,Comedy Drama
...,...,...,...
237,Spies in Disguise,"Nick Bruno, Troy Quane (directors); Brad Copel...",Animation Action Adventure Comedy Family
238,Little Women,Greta Gerwig (director/screenplay); Saoirse Ro...,Drama Romance
239,1917,Sam Mendes (director/screenplay); Krysty Wilso...,War Drama Action Thriller
240,Just Mercy,"Destin Daniel Cretton (director/screenplay), A...",Drama Crime History


In [22]:
# creating respective columns using lambda function
df_2019['director_name'] = df_2019['Cast and crew'].map(lambda x: get_director_name(str(x)))

df_2019['actor1_name'] = df_2019['Cast and crew'].map(lambda x: get_actor1_name(x))

df_2019['actor2_name'] = df_2019['Cast and crew'].map(lambda x: get_actor2_name(x))

df_2019['actor3_name'] = df_2019['Cast and crew'].map(lambda x: get_actor3_name(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2019['director_name'] = df_2019['Cast and crew'].map(lambda x: get_director_name(str(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2019['actor1_name'] = df_2019['Cast and crew'].map(lambda x: get_actor1_name(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2019['actor2_name'] = df_

In [23]:
# renaming Title of df_2019
df_2019 = df_2019.rename(columns={'Title':'title'})

In [24]:
# new data df19
df19 = df_2019.loc[:,['title','genres','director_name','actor1_name','actor2_name','actor3_name']]

# printing the data
df19

Unnamed: 0,title,genres,director_name,actor1_name,actor2_name,actor3_name
0,Escape Room,Horror Thriller Mystery,Adam Robitel,Taylor Russell,Logan Miller,Deborah Ann Woll
1,Rust Creek,Thriller Drama,Jen McGowan,Hermione Corfield,Jay Paulson,Sean O'Bryan
2,American Hangman,Thriller,Wilson Coneybeare,Donald Sutherland,Vincent Kartheiser,Oliver Dennis
3,A Dog's Way Home,Drama Adventure Family,Charles Martin Smith,Bryce Dallas Howard,Edward James Olmos,Alexandra Shipp
4,The Upside,Comedy Drama,Neil Burger,Bryan Cranston,Kevin Hart,Nicole Kidman
...,...,...,...,...,...,...
237,Spies in Disguise,Animation Action Adventure Comedy Family,"Nick Bruno, Troy Quane",Will Smith,Tom Holland,Rashida Jones
238,Little Women,Drama Romance,Greta Gerwig,Saoirse Ronan,Emma Watson,Florence Pugh
239,1917,War Drama Action Thriller,Sam Mendes,George MacKay,Dean-Charles Chapman,Mark Strong
240,Just Mercy,Drama Crime History,Destin Daniel Cretton,Michael B. Jordan,Jamie Foxx,Brie Larson


In [25]:
# if names of actor2 and actor3 are unavailable, replace nan with 'Unknown'
df19['actor2_name'] = df19['actor2_name'].replace(np.nan, 'Unknown')
df19['actor3_name'] = df19['actor3_name'].replace(np.nan, 'Unknown')

In [26]:
# converting movie title in lowercase
df19['title'] = df19['title'].str.lower()

In [27]:
# creating a column with combined information
df19['comb'] = df19['director_name'] + ' ' + df19['actor1_name'] + ' ' + df19['actor2_name'] + ' ' + df19['actor3_name'] + ' ' + df19['genres']

In [28]:
# printing 2019 movies data df19
df19

Unnamed: 0,title,genres,director_name,actor1_name,actor2_name,actor3_name,comb
0,escape room,Horror Thriller Mystery,Adam Robitel,Taylor Russell,Logan Miller,Deborah Ann Woll,Adam Robitel Taylor Russell Logan Miller Debor...
1,rust creek,Thriller Drama,Jen McGowan,Hermione Corfield,Jay Paulson,Sean O'Bryan,Jen McGowan Hermione Corfield Jay Paulson Sean...
2,american hangman,Thriller,Wilson Coneybeare,Donald Sutherland,Vincent Kartheiser,Oliver Dennis,Wilson Coneybeare Donald Sutherland Vincent Ka...
3,a dog's way home,Drama Adventure Family,Charles Martin Smith,Bryce Dallas Howard,Edward James Olmos,Alexandra Shipp,Charles Martin Smith Bryce Dallas Howard Edwar...
4,the upside,Comedy Drama,Neil Burger,Bryan Cranston,Kevin Hart,Nicole Kidman,Neil Burger Bryan Cranston Kevin Hart Nicole K...
...,...,...,...,...,...,...,...
237,spies in disguise,Animation Action Adventure Comedy Family,"Nick Bruno, Troy Quane",Will Smith,Tom Holland,Rashida Jones,"Nick Bruno, Troy Quane Will Smith Tom Holland ..."
238,little women,Drama Romance,Greta Gerwig,Saoirse Ronan,Emma Watson,Florence Pugh,Greta Gerwig Saoirse Ronan Emma Watson Florenc...
239,1917,War Drama Action Thriller,Sam Mendes,George MacKay,Dean-Charles Chapman,Mark Strong,Sam Mendes George MacKay Dean-Charles Chapman ...
240,just mercy,Drama Crime History,Destin Daniel Cretton,Michael B. Jordan,Jamie Foxx,Brie Larson,Destin Daniel Cretton Michael B. Jordan Jamie ...


## Creating 2020 movie dataset from Wikipedia

In [29]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2020"
df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]

In [30]:
# combining the lists, ignoring the index values
df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)

# printing the data
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.,Ref.
0,JANUARY,3,The Grudge,Screen Gems / Stage 6 Films / Ghost House Pict...,Nicolas Pesce (director/screenplay); Andrea Ri...,[2],
1,JANUARY,10,Underwater,20th Century Fox / TSG Entertainment / Chernin...,"William Eubank (director); Brian Duffield, Ada...",[3],
2,JANUARY,10,Like a Boss,Paramount Pictures,"Miguel Arteta (director); Sam Pitman, Adam Col...",[4],
3,JANUARY,10,Three Christs,IFC Films,Jon Avnet (director/screenplay); Eric Nazarian...,,
4,JANUARY,10,Inherit the Viper,Barry Films / Tycor International Film Company,Anthony Jerjen (director); Andrew Crabtree (sc...,[5],
...,...,...,...,...,...,...,...
270,DECEMBER,25,We Can Be Heroes,Netflix / Troublemaker Studios,Robert Rodriguez (director/screenplay); Priyan...,,[241]
271,DECEMBER,25,News of the World,Universal Pictures / Perfect World Pictures,Paul Greengrass (director/screenplay); Luke Da...,,[242]
272,DECEMBER,25,One Night in Miami...,Amazon Studios,Regina King (director); Kemp Powers (screenpla...,,[243]
273,DECEMBER,25,Promising Young Woman,Focus Features / FilmNation Entertainment,Emerald Fennell (director/screenplay); Carey M...,,[244]


In [31]:
# creating 'genres' column
df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))

# creating new data using certain columns of df
df_2020 = df[['Title','Cast and crew','genres']]

# printing the data
df_2020

Unnamed: 0,Title,Cast and crew,genres
0,The Grudge,Nicolas Pesce (director/screenplay); Andrea Ri...,Horror Mystery Thriller
1,Underwater,"William Eubank (director); Brian Duffield, Ada...",Action Horror Science Fiction Thriller
2,Like a Boss,"Miguel Arteta (director); Sam Pitman, Adam Col...",Comedy
3,Three Christs,Jon Avnet (director/screenplay); Eric Nazarian...,Drama
4,Inherit the Viper,Anthony Jerjen (director); Andrew Crabtree (sc...,Crime Thriller Drama
...,...,...,...
270,We Can Be Heroes,Robert Rodriguez (director/screenplay); Priyan...,Action Fantasy Family Comedy
271,News of the World,Paul Greengrass (director/screenplay); Luke Da...,Drama Western Adventure
272,One Night in Miami...,Regina King (director); Kemp Powers (screenpla...,Drama
273,Promising Young Woman,Emerald Fennell (director/screenplay); Carey M...,Thriller Crime Drama


In [32]:
# creating respective columns using lambda function
df_2020['director_name'] = df_2020['Cast and crew'].map(lambda x: get_director_name(str(x)))

df_2020['actor1_name'] = df_2020['Cast and crew'].map(lambda x: get_actor1_name(x))

df_2020['actor2_name'] = df_2020['Cast and crew'].map(lambda x: get_actor2_name(x))

df_2020['actor3_name'] = df_2020['Cast and crew'].map(lambda x: get_actor3_name(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2020['director_name'] = df_2020['Cast and crew'].map(lambda x: get_director_name(str(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2020['actor1_name'] = df_2020['Cast and crew'].map(lambda x: get_actor1_name(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2020['actor2_name'] = df_

In [33]:
# renaming Title of df_2020
df_2020 = df_2020.rename(columns={'Title':'title'})

# new data df20
df20 = df_2020.loc[:,['title','genres','director_name','actor1_name','actor2_name','actor3_name']]

# printing the data
df20

Unnamed: 0,title,genres,director_name,actor1_name,actor2_name,actor3_name
0,The Grudge,Horror Mystery Thriller,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho
1,Underwater,Action Horror Science Fiction Thriller,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick
2,Like a Boss,Comedy,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek
3,Three Christs,Drama,Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins
4,Inherit the Viper,Crime Thriller Drama,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs
...,...,...,...,...,...,...
270,We Can Be Heroes,Action Fantasy Family Comedy,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin
271,News of the World,Drama Western Adventure,Paul Greengrass,Tom Hanks,Helena Zengel,
272,One Night in Miami...,Drama,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge
273,Promising Young Woman,Thriller Crime Drama,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie


In [34]:
# if names of actor2 and actor3 are unavailable, replace nan with 'Unknown'
df20['actor2_name'] = df20['actor2_name'].replace(np.nan, 'Unknown')
df20['actor3_name'] = df20['actor3_name'].replace(np.nan, 'Unknown')

In [35]:
# converting movie title in lowercase
df20['title'] = df20['title'].str.lower()

In [36]:
# creating a column with combined information
df20['comb'] = df20['director_name'] + ' ' + df20['actor1_name'] + ' ' + df20['actor2_name'] + ' ' + df20['actor3_name'] + ' ' + df20['genres']

In [37]:
# printing 2020 movies data df20
df20

Unnamed: 0,title,genres,director_name,actor1_name,actor2_name,actor3_name,comb
0,the grudge,Horror Mystery Thriller,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho,Nicolas Pesce Andrea Riseborough Demián Bichir...
1,underwater,Action Horror Science Fiction Thriller,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick,William Eubank Kristen Stewart Vincent Cassel ...
2,like a boss,Comedy,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek,Miguel Arteta Tiffany Haddish Rose Byrne Salma...
3,three christs,Drama,Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins,Jon Avnet Richard Gere Peter Dinklage Walton G...
4,inherit the viper,Crime Thriller Drama,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs,Anthony Jerjen Josh Hartnett Margarita Levieva...
...,...,...,...,...,...,...,...
270,we can be heroes,Action Fantasy Family Comedy,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin,Robert Rodriguez Priyanka Chopra Jonas Pedro P...
271,news of the world,Drama Western Adventure,Paul Greengrass,Tom Hanks,Helena Zengel,Unknown,Paul Greengrass Tom Hanks Helena Zengel Unknow...
272,one night in miami...,Drama,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge,Regina King Kingsley Ben-Adir Eli Goree Aldis ...
273,promising young woman,Thriller Crime Drama,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie,Emerald Fennell Carey Mulligan Bo Burnham Alis...


## Creating 2021 movie dataset from Wikipedia

In [38]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2021"
df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]

In [39]:
# combining the lists, ignoring the index values
df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)

# printing the data
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.,Ref.
0,JANUARY,1.0,Shadow in the Cloud,Vertical Entertainment,Roseanne Liang (director/screenplay); Max Land...,[2],
1,JANUARY,13.0,The White Tiger,Netflix,Ramin Bahrani (director/screenplay); Adarsh Go...,,
2,JANUARY,14.0,Locked Down,HBO Max / Warner Bros. Pictures,Doug Liman (director); Steven Knight (screenpl...,[3],
3,JANUARY,15.0,The Dig,Netflix / Clerkenwell Films,Simon Stone (director); Moira Buffini (screenp...,[4],
4,JANUARY,15.0,Outside the Wire,Netflix,"Mikael Håfström (director); Rob Yescombe, Rowa...",[5],
...,...,...,...,...,...,...,...
353,DECEMBER,25.0,The Tragedy of Macbeth,Apple TV+ / A24 / IAC Films,Joel Coen (director/screenplay); Denzel Washin...,,[271]
354,DECEMBER,25.0,A Journal for Jordan,Columbia Pictures / Escape Artists / Bron Studios,Denzel Washington (director); Virgil Williams ...,,[272]
355,DECEMBER,25.0,American Underdog,Lionsgate,"Erwin brothers (directors); Jon Erwin, David A...",,[273]
356,DECEMBER,26.0,Memoria,Neon,Apichatpong Weerasethakul (director/acreenplay...,,[274]


In [40]:
# creating 'genres' column
df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))

# creating new data using certain columns of df
df_2021 = df[['Title','Cast and crew','genres']]

# printing the data
df_2021

Unnamed: 0,Title,Cast and crew,genres
0,Shadow in the Cloud,Roseanne Liang (director/screenplay); Max Land...,Horror Action War
1,The White Tiger,Ramin Bahrani (director/screenplay); Adarsh Go...,Drama
2,Locked Down,Doug Liman (director); Steven Knight (screenpl...,Comedy Crime Drama
3,The Dig,Simon Stone (director); Moira Buffini (screenp...,Drama History
4,Outside the Wire,"Mikael Håfström (director); Rob Yescombe, Rowa...",Thriller Action Science Fiction
...,...,...,...
353,The Tragedy of Macbeth,Joel Coen (director/screenplay); Denzel Washin...,Drama War
354,A Journal for Jordan,Denzel Washington (director); Virgil Williams ...,Drama Romance
355,American Underdog,"Erwin brothers (directors); Jon Erwin, David A...",Drama
356,Memoria,Apichatpong Weerasethakul (director/acreenplay...,Drama Fantasy Mystery


In [44]:
# fillna() method to impute (to replace) null values with something default. 
# here all null or NaN's can be replaced with empty string " "
df_2021['Cast and crew'] = df_2021['Cast and crew'].fillna(" ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021['Cast and crew'] = df_2021['Cast and crew'].fillna(" ")


In [45]:
# creating respective columns using lambda function
df_2021['director_name'] = df_2021['Cast and crew'].map(lambda x: get_director_name(str(x)))

df_2021['actor1_name'] = df_2021['Cast and crew'].map(lambda x: get_actor1_name(x))

df_2021['actor2_name'] = df_2021['Cast and crew'].map(lambda x: get_actor2_name(x))

df_2021['actor3_name'] = df_2021['Cast and crew'].map(lambda x: get_actor3_name(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021['director_name'] = df_2021['Cast and crew'].map(lambda x: get_director_name(str(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021['actor1_name'] = df_2021['Cast and crew'].map(lambda x: get_actor1_name(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021['actor2_name'] = df_

In [46]:
# renaming Title of df_2021
df_2021 = df_2021.rename(columns={'Title':'title'})

# new data df21
df21 = df_2021.loc[:,['title','genres','director_name','actor1_name','actor2_name','actor3_name']]

# printing the data
df21

Unnamed: 0,title,genres,director_name,actor1_name,actor2_name,actor3_name
0,Shadow in the Cloud,Horror Action War,Roseanne Liang,Chloë Grace Moretz,Taylor John Smith,Beulah Koale
1,The White Tiger,Drama,Ramin Bahrani,Adarsh Gourav,Rajkummar Rao,Priyanka Chopra Jonas
2,Locked Down,Comedy Crime Drama,Doug Liman,Anne Hathaway,Chiwetel Ejiofor,Stephen Merchant
3,The Dig,Drama History,Simon Stone,Carey Mulligan,Ralph Fiennes,Lily James
4,Outside the Wire,Thriller Action Science Fiction,Mikael Håfström,Anthony Mackie,Damson Idris,Emily Beecham
...,...,...,...,...,...,...
353,The Tragedy of Macbeth,Drama War,Joel Coen,Denzel Washington,Frances McDormand,Bertie Carvel
354,A Journal for Jordan,Drama Romance,Denzel Washington,Michael B. Jordan,Chanté Adams,Jalon Christian
355,American Underdog,Drama,Erwin brothers,Zachary Levi,Anna Paquin,Dennis Quaid
356,Memoria,Drama Fantasy Mystery,Apichatpong Weerasethakul (director/acreenplay...,Apichatpong Weerasethakul (director/acreenplay...,Elkin Díaz,Jeanne Balibar


In [47]:
# if names of actor2 and actor3 are unavailable, replace nan with 'Unknown'
df21['actor2_name'] = df21['actor2_name'].replace(np.nan, 'Unknown')
df21['actor3_name'] = df21['actor3_name'].replace(np.nan, 'Unknown')

In [48]:
# converting movie title in lowercase
df21['title'] = df21['title'].str.lower()

In [49]:
# creating a column with combined information
df21['comb'] = df21['director_name'] + ' ' + df21['actor1_name'] + ' ' + df21['actor2_name'] + ' ' + df21['actor3_name'] + ' ' + df21['genres']

In [50]:
# printing 2021 movies data df21
df21

Unnamed: 0,title,genres,director_name,actor1_name,actor2_name,actor3_name,comb
0,shadow in the cloud,Horror Action War,Roseanne Liang,Chloë Grace Moretz,Taylor John Smith,Beulah Koale,Roseanne Liang Chloë Grace Moretz Taylor John ...
1,the white tiger,Drama,Ramin Bahrani,Adarsh Gourav,Rajkummar Rao,Priyanka Chopra Jonas,Ramin Bahrani Adarsh Gourav Rajkummar Rao Priy...
2,locked down,Comedy Crime Drama,Doug Liman,Anne Hathaway,Chiwetel Ejiofor,Stephen Merchant,Doug Liman Anne Hathaway Chiwetel Ejiofor Step...
3,the dig,Drama History,Simon Stone,Carey Mulligan,Ralph Fiennes,Lily James,Simon Stone Carey Mulligan Ralph Fiennes Lily ...
4,outside the wire,Thriller Action Science Fiction,Mikael Håfström,Anthony Mackie,Damson Idris,Emily Beecham,Mikael Håfström Anthony Mackie Damson Idris Em...
...,...,...,...,...,...,...,...
353,the tragedy of macbeth,Drama War,Joel Coen,Denzel Washington,Frances McDormand,Bertie Carvel,Joel Coen Denzel Washington Frances McDormand ...
354,a journal for jordan,Drama Romance,Denzel Washington,Michael B. Jordan,Chanté Adams,Jalon Christian,Denzel Washington Michael B. Jordan Chanté Ada...
355,american underdog,Drama,Erwin brothers,Zachary Levi,Anna Paquin,Dennis Quaid,Erwin brothers Zachary Levi Anna Paquin Dennis...
356,memoria,Drama Fantasy Mystery,Apichatpong Weerasethakul (director/acreenplay...,Apichatpong Weerasethakul (director/acreenplay...,Elkin Díaz,Jeanne Balibar,Apichatpong Weerasethakul (director/acreenplay...


## Creating 2022 movie dataset from Wikipedia

In [51]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2022"
df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]

In [52]:
# combining the lists, ignoring the index values
df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)

# printing the data
df

Unnamed: 0,Rank,Title,Distributor,Domestic Gross,Opening,Opening.1,Production company,Cast and crew,.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.,Ref.
0,1.0,The Batman*,Warner Bros.,"$369,309,462",,,,,,
1,2.0,Doctor Strange in the Multiverse of Madness*,Disney,"$351,992,340",,,,,,
2,3.0,Sonic the Hedgehog 2*,Paramount,"$181,950,102",,,,,,
3,4.0,Uncharted,Sony,"$147,726,420",,,,,,
4,5.0,The Lost City*,Paramount,"$99,674,006",,,,,,
...,...,...,...,...,...,...,...,...,...,...
174,,"Are You There God? It's Me, Margaret",,,SEPTEMBER,16.0,Lionsgate / Gracie Films,Kelly Fremon Craig (director/screenplay); Rach...,,[143]
175,,Distant,,,SEPTEMBER,16.0,Universal Pictures / DreamWorks Pictures / Amb...,Josh Gordon & Will Speck (directors); Spenser ...,,[144]
176,,Don't Worry Darling,,,SEPTEMBER,23.0,Warner Bros. Pictures / New Line Cinema,Olivia Wilde (director); Katie Silberman (scre...,,[145]
177,,Bros,,,SEPTEMBER,30.0,Universal Pictures / Apatow Productions,Nicholas Stoller (director/screenplay); Billy ...,,[146]


In [53]:
# creating 'genres' column
df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))

# creating new data using certain columns of df
df_2022 = df[['Title','Cast and crew','genres']]

# printing the data
df_2022

Unnamed: 0,Title,Cast and crew,genres
0,The Batman*,,Crime Mystery Thriller
1,Doctor Strange in the Multiverse of Madness*,,Fantasy Action Adventure
2,Sonic the Hedgehog 2*,,Action Fantasy Comedy Family Adventure Science...
3,Uncharted,,Action Adventure
4,The Lost City*,,Action Adventure Comedy Romance
...,...,...,...
174,"Are You There God? It's Me, Margaret",Kelly Fremon Craig (director/screenplay); Rach...,Drama Comedy
175,Distant,Josh Gordon & Will Speck (directors); Spenser ...,Science Fiction Comedy Romance
176,Don't Worry Darling,Olivia Wilde (director); Katie Silberman (scre...,Drama Mystery Thriller
177,Bros,Nicholas Stoller (director/screenplay); Billy ...,Adventure Comedy Family Science Fiction


In [55]:
# fillna() method to impute (to replace) null values with something default. 
# here all null or NaN's can be replaced with empty string " "
df_2022['Cast and crew'] = df_2022['Cast and crew'].fillna(" ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022['Cast and crew'] = df_2022['Cast and crew'].fillna(" ")


In [56]:
# creating respective columns using lambda function
df_2022['director_name'] = df_2022['Cast and crew'].map(lambda x: get_director_name(str(x)))

df_2022['actor1_name'] = df_2022['Cast and crew'].map(lambda x: get_actor1_name(x))

df_2022['actor2_name'] = df_2022['Cast and crew'].map(lambda x: get_actor2_name(x))

df_2022['actor3_name'] = df_2022['Cast and crew'].map(lambda x: get_actor3_name(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022['director_name'] = df_2022['Cast and crew'].map(lambda x: get_director_name(str(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022['actor1_name'] = df_2022['Cast and crew'].map(lambda x: get_actor1_name(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022['actor2_name'] = df_

In [57]:
# renaming Title of df_2022
df_2022 = df_2022.rename(columns={'Title':'title'})

# new data df22
df22 = df_2022.loc[:,['title','genres','director_name','actor1_name','actor2_name','actor3_name']]

# printing the data
df22

Unnamed: 0,title,genres,director_name,actor1_name,actor2_name,actor3_name
0,The Batman*,Crime Mystery Thriller,,,,
1,Doctor Strange in the Multiverse of Madness*,Fantasy Action Adventure,,,,
2,Sonic the Hedgehog 2*,Action Fantasy Comedy Family Adventure Science...,,,,
3,Uncharted,Action Adventure,,,,
4,The Lost City*,Action Adventure Comedy Romance,,,,
...,...,...,...,...,...,...
174,"Are You There God? It's Me, Margaret",Drama Comedy,Kelly Fremon Craig,Rachel McAdams,Abby Ryder Fortson,Benny Safdie
175,Distant,Science Fiction Comedy Romance,Josh Gordon & Will Speck,Anthony Ramos,Naomi Scott,Zachary Quinto
176,Don't Worry Darling,Drama Mystery Thriller,Olivia Wilde,Florence Pugh,Harry Styles,Olivia Wilde
177,Bros,Adventure Comedy Family Science Fiction,Nicholas Stoller,Billy Eichner,Luke Macfarlane,Ts Madison


In [58]:
# if names of actor2 and actor3 are unavailable, replace nan with 'Unknown'
df22['actor2_name'] = df22['actor2_name'].replace(np.nan, 'Unknown')
df22['actor3_name'] = df22['actor3_name'].replace(np.nan, 'Unknown')

In [59]:
# converting movie title in lowercase
df22['title'] = df22['title'].str.lower()

In [60]:
# creating a column with combined information
df22['comb'] = df22['director_name'] + ' ' + df22['actor1_name'] + ' ' + df22['actor2_name'] + ' ' + df22['actor3_name'] + ' ' + df22['genres']

In [61]:
# printing 2022 movies data df22
df22

Unnamed: 0,title,genres,director_name,actor1_name,actor2_name,actor3_name,comb
0,the batman*,Crime Mystery Thriller,,,Unknown,Unknown,Unknown Unknown Crime Mystery Thriller
1,doctor strange in the multiverse of madness*,Fantasy Action Adventure,,,Unknown,Unknown,Unknown Unknown Fantasy Action Adventure
2,sonic the hedgehog 2*,Action Fantasy Comedy Family Adventure Science...,,,Unknown,Unknown,Unknown Unknown Action Fantasy Comedy Fami...
3,uncharted,Action Adventure,,,Unknown,Unknown,Unknown Unknown Action Adventure
4,the lost city*,Action Adventure Comedy Romance,,,Unknown,Unknown,Unknown Unknown Action Adventure Comedy Ro...
...,...,...,...,...,...,...,...
174,"are you there god? it's me, margaret",Drama Comedy,Kelly Fremon Craig,Rachel McAdams,Abby Ryder Fortson,Benny Safdie,Kelly Fremon Craig Rachel McAdams Abby Ryder F...
175,distant,Science Fiction Comedy Romance,Josh Gordon & Will Speck,Anthony Ramos,Naomi Scott,Zachary Quinto,Josh Gordon & Will Speck Anthony Ramos Naomi S...
176,don't worry darling,Drama Mystery Thriller,Olivia Wilde,Florence Pugh,Harry Styles,Olivia Wilde,Olivia Wilde Florence Pugh Harry Styles Olivia...
177,bros,Adventure Comedy Family Science Fiction,Nicholas Stoller,Billy Eichner,Luke Macfarlane,Ts Madison,Nicholas Stoller Billy Eichner Luke Macfarlane...


### Combining all the datasets 2018-2022

In [66]:
newdf = df18.append(df19,ignore_index = True)
newdf = newdf.append(df20, ignore_index = True)
newdf = newdf.append(df21, ignore_index = True)
newdf = newdf.append(df22, ignore_index = True)

# displaying the data
newdf

Unnamed: 0,title,genres,director_name,actor1_name,actor2_name,actor3_name,comb
0,insidious: the last key,Horror Mystery Thriller,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell,Adam Robitel Lin Shaye Angus Sampson Leigh Wha...
1,the strange ones,Thriller Drama,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson,Emily Althaus,Lauren Wolkstein Alex Pettyfer James Freedson-...
2,stratton,Action Thriller,Simon West,Dominic Cooper,Austin Stowell,Gemma Chan,Simon West Dominic Cooper Austin Stowell Gemma...
3,sweet country,Drama History Western,Warwick Thornton,Bryan Brown,Sam Neill,Unknown,Warwick Thornton Bryan Brown Sam Neill Unknown...
4,the commuter,Action Thriller Mystery,Jaume Collet-Serra,Liam Neeson,Vera Farmiga,Patrick Wilson,Jaume Collet-Serra Liam Neeson Vera Farmiga Pa...
...,...,...,...,...,...,...,...
1321,"are you there god? it's me, margaret",Drama Comedy,Kelly Fremon Craig,Rachel McAdams,Abby Ryder Fortson,Benny Safdie,Kelly Fremon Craig Rachel McAdams Abby Ryder F...
1322,distant,Science Fiction Comedy Romance,Josh Gordon & Will Speck,Anthony Ramos,Naomi Scott,Zachary Quinto,Josh Gordon & Will Speck Anthony Ramos Naomi S...
1323,don't worry darling,Drama Mystery Thriller,Olivia Wilde,Florence Pugh,Harry Styles,Olivia Wilde,Olivia Wilde Florence Pugh Harry Styles Olivia...
1324,bros,Adventure Comedy Family Science Fiction,Nicholas Stoller,Billy Eichner,Luke Macfarlane,Ts Madison,Nicholas Stoller Billy Eichner Luke Macfarlane...


In [96]:
# reading the previous dataset that contains movies till 2017
old_df = pd.read_csv(r'C:\Users\Administrator\Movie Recommendation System\movies_till2017.csv')

# displaying the data
old_df

Unnamed: 0,title,genres,director_name,actor1_name,actor2_name,actor3_name,comb
0,pirates of the caribbean: dead men tell no tales,Adventure Action Fantasy Comedy,Joachim Rønning Espen Sandberg,Johnny Depp,Javier Bardem,Geoffrey Rush,Joachim Rønning Espen Sandberg Johnny Depp Jav...
1,justice league,Action Adventure Fantasy Sci-Fi,Zack Snyder,Ben Affleck,Henry Cavill,Gal Gadot,Zack Snyder Ben Affleck Henry Cavill Gal Gadot...
2,thor: ragnarok,Action Adventure Fantasy Sci-Fi,Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Taika Waititi Chris Hemsworth Tom Hiddleston C...
3,guardians of the galaxy vol. 2,Action Adventure Comedy Sci-Fi,James Gunn,Chris Pratt,Zoe Saldana,Dave Bautista,James Gunn Chris Pratt Zoe Saldana Dave Bautis...
4,the king's daughter,Fantasy Action Adventure,Sean McNamara,Pierce Brosnan,William Hurt,Benjamin Walker,Sean McNamara Pierce Brosnan William Hurt Benj...
...,...,...,...,...,...,...,...
5496,signed sealed delivered,Comedy Drama,Scott Smith,Eric Mabius,Daphne Zuniga,Crystal Lowe,Scott Smith Eric Mabius Daphne Zuniga Crystal ...
5497,the following,Crime Drama Mystery Thriller,Unknown,Natalie Zea,Valorie Curry,Sam Underwood,Unknown Natalie Zea Valorie Curry Sam Underwoo...
5498,a plague so pleasant,Drama Horror Thriller,Benjamin Roberds,Eva Boehnke,Maxwell Moody,David Chandler,Benjamin Roberds Eva Boehnke Maxwell Moody Dav...
5499,shanghai calling,Comedy Drama Romance,Daniel Hsia,Alan Ruck,Daniel Henney,Eliza Coupe,Daniel Hsia Alan Ruck Daniel Henney Eliza Coup...


In [97]:
# adding 2018-2022 data to movies data till 2017
final_df = old_df.append(newdf,ignore_index=True)

# displaying the data
final_df

Unnamed: 0,title,genres,director_name,actor1_name,actor2_name,actor3_name,comb
0,pirates of the caribbean: dead men tell no tales,Adventure Action Fantasy Comedy,Joachim Rønning Espen Sandberg,Johnny Depp,Javier Bardem,Geoffrey Rush,Joachim Rønning Espen Sandberg Johnny Depp Jav...
1,justice league,Action Adventure Fantasy Sci-Fi,Zack Snyder,Ben Affleck,Henry Cavill,Gal Gadot,Zack Snyder Ben Affleck Henry Cavill Gal Gadot...
2,thor: ragnarok,Action Adventure Fantasy Sci-Fi,Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Taika Waititi Chris Hemsworth Tom Hiddleston C...
3,guardians of the galaxy vol. 2,Action Adventure Comedy Sci-Fi,James Gunn,Chris Pratt,Zoe Saldana,Dave Bautista,James Gunn Chris Pratt Zoe Saldana Dave Bautis...
4,the king's daughter,Fantasy Action Adventure,Sean McNamara,Pierce Brosnan,William Hurt,Benjamin Walker,Sean McNamara Pierce Brosnan William Hurt Benj...
...,...,...,...,...,...,...,...
6822,"are you there god? it's me, margaret",Drama Comedy,Kelly Fremon Craig,Rachel McAdams,Abby Ryder Fortson,Benny Safdie,Kelly Fremon Craig Rachel McAdams Abby Ryder F...
6823,distant,Science Fiction Comedy Romance,Josh Gordon & Will Speck,Anthony Ramos,Naomi Scott,Zachary Quinto,Josh Gordon & Will Speck Anthony Ramos Naomi S...
6824,don't worry darling,Drama Mystery Thriller,Olivia Wilde,Florence Pugh,Harry Styles,Olivia Wilde,Olivia Wilde Florence Pugh Harry Styles Olivia...
6825,bros,Adventure Comedy Family Science Fiction,Nicholas Stoller,Billy Eichner,Luke Macfarlane,Ts Madison,Nicholas Stoller Billy Eichner Luke Macfarlane...


In [98]:
# isna() function is used to detect missing values
final_df.isna().sum()

title            2
genres           6
director_name    0
actor1_name      0
actor2_name      0
actor3_name      0
comb             6
dtype: int64

In [99]:
# dropna() method removes the rows that contains NULL values
final_df = final_df.dropna()

# again checking
final_df.isna().sum()

title            0
genres           0
director_name    0
actor1_name      0
actor2_name      0
actor3_name      0
comb             0
dtype: int64

In [101]:
# The to_csv() function is used to write object to a comma-separated values (csv) file.
final_df.to_csv('final_movie_data.csv',index=False)

In [102]:
movie_name = list(final_df["title"].values)

# displaying first 5 contents of movie_name
movie_name[:5]


['pirates of the caribbean: dead men tell no tales',
 'justice league',
 'thor: ragnarok',
 'guardians of the galaxy vol. 2',
 "the king's daughter"]

In [103]:
len(movie_name)

6819

In [104]:
movie_name = list(set(movie_name))

# total number of movies
print(len(movie_name))

6622


In [105]:
movie_data = {"movie_names":[]}

for index, name in enumerate(movie_name):
    movie_data["movie_names"].append({"title":str(name).strip()})

In [106]:
import json

def convert(val):
    # isinstance(val, np.generic) will return True if val is an array scalar object
    if isinstance(val, np.generic):
        return val.item()
    raise TypeError

In [107]:
# json.dumps() function converts a Python object into a json string
movie_names_json = json.dumps(movie_data, default = convert)

In [108]:
# creating json file
with open('movie_names.json', 'w') as outfile:
    json.dump(movie_data, outfile)