In [1]:
import os

import numpy as np
import pandas as pd
import requests

### Function for fetching the movie data from Wikipedia

In [2]:
def get_df(url, table_indices=(2, 3, 4, 5)):
  """Read the HTML tables found at ``url`` and stack selected ones into one frame.

  Parameters
  ----------
  url : str
      Page passed to ``pd.read_html``; the first row of every table is used
      as its header.
  table_indices : sequence of int, optional
      Positions (in page order) of the tables to combine. Defaults to tables
      2 through 5, the positions this function has always used.

  Returns
  -------
  pandas.DataFrame
      The selected tables concatenated in order, with a fresh 0..n-1 index.

  Raises
  ------
  IndexError
      If the page contains fewer tables than ``table_indices`` requires.
  """
  # Download and parse the page once instead of once per table.
  tables = pd.read_html(url, header=0)
  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  return pd.concat([tables[i] for i in table_indices], ignore_index=True)

### Function to fetch genres of all movies

In [3]:
def get_genres(movie):
  """Look up a movie's genres on OMDb by title.

  Parameters
  ----------
  movie : str
      Title to search for (sent as the ``t`` query parameter).

  Returns
  -------
  str or float
      The ``Genre`` field of the JSON response, or ``np.nan`` when the
      response carries ``Response == 'False'``.
  """
  # NOTE(review): the fallback is the key that used to be hard-coded here;
  # prefer exporting OMDB_API_KEY and removing the literal from the notebook.
  api_key = os.environ.get('OMDB_API_KEY', 'd023b9b3')
  # Let requests build the query string so characters such as '&' or '#'
  # inside a title are percent-encoded instead of corrupting the URL.
  response = requests.get('http://www.omdbapi.com/', params={'t': movie, 'apikey': api_key})
  data = response.json()
  if data['Response'] == 'False':
    # Explicit return: the bare `np.NaN` expression used to fall through and
    # yield None (and the `np.NaN` alias no longer exists in NumPy 2.0).
    return np.nan
  return data['Genre']

### Function to fetch plot of all movies

In [4]:
def get_plot(movie):
  """Look up a movie's plot summary on OMDb by title.

  Parameters
  ----------
  movie : str
      Title to search for (sent as the ``t`` query parameter).

  Returns
  -------
  str or float
      The ``Plot`` field of the JSON response, or ``np.nan`` when the
      response carries ``Response == 'False'``.
  """
  # NOTE(review): the fallback is the key that used to be hard-coded here;
  # prefer exporting OMDB_API_KEY and removing the literal from the notebook.
  api_key = os.environ.get('OMDB_API_KEY', 'd023b9b3')
  # Let requests build the query string so characters such as '&' or '#'
  # inside a title are percent-encoded instead of corrupting the URL.
  response = requests.get('http://www.omdbapi.com/', params={'t': movie, 'apikey': api_key})
  data = response.json()
  if data['Response'] == 'False':
    # Explicit return: the bare `np.NaN` expression used to fall through and
    # yield None (and the `np.NaN` alias no longer exists in NumPy 2.0).
    return np.nan
  return data['Plot']

### Function to fetch director's name

In [5]:
def get_director_name(x):
  """Return the part of a cast/crew string that precedes its director marker.

  The markers are tried in order: ``(director)``, ``(director/screenplay)``,
  then ``(directors)``. The text before the first occurrence of
  ``" " + marker`` is returned; if none applies, ``x`` comes back unchanged.
  """
  for marker in ("(director)", "(director/screenplay)"):
    if marker in x:
      # Text ahead of the first " <marker>" (the whole string if the marker
      # is not preceded by a space).
      return x.partition(" " + marker)[0]
  return x.partition(" (directors)")[0]

### Function to fetch the names of the top actors

In [6]:
def get_actors(x):
  """Return up to three names that follow the last ``"screenplay); "`` in ``x``.

  The text after the final ``"screenplay); "`` (or all of ``x`` when that
  marker is absent) is split on ``", "`` and the first three pieces are
  returned as a list.
  """
  _, _, after_credits = x.rpartition("screenplay); ")
  names = after_credits.split(', ')
  return names[:3]

### Function to clean the data and build the `tags` column

In [7]:
def tasks(df):
  """Turn a raw movie table into a two-column ``tags`` / ``title`` frame.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns ``Title``, ``Cast and crew``, ``genres`` and
      ``plot``. The frame is not modified.

  Returns
  -------
  pandas.DataFrame
      Columns ``tags`` and ``title`` (in that order) on the input's index.
      ``title`` is the lower-cased title. ``tags`` is one lower-cased,
      space-separated string made of, in order: up to three actor names and
      the director name (each with inner spaces removed), the genre words
      and the plot words (commas treated as separators).

  Raises
  ------
  KeyError
      If one of the required columns is missing.
  """
  # rename() without inplace plus astype() yields a fresh frame, so the
  # caller's data is left untouched and no SettingWithCopyWarning is raised.
  df = (
    df.rename(columns={'Title': 'title', 'Cast and crew': 'cast'})
      [['title', 'cast', 'genres', 'plot']]
      .astype(str)
  )

  def _words(text):
    # Commas act as separators: "Action, Drama" -> ['Action', 'Drama'].
    return text.replace(',', ' ').split()

  def _row_tags(cast, genres, plot):
    # Squash each name into one token so "Liam Neeson" becomes "LiamNeeson".
    actors = [name.replace(" ", "") for name in get_actors(cast)]
    # .split() keeps the director as a 0- or 1-element list for concatenation.
    director = get_director_name(cast).replace(" ", "").split()
    return " ".join(actors + director + _words(genres) + _words(plot)).lower()

  tags = [
    _row_tags(cast, genres, plot)
    for cast, genres, plot in zip(df['cast'], df['genres'], df['plot'])
  ]

  df = df.assign(tags=tags, title=df['title'].str.lower())
  return df[['tags', 'title']]

## <u><b>2018 data</b></u>


In [27]:
url_2018 = "https://en.wikipedia.org/wiki/List_of_American_films_of_2018"
df_2018 = get_df(url_2018)

In [29]:
df_2018['genres'] = df_2018['Title'].apply(get_genres)


In [30]:
df_2018['plot'] = df_2018['Title'].apply(get_plot)


In [31]:
df_2018.isna().sum()


Opening                                                                          0
Opening.1                                                                        0
Title                                                                            0
Production company                                                               0
Cast and crew                                                                    0
.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.    199
Ref.                                                                            75
genres                                                                           4
plot                                                                             4
dtype: int64

In [32]:
df_2018 = df_2018.dropna(subset=['Title'])

In [None]:
df_2018 = tasks(df_2018)


In [34]:
df_2018

Unnamed: 0,tags,title
0,linshaye angussampson leighwhannell adamrobite...,insidious: the last key
1,alexpettyfer jamesfreedson-jackson emilyalthau...,the strange ones
2,dominiccooper austinstowell gemmachan simonwes...,stratton
3,bryanbrown samneill warwickthornton adventure ...,sweet country
4,liamneeson verafarmiga patrickwilson jaumecoll...,the commuter
...,...,...
268,willferrell johnc.reilly rebeccahall etancohen...,holmes & watson
269,christianbale amyadams stevecarell adammckay b...,vice
270,felicityjones armiehammer justintheroux mimile...,on the basis of sex
271,nicolekidman sebastianstan tobykebbell karynku...,destroyer


In [35]:
# Forward slash is accepted on Windows as well as POSIX and matches the 2021 export cell.
df_2018.to_csv('csv-files/df_2018.csv',index=False)

## <u><b>2019 data</b></u>


In [77]:
url_2019 = "https://en.wikipedia.org/wiki/List_of_American_films_of_2019"
df_2019 = get_df(url_2019)


In [78]:
df_2019['genres'] = df_2019['Title'].apply(get_genres)


In [79]:
df_2019['plot'] = df_2019['Title'].apply(get_plot)


In [80]:
df_2019.isna().sum()


Opening                0
Opening.1              0
Title                  0
Production company     0
Cast and crew          0
Ref.                  10
genres                 1
plot                   1
dtype: int64

In [81]:
df_2019 = df_2019.dropna(subset=['Title'])


In [None]:
df_2019 = tasks(df_2019)

In [84]:
# Forward slash is accepted on Windows as well as POSIX and matches the 2021 export cell.
df_2019.to_csv('csv-files/df_2019.csv',index=False)

## <u><b>2020 data</b></u>


In [40]:
url_2020 = "https://en.wikipedia.org/wiki/List_of_American_films_of_2020"
df_2020 = get_df(url_2020)

### Fetching genres of all movies


In [44]:
df_2020['genres'] = df_2020['Title'].apply(get_genres)


### Fetching plot of all movies


In [45]:
df_2020['plot'] = df_2020['Title'].apply(get_plot)


### Checking for null values

In [46]:
df_2020.isna().sum()

Opening                                                                          0
Opening.1                                                                        0
Title                                                                            0
Production company                                                               0
Cast and crew                                                                    0
.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.    219
Ref.                                                                            84
genres                                                                           4
plot                                                                             4
dtype: int64

### Dropping null values

In [47]:
df_2020 = df_2020.dropna(subset=['Title'])


### Other tasks

In [None]:
df_2020 = tasks(df_2020)

In [51]:
# Forward slash is accepted on Windows as well as POSIX and matches the 2021 export cell.
df_2020.to_csv('csv-files/df_2020.csv',index=False)

## <u><b>2021 data</b></u>


In [8]:
url_2021 = "https://en.wikipedia.org/wiki/List_of_American_films_of_2021"
df_2021 = get_df(url_2021)

### Fetching genres of all movies

In [9]:
df_2021['genres'] = df_2021['Title'].apply(get_genres)


### Fetching plot of all movies

In [10]:
df_2021['plot'] = df_2021['Title'].apply(get_plot)

### Other tasks

In [None]:
df_2021 = tasks(df_2021)

In [12]:
df_2021.to_csv('csv-files/df_2021.csv',index=False)

## <u><b>2022 data</b></u>


In [67]:
url_2022 = "https://en.wikipedia.org/wiki/List_of_American_films_of_2022"

In [68]:
def get_df_2022(url, table_indices=(3, 4, 5, 6)):
  """Read the HTML tables found at ``url`` and stack selected ones into one frame.

  Parameters
  ----------
  url : str
      Page passed to ``pd.read_html``; the first row of every table is used
      as its header.
  table_indices : sequence of int, optional
      Positions (in page order) of the tables to combine. Defaults to tables
      3 through 6, the positions this function has always used.

  Returns
  -------
  pandas.DataFrame
      The selected tables concatenated in order, with a fresh 0..n-1 index.

  Raises
  ------
  IndexError
      If the page contains fewer tables than ``table_indices`` requires.
  """
  # Download and parse the page once instead of once per table.
  tables = pd.read_html(url, header=0)
  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  return pd.concat([tables[i] for i in table_indices], ignore_index=True)

In [69]:
df_2022 = get_df_2022(url_2022)


### Checking for null values

In [70]:
df_2022.isna().sum()

Opening                                                                          0
Opening.1                                                                        4
Title                                                                            4
Production company                                                               4
Cast and crew                                                                    4
.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.    262
Ref.                                                                            83
dtype: int64

### Dropping null values

In [71]:
df_2022 = df_2022.dropna(subset=['Title'])

### Fetching genres of all movies

In [72]:
df_2022['genres'] = df_2022['Title'].apply(get_genres)

### Fetching plot of all movies


In [74]:
df_2022['plot'] = df_2022['Title'].apply(get_plot)


### Other tasks

In [None]:
df_2022 = tasks(df_2022)

In [76]:
# Forward slash is accepted on Windows as well as POSIX and matches the 2021 export cell.
df_2022.to_csv('csv-files/df_2022.csv',index=False)

# Appending all datasets
