# Create 'movie_data.csv' using Movie.py

In [1]:
import pandas as pd
from Movie import Movie
import csv
import re

#### Step 1:

Take the IMDB list of domestic films released from 1972 to 2016 and seven lists of original movies from streaming services, pull the titles and years, and write them to 'movie_year_from_1972_2023.csv'.

In [19]:
# Hand the IMDB list (1972-2016) and the seven streaming-service lists to
# Movie.get_titles_from_csv together with the output file name
# 'movie_year_from_1972_2023.csv'.

files1 = ['data/us_released_movies_1972_to_2016.csv'] + [
    'data/Streaming Movie Lists/' + list_name
    for list_name in (
        'amazon_prime_movie_list.csv',
        'apple_tv_movie_list.csv',
        'disney_plus_movie_list.csv',
        'hbo_movie_list.csv',
        'hulu_programming_list.csv',
        'netflix_movie_list.csv',
        'paramount_plus_movie_list.csv',
    )
]

Movie.get_titles_from_csv(files1, 'movie_year_from_1972_2023.csv')


#### Step 2:
Using HTML pages from Box Office Mojo, pull titles and years of domestic films made from 2017 to 2023 to supplement the lists from files1.

In [21]:
# Pass the saved Box Office Mojo pages (one per year, 2017 through 2023) to
# Movie.get_titles_from_html, targeting the same 'movie_year_from_1972_2023.csv'
# file that Step 1 used.

# One HTML dump per release year; the year is the only part of the path that varies.
files2 = [
    'data/Domestic Movies from 2017-2023/Domestic Box Office For {} - Box Office Mojo.txt'.format(year)
    for year in range(2017, 2024)
]

Movie.get_titles_from_html(files2, 'movie_year_from_1972_2023.csv')



#### Step 3:
Because the dataset, movie_year_from_1972_2023.csv, contains a lot of duplicates, we now clean the data and remove those duplicates.

Check for duplicate rows in movie_year_from_1972_2023.csv.

In [None]:
import pandas as pd
import requests
import json

# Load the combined title/year list and display every row that repeats an earlier row.
df = pd.read_csv('movie_year_from_1972_2023.csv')
df.loc[df.duplicated()]

Remove duplicates from movie_year_from_1972_2023.csv and verify removal.

In [None]:
# Keep the first occurrence of each row, then confirm that nothing is flagged
# as a duplicate any more (the displayed frame should be empty).
df_no_dups = df.drop_duplicates()
df_no_dups.loc[df_no_dups.duplicated()]

In [None]:
# Report the row counts before and after de-duplication.
print("Original Dataset had", len(df.index), 'rows.')
print("Cleaned Dataset has", len(df_no_dups.index), 'rows.')

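Export the pandas dataset to 'no_dups_movie_year_from_1972_2023.xlsx' and remove the numerical index column manually. Note that Step 4 reads 'no_dups_movie_year_from_1972_2023.csv', so the cleaned sheet must also be saved under that CSV name before continuing.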

In [None]:
# Write the de-duplicated list to an Excel workbook; pandas includes the
# numerical index column by default.
df_no_dups.to_excel(excel_writer='no_dups_movie_year_from_1972_2023.xlsx')

#### Step 4:

With no_dups_movie_year_from_1972_2023.csv now containing all domestic movies made from 1972 to 2023, **make_movie_data()** does the following:

* When started from the beginning (start = 1), creates movie_data.csv and writes the first row containing the column names.
* Iterates through each row in no_dups_movie_year_from_1972_2023.csv, requests movie data from (2) APIs, and appends to movie_data.csv.
* The only parameter is 'start' which is either:
  * **1 (int)** to start from the beginning of the 'no_dups_movie_year_from_1972_2023.csv' dataset.
  * **'Movie Title/YearReleased'** to resume after the last movie already pulled, as indicated at the bottom of movie_data.csv; processing continues with the row that follows that movie.
    * For example, "The Terminator/1984" would be valid input.

In [3]:
def make_movie_data(start):
    """Request movie data for the rows of the de-duplicated title/year list.

    Reads 'no_dups_movie_year_from_1972_2023.csv' from the working directory
    and, for every selected row, builds ``Movie(title, year)`` from the first
    two columns and calls its ``get_movie_data()`` method. That method lives
    in Movie.py; according to this notebook's description it queries the APIs
    and appends the result to 'movie_data.csv'.

    Parameters
    ----------
    start : int or str
        * ``1`` -- fresh run: 'movie_data.csv' is (re)created containing only
          the column-name row (an existing file is overwritten), the input
          file's header row is skipped, and every remaining row is processed.
        * ``'Title/Year'`` (for example ``'The Terminator/1984'``) -- resume:
          rows are skipped up to and including the first one whose first two
          columns, joined by '/', equal ``start``; the rows after it are
          processed. This function does not write to 'movie_data.csv' itself
          in this mode.

    Raises
    ------
    ValueError
        If ``start`` is not 1 and no row of the input file matches it.
    """
    if start == 1:
        # newline='' lets the csv module control line endings itself, as its
        # documentation requires (otherwise Windows gets an extra '\r' per row).
        with open('movie_data.csv', 'w', newline='') as out_file:
            csv.writer(out_file, delimiter=',').writerow([
                'Title',
                'Year',
                'Genre(s)',
                'IMDB',
                'Rotten Tomatoes',
                'Metacritic',
                'TMDB',
                '# of IMDB Votes',
                '# of Awards',
                'Gross Domestic Box Office Sales ($)',
                'Media',
                'Directors',
                'Origin',
            ])

    with open('no_dups_movie_year_from_1972_2023.csv', 'r', newline='') as in_file:
        reader = csv.reader(in_file, delimiter=',')

        if start == 1:
            # Skip the header row; the default keeps an empty file from raising.
            next(reader, None)
        else:
            # Advance the reader just past the resume marker.
            for row in reader:
                if len(row) >= 2 and row[0] + '/' + row[1] == start:
                    break
            else:
                # Previously this ran off the end of the file and surfaced as
                # a bare StopIteration with no hint about the cause.
                raise ValueError(
                    'Resume marker {!r} was not found in '
                    'no_dups_movie_year_from_1972_2023.csv'.format(start)
                )

        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; there is nothing to look up.
                continue
            Movie(row[0], row[1]).get_movie_data()

Below commands invoke make_movie_data().

In [5]:
# Fresh run from the first movie (also rewrites movie_data.csv with only the header row):
#make_movie_data(1)
# Resume run: continue with the row after the given 'Title/Year' marker.
make_movie_data(start='Hobgoblins/1988')

This was written to movie_data.csv: 'Round Midnight 1986
This was written to movie_data.csv: Bat*21 1988
This was written to movie_data.csv: Howling II: ... Your Sister Is a Werewolf 1985
This was written to movie_data.csv: One from the Heart 1981
This was written to movie_data.csv: Suburbia 1983
This was written to movie_data.csv: Butterfly 1981
This was written to movie_data.csv: Liquid Sky 1982
This was written to movie_data.csv: S.O.B. 1981
This was written to movie_data.csv: Looker 1981
This was written to movie_data.csv: Death Hunt 1981
This was written to movie_data.csv: Under the Rainbow 1981
This was written to movie_data.csv: BMX Bandits 1983
This was written to movie_data.csv: Terror Train 1980
This was written to movie_data.csv: The Private Eyes 1980
This was written to movie_data.csv: True Confessions 1981
This was written to movie_data.csv: Raise the Titanic 1980
This was written to movie_data.csv: Ms .45 1981
This was written to movie_data.csv: Slave of the Cannibal God 

# IGNORE BELOW!

In [None]:
#Error occurred with request to OMDB regarding ('Hannah and Her Sisters', '1986').

In [7]:
# Status check on the data pull: reload the output file and list rows that
# repeat an earlier row.
df2 = pd.read_csv('movie_data.csv')
df2.shape  # evaluated but not shown; only a cell's final expression is displayed

df2.loc[df2.duplicated()]

Unnamed: 0,Title,Year,Genre(s),IMDB,Rotten Tomatoes,Metacritic,TMDB,# of IMDB Votes,# of Awards,Gross Domestic Box Office Sales ($),Media,Directors,Origin


In [9]:
# (rows, columns) of the frame loaded from movie_data.csv.
df2.shape

(12397, 13)

In [15]:
# Drop repeated rows from the pulled data, then confirm none remain flagged
# (the displayed frame should be empty).
df2_no_dups = df2.drop_duplicates()
df2_no_dups.loc[df2_no_dups.duplicated()]

Unnamed: 0,Title,Year,Genre(s),IMDB,Rotten Tomatoes,Metacritic,TMDB,# of IMDB Votes,# of Awards,Gross Domestic Box Office Sales ($),Media,Directors,Origin


In [17]:
# (rows, columns) remaining after drop_duplicates().
df2_no_dups.shape

(10766, 13)

In [23]:
# Save the de-duplicated pull as an Excel workbook (index column included by default).
df2_no_dups.to_excel(excel_writer='df2_no_dups.xlsx')

In [11]:
# Load the de-duplicated title/year list so its size can be compared with the pulled data.
df3 = pd.read_csv(filepath_or_buffer='no_dups_movie_year_from_1972_2023.csv')
df3.shape

(12397, 2)

In [None]:
# NOTE(review): scratch draft of the resume logic found in make_movie_data().
# This cell has no execution count and cannot run on its own: `start`, `row`
# and `reader` are not defined at notebook level in the cells shown.
if start != 1:
    # `row` is never reassigned inside this loop (the value read goes to
    # `unwanted_row`), so the condition cannot change; if it is true on entry,
    # the loop only ends when next() raises StopIteration at the end of the reader.
    while (row[0] + '/' + row[1] != start):
        unwanted_row = next(reader)
    for row in reader:
        new_movie = Movie(row[0],row[1])
        new_movie.get_movie_data()
    
