In [None]:
import json
import os
import re
import time
import warnings
from pprint import pprint
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from sqlalchemy import create_engine

from config2 import omdb_key, tmdb_key, pwrd

warnings.filterwarnings('ignore')

In [None]:
#create url list for API calls to The Movie Database for the first 500 pages of movies
# Results are requested sorted by vote count (descending); only the page
# number differs between the URLs.
TMDB_PAGE_COUNT = 500  # pages 1..500 are requested
tmdb_discover_url = (
    "https://api.themoviedb.org/3/discover/movie?api_key=" + tmdb_key
    + "&language=en-US&sort_by=vote_count.desc"
    + "&include_adult=false&include_video=false&page="
)
url_list = [tmdb_discover_url + str(page) for page in range(1, TMDB_PAGE_COUNT + 1)]


In [None]:
# Sanity check: number of request URLs that were built.
len(url_list)

In [None]:
#loop through url's and make API call for each
# Each successful response contributes its "results" list (the movies on that
# page), so movies_list ends up holding one list per page.

movies_list = []

for page_number, page_url in enumerate(url_list, start=1):
    page_response = requests.get(page_url, timeout=30)
    if not page_response.ok:
        # Report only the page number and status: the request URL embeds the
        # API key, so it must not be echoed into the notebook output.
        raise RuntimeError(
            f"TMDB request for page {page_number} failed with HTTP {page_response.status_code}"
        )
    movies_list.append(page_response.json()["results"])

print(f"Fetched {len(movies_list)} pages")
    

In [None]:
# Sanity check: number of result pages collected.
len(movies_list)

In [None]:
#get movies from each page into a single flat list
# Iterate over whatever each page actually contains instead of assuming
# exactly 20 movies per page, so a short page cannot raise IndexError.
# Each movie is stored as its dict .items() view, i.e. (key, value) pairs.
m_list = []

for page_results in movies_list:
    for movie in page_results:
        m_list.append(movie.items())

In [None]:
# Report how many movies were gathered across all pages.
movie_total = len(m_list)
print(f"There are {movie_total} movies in the database")

In [None]:
# Load the flat movie list into a data frame and preview the first rows.
mymovies_df = pd.DataFrame(data=m_list)
mymovies_df.head()

In [None]:
#rename columns
# Every cell still holds a (key, value) pair taken from a movie dict (the
# values are unpacked in a later cell), so label each column with the key it
# actually contains. A hard-coded positional list silently mislabels columns
# whenever the API returns its fields in a different order.
# "vote_count" keeps the project's name "tmdb_vote_count".
# Assumes every movie lists its keys in the same order as the first row -- TODO confirm.
mymovies_df.columns = [
    "tmdb_vote_count" if field == "vote_count" else field
    for field, _value in mymovies_df.iloc[0]
]

In [None]:
# Show the column labels after renaming.
print(mymovies_df.columns)

In [None]:
# Keep an independent copy of just the title and overview columns.
wanted_columns = ['original_title', 'overview']
short_df = mymovies_df.loc[:, wanted_columns].copy()
short_df.head()

In [None]:
#remove tuple from each cell and only show data
# Cells start out as (key, value) pairs; keep only the value. Anything that is
# not a tuple is left untouched, so re-running this cell (or meeting a missing
# cell) does not corrupt or crash on already-unpacked data.
for column in short_df.columns:
    short_df[column] = [
        cell[1] if isinstance(cell, tuple) else cell
        for cell in short_df[column]
    ]


In [None]:
# Preview the first rows of short_df.
short_df.head()

In [None]:
#create list for json results and url variables for API call
# The request URL is assembled as url1 + <title> + url2 + <api key>.
# HTTPS is used so the API key is not sent over the network in clear text.
movie_jsons = []
url1 = "https://www.omdbapi.com/?t="
url2 = "&apikey="

In [None]:
#loop through movies and make API call for each title
for movie in short_df["original_title"]:
    # Percent-encode the title: characters such as "&", "#", "+" or spaces
    # would otherwise cut the title short or alter the query string.
    url = url1 + quote_plus(str(movie)) + url2 + omdb_key
    try:
        response = requests.get(url, timeout=30).json()
    except (requests.RequestException, ValueError) as exc:
        # Keep one entry per title so a single network or decoding failure
        # does not abort the whole run. Only the exception type is recorded
        # because the exception text may contain the URL with the API key.
        response = {"Response": "False", "Error": type(exc).__name__}
    movie_jsons.append(response)
    # Short pause between requests.
    time.sleep(.2)

In [None]:
 # Build one row per collected OMDb response.
df1 = pd.DataFrame(data=movie_jsons)

In [None]:
# Allow up to 999 columns in displayed frames, then preview df1.
pd.options.display.max_columns = 999
df1.head()

In [None]:
#check data count
# Non-null value count per column.
df1.count()

In [None]:
# Keep only the rows whose "Ratings" entry is present.
df1 = df1[df1['Ratings'].notna()]

In [None]:
# Keep only the rows whose "BoxOffice" entry is present.
has_box_office = df1['BoxOffice'].notna()
df1 = df1[has_box_office]

In [None]:
# Non-null value count per column after the null-row filters.
df1.count()

In [None]:
# Number of rating entries attached to each remaining movie.
total_ratings = list(map(len, df1["Ratings"]))

In [None]:
# How many movies have exactly three rating entries.
sum(1 for rating_count in total_ratings if rating_count == 3)

In [None]:
# Keep only the movies that carry exactly three rating entries.
ratings_per_movie = df1["Ratings"].map(len)
df1 = df1[ratings_per_movie == 3]

In [None]:
# Non-null value count per column after keeping three-rating movies.
df1.count()

In [None]:
# Flatten every movie's rating entries into one list, in row order,
# then preview the first five entries.
ratings_list = [
    rating
    for movie_ratings in df1["Ratings"]
    for rating in movie_ratings
]
ratings_list[:5]

In [None]:
# Create a separate list of rating values for each source.
# Any entry whose source is neither IMDb nor Rotten Tomatoes is filed under
# metacritic.
imdb, r_t, metacritic = [], [], []
values_by_source = {
    "Internet Movie Database": imdb,
    "Rotten Tomatoes": r_t,
}
for rating in ratings_list:
    values_by_source.get(rating["Source"], metacritic).append(rating["Value"])
        

In [None]:
# Print the size of each per-source list (IMDb, Rotten Tomatoes, Metacritic).
for source_values in (imdb, r_t, metacritic):
    print(len(source_values))

In [None]:
# Turn the rating strings into numbers:
#   IMDb        -> float taken from the part before "/"
#   Rotten Tom. -> int after stripping trailing "%"
#   Metacritic  -> int taken from the part before "/"
imdb = [float(score.partition("/")[0]) for score in imdb]
r_t = [int(percent.rstrip("%")) for percent in r_t]
metacritic = [int(score.partition("/")[0]) for score in metacritic]

In [None]:
# Print the first five converted values of each per-source list.
for source_values in (imdb, r_t, metacritic):
    print(source_values[:5])

In [None]:
# Print the three list sizes, one per line.
print(len(imdb), len(r_t), len(metacritic), sep="\n")

In [None]:
# Attach one numeric column per rating source (values are assigned by position).
for column_name, scores in (
    ("IMDB", imdb),
    ("Rotten Tomatoes", r_t),
    ("Metacritic", metacritic),
):
    df1[column_name] = scores

In [None]:
#check dataframe
# Preview the first rows only instead of rendering the whole frame.
df1.head()

In [None]:
#drop unwanted columns
# errors="ignore" skips any listed column that is absent (for example "Error"
# only exists when at least one lookup failed) instead of raising KeyError.
unwanted_columns = [
    "Country", "DVD", "Error", "Language", "Metascore",
    "Ratings", "Response", "imdbRating", "Website",
]
df1 = df1.drop(columns=unwanted_columns, errors="ignore")

In [None]:
#create variable for regex test
# Take the first remaining row by position: rows were filtered earlier, so the
# index label 0 is not guaranteed to still exist.
string = df1["Awards"].iloc[0]

In [None]:
#test regex code
# Try each award pattern on the sample string and print what it captures.

# "<n> win" / "<n> wins"
more_wins = re.search(r'(\d+) win', string)
if more_wins:
    print(more_wins.group(1))
else:
    print("No more wins")
# "nominations?" also accepts the singular "1 nomination", which a plural-only
# pattern would skip.
more_noms = re.search(r'(\d+) nominations?', string)
if more_noms:
    print(more_noms.group(1))
    print(type(more_noms.group(1)))
    print(int(more_noms.group(1)))
else:
    print("No more noms")
# "Nominated for <n> ..."
big_noms = re.search(r'Nominated for (\d+)', string)
if big_noms:
    print(big_noms.group(1))
    print(type(big_noms.group(1)))
else:
    print("No big noms")
# "Won <n> ..."
big_wins = re.search(r'Won (\d+)', string)
if big_wins:
    print(big_wins.group(1))
else:
    print("No big wins")

In [None]:
#create wins and noms lists by parsing the text in the "Awards" column

# A title's total is the sum of the first number captured by each pattern.
# "nominations?" also counts the singular "1 nomination".
WIN_PATTERNS = (re.compile(r'Won (\d+)'), re.compile(r'(\d+) win'))
NOM_PATTERNS = (re.compile(r'Nominated for (\d+)'), re.compile(r'(\d+) nominations?'))


def _sum_first_matches(patterns, text):
    """Return the sum of the first number each pattern captures in text.

    Patterns that do not match contribute nothing, so text without any
    recognised phrase yields 0.
    """
    total = 0
    for pattern in patterns:
        found = pattern.search(text)
        if found:
            total += int(found.group(1))
    return total


wins_list = []
noms_list = []

for x in df1["Awards"]:
    # Non-string award text (e.g. a missing value) counts as no wins and no
    # nominations instead of raising TypeError inside the regex search.
    awards_text = x if isinstance(x, str) else ""
    wins_list.append(_sum_first_matches(WIN_PATTERNS, awards_text))
    noms_list.append(_sum_first_matches(NOM_PATTERNS, awards_text))

print(wins_list[:5])
print(noms_list[:5])
    

In [None]:
# Store the parsed win and nomination totals as new df1 columns.
df1["Wins"], df1["Nominations"] = wins_list, noms_list

In [None]:
#convert runtime to int
# Extract the leading number of minutes from each runtime text. Values without
# digits (such as "N/A") become <NA> instead of raising ValueError, and going
# through str first keeps the cell safe to re-run on already-converted data.
runtime_digits = (
    df1["Runtime"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.extract(r"(\d+)", expand=False)
)
# Nullable integer dtype keeps whole numbers while allowing missing values.
df1["Runtime"] = pd.to_numeric(runtime_digits, errors="coerce").astype("Int64")


In [None]:
# Drop the rows whose imdbVotes text contains "N/A".
votes_unavailable = df1["imdbVotes"].str.contains("N/A")
df1 = df1[~votes_unavailable]


In [None]:
# Non-null value count per column after removing rows without imdbVotes.
df1.count()

In [None]:
#convert imdbVotes to int
# Remove the thousands separators and cast to integers. Going through str
# first keeps the cell safe to re-run once the column is already numeric.
df1["imdbVotes"] = (
    df1["imdbVotes"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .astype(int)
)


In [None]:
#check dataframe
# Preview the first rows of the cleaned frame.
df1.head()

In [None]:
#save as csv
# Create the output folder first so the export does not fail when "data/"
# does not exist yet.
os.makedirs("data", exist_ok=True)
df1.to_csv("data/movies.csv")