In [None]:
# Dependencies

import json
import os
import re
import timeit
from itertools import chain
from pprint import pprint
from urllib.parse import quote

import pandas as pd
import requests

In [None]:
# Load the movies table from the CSV file under data/data_lrg/.
movies_lrg = pd.read_csv('data/data_lrg/movies_lrg.csv')

In [None]:
# Print the frame's (rows, columns) shape, then display its first rows.
print(movies_lrg.shape)

movies_lrg.head()

In [None]:
# how many times does each title occur?
dups = movies_lrg['title'].value_counts()

# Build the working frame: keep the first row for every title and take an
# independent copy so later column assignments do not touch movies_lrg.
movies_data = movies_lrg.drop_duplicates(subset='title', keep='first').copy()

movies_data.shape

In [None]:
# Split each title once, at its first "(":
#   column 0 -> the text before the parenthesis
#   column 1 -> everything after it
title_text = movies_data['title']
movies_year = title_text.str.split(pat="(", n=1, expand=True)

movies_year.head()


In [None]:
# clean the second column so that it holds only the release year
#
# Preferred rule: the four-digit number that closes the title, i.e. sits right
# before the final ")" (optionally followed by a "-YYYY" / "-" range suffix).
# This stops digits inside an alternate title -- e.g. a title shaped like
# "Seven (a.k.a. Se7en) (1995)" -- from being mistaken for the year.
# Fallback rule: when no such closing year exists, keep the previous behaviour
# and take the first run of digits. Rows with no digits at all stay null.
# Raw strings are used so "\d" is not treated as a (invalid) string escape.
year_part = movies_year[1]
closing_year = year_part.str.extract(r'(\d{4})(?:[-–]\d{0,4})?\)\s*$', expand=False)
first_digits = year_part.str.extract(r'(\d+)', expand=False)
movies_year[1] = closing_year.fillna(first_digits)

movies_year[1]

In [None]:
# attach the two split-out columns to the main frame under readable names
movies_data = movies_data.assign(
    movie_nm=movies_year[0],
    released=movies_year[1],
)

movies_data.head()



In [None]:
# count the null entries in every column
missing_info = movies_data.isna().sum()

# discard, in place, each row that has a null in any column
movies_data.dropna(axis='index', how='any', inplace=True)

In [None]:
# re-count the nulls per column to confirm the dropna left none behind
missing_info = movies_data.isna().sum()

missing_info

In [None]:
# Display the first rows of the frame after the null rows were removed.
movies_data.head()

In [None]:
# the title has been split into movie_nm / released, so remove the original
# column (in place)
movies_data.drop(columns=['title'], inplace=True)

In [None]:

# Display the first rows to confirm the title column is gone.
movies_data.head()

In [None]:
# cast the released column to integers so that it can be compared numerically
movies_data = movies_data.astype({'released': int})

movies_data.head()

In [None]:
# earliest and latest release year present in the data
released_years = movies_data['released']
released_min, released_max = released_years.min(), released_years.max()

print(released_min, released_max)

In [None]:
# keep only the movies released from 1990 through 2018 (both bounds inclusive)
released = movies_data['released']
movies_data = movies_data.loc[(released >= 1990) & (released <= 2018)]

# re-check the range after filtering
released_min, released_max = (
    movies_data['released'].min(),
    movies_data['released'].max(),
)

print(released_min, released_max)

In [None]:
# Display the first rows of the year-filtered frame.
movies_data.head()

In [None]:
# Write the cleaned frame to movies_data.csv in the working directory,
# leaving the row index out of the file.
movies_data.to_csv('movies_data.csv', index = False)

In [None]:
# non-null count of every column, grouped by release year
released_key = movies_data['released']
movies_by_year = movies_data.groupby(released_key).count()

movies_by_year

In [None]:
# Dictionary keyed by release year, 1990 through 2018 inclusive.
# Every value starts out as None; a later cell replaces it with the titles
# released in that year.
movies_year = dict.fromkeys(range(1990, 2019))

print(movies_year[2008])


In [None]:
# for every year key, store the list of movie names released in that year
for year in movies_year:
    released_that_year = movies_data['released'] == year
    movies_year[year] = movies_data.loc[released_that_year, 'movie_nm'].tolist()

print(movies_year[1990])

###### Get other movie data using OMDB API

In [None]:
# API endpoint
# The API key is read from the OMDB_API_KEY environment variable so it does not
# have to be written into the notebook; "INSERTAPIKEY" is only a visible
# placeholder used when the variable is not set.
# The URL deliberately ends with "&t=" (OMDb's search-by-title parameter):
# the cells below build each request as `url + <movie title>`.
url = (
    "http://www.omdbapi.com/?apikey="
    + os.environ.get("OMDB_API_KEY", "INSERTAPIKEY")
    + "&t="
)


In [None]:
# trial run: query the API for the first five movie names only
movies_sample = movies_data['movie_nm'].iloc[0:5]

# one decoded JSON response per sampled title
movies_results = [requests.get(url + movie).json() for movie in movies_sample]

movies_results

### Call OMDB API to get details for each movie from the movie_lens dataset
Because of the size of the dataset, we needed thousands of calls to the API. To avoid the API mistaking our calls for an attack, and to allow a wait of a few minutes between running each loop, I decided to limit the dataset to movies released from 1990 to 2018 and to pull the data one year at a time. Please note that these cells make thousands of live API requests.

In [None]:
# function to call API for 1990 - 2018. The function will accept the year and a list and return a list

def call_omd(year, movie_year=None, timeout=30):
    """Request the API record for every title stored under ``movies_year[year]``.

    Parameters
    ----------
    year : int
        Key into the module-level ``movies_year`` dictionary; its value is the
        list of movie names to look up.
    movie_year : list, optional
        List that each decoded JSON response is appended to (it is mutated in
        place). A new empty list is created when omitted.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up on that title.

    Returns
    -------
    list
        The same list object that was passed as ``movie_year`` (or the newly
        created one), with one entry per successful request.

    Notes
    -----
    Lookups are best-effort: a title whose request fails or whose response is
    not valid JSON is skipped. Only those request/decoding errors are
    swallowed, so a keyboard interrupt or a programming error still surfaces.
    """
    if movie_year is None:
        movie_year = []

    for movie in movies_year[year]:
        # Percent-encode the title so characters such as "&", "#" or "?" stay
        # part of the title instead of being parsed as URL syntax; surrounding
        # whitespace left over from splitting the title column is dropped.
        request_url = url + quote(str(movie).strip(), safe='')
        try:
            movie_response = requests.get(request_url, timeout=timeout).json()
        except (requests.exceptions.RequestException, ValueError):
            # network failure, timeout, or a body that is not JSON: skip it
            continue
        movie_year.append(movie_response)

    return movie_year

# example function call: call_omd returns the very list it was handed, so a
# fresh empty list can be passed inline and the result bound to a name
movies_test = call_omd(1990, [])
movies_test
    

In [None]:
# Empty lists and function calls to complete the API calls for 1990-2008.
#
# One result list per release year. All lists are created -- and bound to the
# movies_YY names used by the following cells -- *before* the first request is
# sent, and call_omd fills them in place, so whatever has been collected so far
# is still available if the run is interrupted part-way through.
# The loop replaces nineteen copy-pasted calls and, unlike a bare trailing
# call, does not echo a whole year's payload as cell output.
years_1990_2008 = range(1990, 2009)

(movies_90, movies_91, movies_92, movies_93, movies_94, movies_95, movies_96,
 movies_97, movies_98, movies_99, movies_00, movies_01, movies_02, movies_03,
 movies_04, movies_05, movies_06, movies_07, movies_08) = results_1990_2008 = [
    [] for _ in years_1990_2008
]

for year, year_results in zip(years_1990_2008, results_1990_2008):
    call_omd(year, year_results)

In [None]:
# the per-year result lists for 1990-2008, in chronological order
movies_90_2008 = [
    movies_90, movies_91, movies_92, movies_93, movies_94,
    movies_95, movies_96, movies_97, movies_98, movies_99,
    movies_00, movies_01, movies_02, movies_03, movies_04,
    movies_05, movies_06, movies_07, movies_08,
]

In [None]:
# convert list of dictionaries to dataframe
#
# movies_90_2008 is a list of per-year lists; chain.from_iterable flattens it
# into one sequence of responses, which becomes one DataFrame row each.
# (`chain` comes from the dependencies cell at the top of the notebook, so it
# is not imported again here.)
movies_90_08_df = pd.DataFrame(list(chain.from_iterable(movies_90_2008)))

In [None]:
# Display the (rows, columns) shape of the 1990-2008 results frame.
movies_90_08_df.shape

In [None]:
# Save the 1990-2008 results. index=False keeps the row index out of the file,
# matching the other CSV exports in this notebook, so the saved files share the
# same columns (no extra unnamed index column in this one).
movies_90_08_df.to_csv('movies_1990_2008.csv', index=False)

###### Continue tomorrow

In [None]:
# API calls for 2009-2018.
#
# One result list per release year. All lists are created -- and bound to the
# movies_YY names used by the following cells -- *before* the first request is
# sent, and call_omd fills them in place, so whatever has been collected so far
# is still available if the run is interrupted part-way through.
# The loop replaces ten copy-pasted calls and, unlike a bare trailing call,
# does not echo a whole year's payload as cell output.
years_2009_2018 = range(2009, 2019)

(movies_09, movies_10, movies_11, movies_12, movies_13,
 movies_14, movies_15, movies_16, movies_17, movies_18) = results_2009_2018 = [
    [] for _ in years_2009_2018
]

for year, year_results in zip(years_2009_2018, results_2009_2018):
    call_omd(year, year_results)

In [None]:
# the per-year result lists for 2009-2018, in chronological order
movies_2009_2018 = [
    movies_09, movies_10, movies_11, movies_12, movies_13,
    movies_14, movies_15, movies_16, movies_17, movies_18,
]

In [None]:
# flatten the per-year lists into one list of responses, one DataFrame row each
movies_2009_2018_df = pd.DataFrame(
    [record for year_results in movies_2009_2018 for record in year_results]
)

In [None]:
# Display the first rows of the 2009-2018 results frame.
movies_2009_2018_df.head()

In [None]:
# Write the 2009-2018 results to CSV in the working directory, leaving the
# row index out of the file.
movies_2009_2018_df.to_csv("movies_2009_2018_df.csv",index=False)