In [16]:
# Importing required packages
import pandas as pd
import requests
from bs4 import BeautifulSoup
from omdbapi.movie_search import GetMovie

In [3]:
# Goal of this project is to extract information on the top 250 movies according to IMDb.

# Step 1: Get list from IMDB
imdb_url = 'https://www.imdb.com/chart/top/' # url for top 250 list
# A timeout keeps the cell from hanging indefinitely if the server never answers.
imdb_r = requests.get(imdb_url, timeout=30)
# Fail here on an HTTP error status instead of parsing an error page further down.
imdb_r.raise_for_status()
imdb_soup = BeautifulSoup(imdb_r.text, 'lxml') # Creating soup object for scraping

In [40]:
# Subsetting soup object to movie title section
movies = imdb_soup.select('td.titleColumn')
Titles = []  # Cleaned movie titles, in the order the cells appear on the page

# Looping through html to populate list with movie titles.
# Once whitespace is collapsed, each cell's text is expected to look like
# "<rank>. <title> (<year>)" -- NOTE(review): confirm against the live page markup.
for cell in movies:
    cell_text = ' '.join(cell.get_text().split())

    # Drop the leading "<rank>." by splitting on the first dot, so the result
    # does not depend on how many digits the rank has.
    rank, dot, remainder = cell_text.partition('.')
    if dot and rank.strip().isdigit():
        cell_text = remainder

    # Drop a trailing "(YYYY)" only when it is really there, so a cell
    # without the year suffix keeps its full title.
    has_year_suffix = (
        cell_text.endswith(')')
        and cell_text[-6:-5] == '('
        and cell_text[-5:-1].isdigit()
    )
    if has_year_suffix:
        cell_text = cell_text[:-6]

    Titles.append(cell_text.strip())

In [42]:
# Step 2: Use scraped list of movies to access open source omdb api

# Retrieve personal api key and connect to omdb api using omdbapi library
# The context manager closes the key file as soon as it has been read.
with open("omdbkey.txt", 'r') as f:
    # readline() keeps the trailing newline; strip it so only the key itself is passed on.
    key = f.readline().strip()
movie = GetMovie(api_key = key)

In [53]:
# Define function to retrieve item by index
def scrape(index, item):
    """Look up one field of one film through the module-level ``movie`` client.

    Args:
        index: Position of the film in the module-level ``Titles`` list.
        item: Key to read from the record returned by ``movie.get_movie``
            (the notebook passes lowercase keys such as 'year').

    Returns:
        The value stored under ``item``, or 0 when the record has no such
        key or the stored value is None.

    Raises:
        IndexError: If ``index`` is outside ``Titles``.
    """
    # Fetch the record once: every get_movie call is a separate lookup, so
    # repeating it just to read the same key again doubles the traffic.
    details = movie.get_movie(title=Titles[index])
    value = details.get(item)
    return 0 if value is None else value

In [89]:
# Populating lists for the rest of the desired columns

# Record keys to collect for every film, in the order the columns are built below.
omdb_fields = ('year', 'rated', 'runtime', 'genre', 'director', 'imdbrating', 'boxoffice')
omdb_columns = {field: [] for field in omdb_fields}

# Look each title up once and read every field from that single record,
# instead of repeating the lookup for each column. Iterating over Titles
# (rather than a fixed 0..249 range) keeps every column the same length as
# the title list even if the scrape returned a different number of films.
for title in Titles:
    details = movie.get_movie(title=title)
    for field in omdb_fields:
        value = details.get(field)
        # Same convention as scrape(): a missing or None value is stored as 0.
        omdb_columns[field].append(0 if value is None else value)

# Year
Year = omdb_columns['year']

# Rating
Rating = omdb_columns['rated']

# Runtime
Runtime = omdb_columns['runtime']

# Genre
Genre = omdb_columns['genre']

# Director
Director = omdb_columns['director']

# IMDB rating
Score = omdb_columns['imdbrating']

# Box office
Boxoffice = omdb_columns['boxoffice']

In [90]:
# Assemble the collected lists into one table, one row per film.
# Column order in the frame follows the order of these two parallel lists.
column_names = ['Title', 'Year', 'Rating', 'Runtime', 'Genre', 'Director', 'Score', 'BoxOffice']
column_values = [Titles, Year, Rating, Runtime, Genre, Director, Score, Boxoffice]
top250 = pd.DataFrame(dict(zip(column_names, column_values)))

# Write the table to disk as a comma-separated file (row index included)
top250.to_csv('top250.csv', sep=',')