# ITM 891 Project - Part 1
## Web Scraping
Scraping data for the top 250 movies from the IMDb website

In [None]:
# Importing libraries
import re
import urllib
import urllib.parse
import urllib.request

import bs4
import pandas as pd

Extracting links to the movies listed on the "Top Rated Movies" page on IMDb

In [None]:
# Source page: the IMDb chart of top-rated movies, from which the movie links are taken
url_source = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'
print("Source URL: " + url_source)
print()

# Download the chart page and parse it. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers happen to be
# installed (leaving it out makes Beautiful Soup guess and emit a warning).
with urllib.request.urlopen(url_source) as response:
    soup = bs4.BeautifulSoup(response, 'html.parser')

In [None]:
# Getting links of all the movies on the source page.
# Links to the movies are the anchors whose href starts with '/title/'.
# A set drops hrefs that occur more than once on the page; sorting then fixes
# the order, because iterating a set of strings is not guaranteed to give the
# same order on every kernel start.
href_urls = sorted({
    a_tag.attrs['href']
    for a_tag in soup.html.body.find_all('a')
    if 'href' in a_tag.attrs and a_tag.attrs['href'].startswith('/title/')
})

In [None]:
# Preview the first few movie links instead of printing the whole list
href_urls[:10]

Defining a DataFrame to store information about the movies listed on the source page

In [None]:
# Output schema: one column per attribute collected for each movie
column_names = [
    'Title', 'IMDB_Rating', 'Director',
    'Star_1', 'Star_2', 'Star_3',
    'Genre', 'Language', 'Release_Date',
    'Budget', 'Gross_USA', 'Gross_Worldwide',
    'Runtime_min',
]

# Empty frame with the schema above; the scraped rows are stored in it
movie_df = pd.DataFrame(columns=column_names)

### Extracting data
Extracting the following attributes for each movie:
* Title: Title of the movie
* IMDB_Rating: IMDb Score for the movie
* Director: Name of the movie director
* Star_1, Star_2, Star_3: Names of lead actors of the movie
* Genre: First genre listed on IMDb website
* Language: Language in which the movie was released
* Release_Date: Release date of the movie
* Budget: Estimated budget of the movie
* Gross_USA: Gross earning of the movie in USA
* Gross_Worldwide: Gross worldwide earning of the movie
* Runtime_min: Movie duration in minutes

The following cell takes approximately 200 seconds to run.

In [None]:
def _digits_to_int(text):
    """Return the integer formed by all digit characters in `text`.

    Returns None when `text` contains no digits at all.
    """
    digits = re.findall(r'\d', text)
    return int(''.join(digits)) if digits else None


# One dict per movie. The DataFrame is built once after the loop instead of
# being re-concatenated on every iteration, so re-running this cell does not
# append duplicate rows to an already filled frame.
movie_records = []

for link in href_urls:
    a_url = urllib.parse.urljoin(url_source, link)

    # A separate name is used for the movie page so the parsed source page in
    # `soup` is not overwritten by this loop.
    with urllib.request.urlopen(a_url) as response:
        movie_soup = bs4.BeautifulSoup(response, 'html.parser')

    # Start from an all-missing record: an attribute that is absent from this
    # page stays empty instead of silently keeping the previous movie's value.
    record = dict.fromkeys(column_names)

    record['Title'] = movie_soup.html.body.find_all('h1')[0].text

    rating_spans = movie_soup.find_all("span", {"itemprop": "ratingValue"})
    record['IMDB_Rating'] = float(rating_spans[0].text)

    # Director and lead actors come from the credit summary items
    for credit in movie_soup.html.body.find_all('div', {"class": "credit_summary_item"}):
        if 'Director:' in credit.text:
            record['Director'] = credit.text.split('\n')[-1]
        if 'Stars:' in credit.text:
            star_names = credit.text.split('\n')[2].split(', ')
            record['Star_1'] = star_names[0]
            record['Star_2'] = star_names[1]
            # The last two characters of the third name are dropped
            # (presumably a trailing separator on the page - TODO confirm)
            record['Star_3'] = star_names[2][:-2]

    # Genre: text on the third line of the "Genres:" block, minus its last two characters
    for g_div in movie_soup.html.body.find_all('div', {"class": "see-more inline canwrap"}):
        if 'Genres:' in g_div.text:
            record['Genre'] = g_div.text.split("\n")[2][:-2]

    # Remaining attributes come from the "txt-block" detail sections. The checks
    # are independent on purpose: each block is tested against every keyword and
    # a later matching block overrides an earlier one.
    for detail in movie_soup.html.body.find_all('div', {"class": "txt-block"}):
        if 'Language' in detail.text:
            record['Language'] = detail.text.split('\n')[2]

        if 'Release Date' in detail.text:
            release_info = detail.text.split(' ')
            record['Release_Date'] = '-'.join(release_info[2:5])

        if 'Budget' in detail.text:
            record['Budget'] = _digits_to_int(detail.text.split('\n')[1])

        if 'Gross USA:' in detail.text:
            record['Gross_USA'] = _digits_to_int(detail.text.split('\n')[1])

        if 'Cumulative Worldwide Gross:' in detail.text:
            record['Gross_Worldwide'] = _digits_to_int(detail.text.split('\n')[1])

        if 'Runtime' in detail.text:
            runtime_text = detail.text.split('\n')[2]
            record['Runtime_min'] = int(runtime_text.split(' ')[0])

    movie_records.append(record)

# Build the frame in one step, with the columns in the declared order
movie_df = pd.DataFrame(movie_records, columns=column_names)

In [None]:
# Preview the first rows of the scraped data as a quick sanity check
movie_df.head()

Saving the DataFrame as a CSV file

In [None]:
# Write the scraped data to a CSV file in the current working directory.
# A relative path keeps the notebook runnable on any machine, unlike an
# absolute path inside one user's home directory.
output_csv = 'imdb_data.csv'
movie_df.to_csv(output_csv, index=False)