# Web Scraping for the top 250 movies on IMDB
Web scraping is a technique for fetching useful data from a webpage. The data can be exported as a CSV file or any other desired format and then used for further analysis. Below is the code to extract the top 250 movies from the [IMDB Webpage for top 250 movies](https://www.imdb.com/chart/top/?ref_=nv_mv_250).


In [7]:
import bs4 as bs
import urllib.request
import lxml
import requests
import pandas as pd

In [25]:
# Define a function that scrapes the movie chart web page.
# Pass the URL of the page as the argument of webscrap().
def webscrap(link, timeout=30):
    """Scrape a movie chart page into a pandas DataFrame.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup.
    Titles, credits and years are read from the ``td.titleColumn`` cells and
    ratings from the ``td.posterColumn span[name = ir]`` tags.

    Args:
        link: URL of the chart page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A DataFrame with the columns ``movie name``, ``actors`` (the raw
        ``title`` attribute of each movie link, director first), ``rating``
        (rounded to two decimals), ``year`` (numeric) and ``director``.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    # fetching the webpage
    print("fetching the data from the webpage...")
    # Without a timeout requests can block forever on an unresponsive server.
    page = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page into
    # an empty table.
    page.raise_for_status()
    soup = bs.BeautifulSoup(page.text, "html.parser")

    # one cell per movie, plus the link nested inside each cell
    title_cells = soup.select("td.titleColumn")
    title_links = soup.select("td.titleColumn a")

    # the link text is the name of the movie
    print("fetching movie names...")
    movie_names = [anchor.text for anchor in title_links]

    # the link's title attribute lists the director followed by the actors
    print("fetching actors...")
    actors = [anchor["title"] for anchor in title_links]

    # the last whitespace-separated token of the cell is "(YYYY)"
    print("fetching year of release...")
    years = [
        cell.text.split()[-1].replace(")", "").replace("(", "")
        for cell in title_cells
    ]

    # imdb rating is stored in the data-value attribute of the rating tag
    rating_tags = soup.select("td.posterColumn span[name = ir]")
    print("fetching movie rating...")
    rating = [round(float(tag["data-value"]), 2) for tag in rating_tags]

    # Named movie_data (not dict) so the builtin is not shadowed.
    movie_data = {
        'movie name': movie_names,
        'actors': actors,
        'rating': rating,
        "year": years,
    }
    df = pd.DataFrame(movie_data)

    # The director is the first comma-separated credit, tagged "(dir.)".
    print("fetching movie director...")
    df["director"] = [
        credit_line.split(",")[0].replace("(dir.)", "").strip()
        for credit_line in actors
    ]

    # changing the datatype of year from string to numeric in one
    # vectorised call
    df["year"] = pd.to_numeric(df["year"])
    print("creating dataframe...")
    return df

def save_csv(df, name):
    """Write ``df`` to a CSV file and print the name of the saved file.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame) to be written.
        name: Target file name; ``".csv"`` is appended when ``name`` does
            not already end with it.
    """
    # Resolve the final file name first so one to_csv call serves both cases.
    target = name if name.endswith(".csv") else name + ".csv"
    df.to_csv(target)
    print("your file", target, "has been saved")

## Line-by-line breakdown of the above functions

In [9]:
# fetching the webpage
# The with-block closes the HTTP response once the body has been read,
# instead of leaving the connection open.
with urllib.request.urlopen("https://www.imdb.com/chart/top/?ref_=nv_mv_250/") as response:
    page = response.read()
# page holds the raw bytes of the document; parse them with the lxml parser
soup = bs.BeautifulSoup(page, "lxml")

In [10]:
# this can also be used to fetch the webpage
# A timeout keeps the call from blocking forever on an unresponsive server.
page = requests.get("https://www.imdb.com/chart/top/?ref_=nv_mv_250/", timeout=30)
# Stop here on a 4xx/5xx answer rather than parsing an error page.
page.raise_for_status()
soup = bs.BeautifulSoup(page.text, "html.parser")

In [11]:
# selecting every title cell, and the link nested inside each of them
title_cell_selector = "td.titleColumn"
movietags = soup.select(title_cell_selector)
inner_movietags = soup.select(title_cell_selector + " a")

In [12]:
# the text of each title link is the name of the movie
movie_names = [anchor.text for anchor in inner_movietags]

In [13]:
# the title attribute of each link lists the director followed by the actors
actors = [anchor["title"] for anchor in inner_movietags]

In [14]:
# year of release: last whitespace-separated token of the cell, parentheses removed
years = [
    cell.text.split()[-1].replace("(", "").replace(")", "")
    for cell in movietags
]

In [15]:
# tags in the poster column that carry the imdb rating
rating_tags = soup.select("td.posterColumn span[name = ir]")
# the rating is stored in the data-value attribute; keep two decimals
rating = [round(float(tag['data-value']), 2) for tag in rating_tags]

In [16]:
# creating a dictionary of all the list created above
dict = {'movie name': movie_names, 'actors': actors, 'rating': rating, "year": years} 

In [17]:
# converting the dictionary into dataframe
df = pd.DataFrame(dict)
df.head()

Unnamed: 0,movie name,actors,rating,year
0,The Shawshank Redemption,"Frank Darabont (dir.), Tim Robbins, Morgan Fre...",9.22,1994
1,The Godfather,"Francis Ford Coppola (dir.), Marlon Brando, Al...",9.15,1972
2,The Godfather: Part II,"Francis Ford Coppola (dir.), Al Pacino, Robert...",8.98,1974
3,The Dark Knight,"Christopher Nolan (dir.), Christian Bale, Heat...",8.97,2008
4,12 Angry Men,"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb",8.93,1957


In [18]:
# the director is the first comma-separated credit; drop its "(dir.)" tag
credit_lists = df["actors"].str.split(",")
df["director"] = [parts[0].replace("(dir.)", "").strip() for parts in credit_lists]

In [19]:
# changing the datatype of year from string to numeric
# pd.to_numeric is vectorised, so convert the whole column in one call
# rather than applying it to each element.
df["year"] = pd.to_numeric(df["year"])

In [20]:
# preview the first rows again, now including the new director column
df.head()

Unnamed: 0,movie name,actors,rating,year,director
0,The Shawshank Redemption,"Frank Darabont (dir.), Tim Robbins, Morgan Fre...",9.22,1994,Frank Darabont
1,The Godfather,"Francis Ford Coppola (dir.), Marlon Brando, Al...",9.15,1972,Francis Ford Coppola
2,The Godfather: Part II,"Francis Ford Coppola (dir.), Al Pacino, Robert...",8.98,1974,Francis Ford Coppola
3,The Dark Knight,"Christopher Nolan (dir.), Christian Bale, Heat...",8.97,2008,Christopher Nolan
4,12 Angry Men,"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb",8.93,1957,Sidney Lumet


In [28]:
# check the column types to confirm that year is now numeric
df.dtypes

movie name     object
actors         object
rating        float64
year            int64
director       object
dtype: object

In [29]:
# saving the dataframe as a csv file in the current working directory
# (to_csv is called with its defaults, so the row index is written as the
# first column)
df.to_csv("top_250_movies_imdb.csv")