In [36]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import itertools
#from imdbUtils import *

pd.options.display.max_colwidth=2000

In [43]:
import requests
from bs4 import BeautifulSoup

def getSoup(url):
    """
    Utility function which takes a url and returns a Soup object.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    return soup

def minMax(a):
    '''Returns the index of negative and positive review.'''
    
    # get the index of least rated user review
    minpos = a.index(min(a))
    
    # get the index of highest rated user review
    maxpos = a.index(max(a))
    
    return minpos, maxpos

def getReviews(soup):
    '''Function returns a negative and positive review for each movie.'''
    
    # get a list of user ratings
    user_review_ratings = [tag.previous_element for tag in 
                           soup.find_all('span', attrs={'class': 'point-scale'})]
    
    
    # find the index of negative and positive review
    n_index, p_index = minMax(list(map(int, user_review_ratings)))
    
    
    # get the review tags
    user_review_list = soup.find_all('a', attrs={'class':'title'})
    
    
    # get the negative and positive review tags
    n_review_tag = user_review_list[n_index]
    p_review_tag = user_review_list[p_index]
    
    # return the negative and positive review link
    n_review_link = "https://www.imdb.com" + n_review_tag['href']
    p_review_link = "https://www.imdb.com" + p_review_tag['href']
    
    return n_review_link, p_review_link

def getReviewText(review_url):
    '''Returns the user review text given the review url.'''
    
    # get the review_url's soup
    soup = getSoup(review_url)
    
    # find div tags with class text show-more__control
    tag = soup.find('div', attrs={'class': 'text show-more__control'})
    
    return tag.getText()

def getMovieTitle(review_url):
    '''Returns the movie title from the review url.'''
    
    # get the review_url's soup
    soup = getSoup(review_url)
    
    # find h1 tag
    tag = soup.find('h1')
    
    return list(tag.children)[1].getText()

def getNounChunks(user_review):
    
    # create the doc object
    doc = nlp(user_review)
    
    # get a list of noun_chunks
    noun_chunks = list(doc.noun_chunks)
    
    # convert noun_chunks from span objects to strings, otherwise it won't pickle
    noun_chunks_strlist = [chunk.text for chunk in noun_chunks]
    
    return noun_chunks_strlist

In [44]:
url = '''https://www.imdb.com/search/title/?title_type=feature&num_votes=10000,&year=2013,2020&sort=popularity,desc&count=250&genres=sci-fi'''

In [45]:
# get the soup object for main api url
movies_soup = getSoup(url)

In [46]:
# find all a-tags with class:None
movie_tags = movies_soup.find_all('a', attrs={'class': None})

# filter the a-tags to get just the titles
movie_tags = [tag.attrs['href'] for tag in movie_tags 
              if tag.attrs['href'].startswith('/title') & tag.attrs['href'].endswith('/')]

# remove duplicate links
movie_tags = list(dict.fromkeys(movie_tags))

print("There are a total of " + str(len(movie_tags)) + " movie titles")
print("Displaying 10 titles")
movie_tags[:10]

There are a total of 250 movie titles
Displaying 10 titles


['/title/tt1427298/',
 '/title/tt4244162/',
 '/title/tt2436516/',
 '/title/tt2358592/',
 '/title/tt2414766/',
 '/title/tt3774790/',
 '/title/tt2263944/',
 '/title/tt2668134/',
 '/title/tt3819668/',
 '/title/tt4195278/']

In [47]:
# movie links
base_url = "https://www.imdb.com"
movie_links = [base_url + tag + 'reviews' for tag in movie_tags]
print("There are a total of " + str(len(movie_links)) + " movie user reviews")
print("Displaying 10 user reviews links")
movie_links[:10]

There are a total of 250 movie user reviews
Displaying 10 user reviews links


['https://www.imdb.com/title/tt1427298/reviews',
 'https://www.imdb.com/title/tt4244162/reviews',
 'https://www.imdb.com/title/tt2436516/reviews',
 'https://www.imdb.com/title/tt2358592/reviews',
 'https://www.imdb.com/title/tt2414766/reviews',
 'https://www.imdb.com/title/tt3774790/reviews',
 'https://www.imdb.com/title/tt2263944/reviews',
 'https://www.imdb.com/title/tt2668134/reviews',
 'https://www.imdb.com/title/tt3819668/reviews',
 'https://www.imdb.com/title/tt4195278/reviews']

In [48]:
# get a list of soup objects
movie_soups = [getSoup(link) for link in movie_links]

# get all 500 movie review links
movie_review_list = [getReviews(movie_soup) for movie_soup in movie_soups]

movie_review_list = list(itertools.chain(*movie_review_list))
print(len(movie_review_list))

print("There are a total of " + str(len(movie_review_list)) + " individual movie reviews")
print("Displaying 10 reviews")
movie_review_list[:10]

500
There are a total of 500 individual movie reviews
Displaying 10 reviews


['https://www.imdb.com/review/rw2928847/',
 'https://www.imdb.com/review/rw2938108/',
 'https://www.imdb.com/review/rw3523953/',
 'https://www.imdb.com/review/rw4040522/',
 'https://www.imdb.com/review/rw2925799/',
 'https://www.imdb.com/review/rw2795797/',
 'https://www.imdb.com/review/rw2917639/',
 'https://www.imdb.com/review/rw2865282/',
 'https://www.imdb.com/review/rw6110271/',
 'https://www.imdb.com/review/rw2841172/']

In [49]:
# get review text from the review link
review_texts = [getReviewText(url) for url in movie_review_list]

# get movie name from the review link
movie_titles = [getMovieTitle(url) for url in movie_review_list]

# label each review with negative or positive
review_sentiment = np.array(['negative', 'positive'] * (len(movie_review_list)//2))

# construct a dataframe
#df_darma = pd.DataFrame({'movie': movie_titles, 'user_review_permalink': movie_review_list,
             #'user_review': review_texts, 'sentiment': review_sentiment})

In [50]:
df_scifi = pd.DataFrame({'movie': movie_titles, 'user_review': review_texts, 'sentiment': review_sentiment})

In [51]:
df_scifi.head()

Unnamed: 0,movie,user_review,sentiment
0,The Human Race,"i'm honestly just writing this because of the high score and 2 rave reviews obviously written by people involved with the project. i feel like i was duped into watching it. so here's an honest review: this is a terrible movie. it's poorly written, and badly acted. only exception is the deaf girl. she wasn't bad, given the material she had to work with. it's just lazy filmmaking, seems like. says nothing about the human condition (as purported from the existing reviews), only the sophomoric insight of a film-school reject who saw one too many eli roth movies. seriously, avoid at all costs.",negative
1,The Human Race,"The first thing you need to know about this film is that it is low- budget, funded by a kickstarter project and it doesn't have a single person in it that is known in the movie world. That being said, this film is a brilliant indie that deserves to be in the spot-light. Smart, creative, well-made and seriously dark and disturbing, The Human Race restores my faith in filmmaking and the movie industry. Although this film is not for the film-goer who can only see films with bloated budgets and celebra-whores, the movie goers that do appreciate a film that takes risks and takes the honest yet disturbing route of displaying characters in ways that you're definitely not use to seeing, you will love and appreciate this film as much as I did. This film takes the classic Battle Royale storyline and pulls it deeper into the horror genre by showing a much darker side of the survival of the fittest as well as creating imagery that borders on comic book fantasy. Setting aside the awesome storyline and the genius way in which the film tells the tale, the director/writer respectfully showcases characters that are disabled yet are the strongest competitors and yet are also the weakest. I am in serious awe of this film and I hope others will enjoy it and see it for what it is as much as I do.Please like me on Facebook! You can read more of my reviews and get info on the latest movies in horror:http://www.facebook.com/pages/I-Heart-Horror/338327476286206",positive
2,Beta Test,"After seeing the rating for this on here (currently at 7.0, although I suspect that will change) I thought that this might actually be really good.But uhm, let's just say I think the director have a lot of good friends who are kind with their ratings.It's not completely awful, but it's not that great either.I've always liked Larenz Tate so it's cool to see him in something new, so there is always that, and he hasn't aged a day in 20 years (he's 41 believe it or not).He plays a gamer who's been given a beta version of the latest game bound to conquer global fandom, and the plot is decent enough but it's just such a low-budget film and that shows in every aspect of the movie.First of all the ""game"" that he's playing looks like it's well over 10 years old, and it simply doesn't make any sense why they couldn't just use real footage in the game scenes instead. Then at least Larenz would have a clear reason to be so blown away by the details in the game.And the ""twist"" about the game of course would have made A lot more sense too.Kevon Stover is one of the bad guys, and his attempts at being so stereotypically tough would make even Hulk Hogan cringe. I suppose that could be intentional to get a bit of 90's video-game feel to the character. But yeah, he's a chippendale appearantly which doesn't come as a surprise at all.The first hour is still semi-watchable if you manage to look past the flaws but the last 25 minutes is a bit of a drag to say the least.",negative
3,Beta Test,"It sounded good, it started OK and then it got awful.I watched it to the end which is the only reason it didn't get a 1 star but it is awful, awful acting, awful script, awful production values just awful.",positive
4,Go Goa Gone,"Go Goa Gone is truly one of the most hilarious Hindi films since Delhi Belly. It's a zombie comedy, with lots of blood and gore, so it's in no way for the squeamish, and the humor is quite mature, so it's NOT a child-friendly watch, but, hey, it's the subject matter itself which isn't really kid-suitable.Nevertheless, Go Goa Gone sets out to bring a new genre into Indian cinema with great gusto, and it fails to fail. With rib-tickling lines, great comic acting by Kunal Khemu, Vir Das, Anand Tiwari, and even Saif Ali Khan as the quirky, macho Boris (soon to be one of the greatest cult characters of all of Hindi cinema), Go Goa Gone FORCES you to laugh effortlessly.Yes, the plot gets a little dragging at the second half, and Puja Gupta's acting isn't very novel (although she looks great), but mostly, Go Goa Gone just always makes you laugh at one point or the other. I was chuckling at every other moment. The zombies are well-made, for a first in Bollywood, and nothing looks cheap. The action is pretty nice, and the songs are quirky and trippy.I highly recommend this to anyone who wants to have a good laugh-fest. Except the kids, of course...",negative


In [52]:
df_scifi.shape

(500, 3)

In [53]:
df_scifi.to_csv('scifi.csv', index=False)