In [55]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import defaultdict
from itertools import combinations
#from ggplot import *
import unicodedata
from textblob import TextBlob
import nltk
#from itertools import izip
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
%matplotlib inline 
matplotlib.style.use('ggplot') 

In [21]:
def requestPage(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled server
        would block the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup or None
        The parsed page, or ``None`` when the download or the parsing raised.
        The error is printed rather than re-raised, so callers must check
        for ``None`` before using the result.
    """
    try:
        response = requests.get(url, timeout=timeout)
        return BeautifulSoup(response.text, 'html.parser')
    except Exception as e:
        # Exception already covers ConnectionError, so listing both was redundant.
        print ("Exception is :", e)
        return None
       

def extractRatings(page,i):
    """Collect the text of every rating bar on ``page``.

    Each element matched by the rating-bar selector contributes one entry:
    its text with newlines removed, encoded as UTF-8 bytes. Entries are keyed
    by consecutive integers starting at ``i + 1``.

    Returns a dict mapping int -> bytes (empty when nothing matches).
    """
    rating_bars = page.select('div[class="ipl-ratings-bar"] ')
    return {
        i + position: bar.text.replace('\n','').encode('utf-8')
        for position, bar in enumerate(rating_bars, start=1)
    }

def extractReviews(page,i):
    """Collect review texts from ``page``, skipping boilerplate blocks.

    Every element matched by the content selector is flattened to text with
    newlines turned into spaces. Known boilerplate phrases are rewritten to
    the sentinel "IGNORE"; any text containing that sentinel is dropped.
    Kept texts are UTF-8 encoded and keyed by consecutive integers starting
    at ``i + 1`` (the key only advances for kept texts).

    Returns a dict mapping int -> bytes.
    """
    boilerplate = (
        "*** This review may contain spoilers ***",
        "found this helpful.",
        "Add another review",
    )
    kept = {}
    next_key = i + 1
    for node in page.select('div[class="content"] div'):
        text = node.get_text().replace("\n"," ")
        for phrase in boilerplate:
            text = text.replace(phrase, "IGNORE")
        if "IGNORE" in text:
            continue
        kept[next_key] = text.encode('utf-8')
        next_key += 1
    return kept

def tokenize(text):
    """Split ``text`` into lower-cased word tokens that contain a letter.

    The text is first split into sentences and each sentence into words, so
    punctuation comes out as separate tokens; tokens without any ASCII letter
    (numbers, bare punctuation) are then discarded.

    NOTE: a later cell defines ``tokenize`` again (adding lemmatisation);
    once that cell has run, it replaces this definition.
    """
    lowered = (
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    )
    return [token for token in lowered if re.search('[a-zA-Z]', token)]

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` a "Topic #k:" header is printed,
    followed by the ``n_top_words`` terms with the largest weights (largest
    first) separated by single spaces. One blank line is printed after the
    last topic.

    NOTE: a later cell defines ``print_top_words`` again with a comma
    separator; once that cell has run, it replaces this definition.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries
        ranked = topic.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print("Topic #%d:" % topic_idx)
        print(" ".join(top_terms))
    print()

In [31]:
url = "http://www.imdb.com/title/tt0120338/reviews?ref_=tt_ql_3"

# Offsets handed to the extractors; each one shifts the row labels of the
# batch scraped in that pass (labels start at offset + 1).
pagination = [0,5,10,15,20,25,30,35,40,45,50]

# One two-column frame (ratings, reviews) per successfully fetched page.
# Frames are gathered in a list and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every pass.
page_frames = []

for counter in pagination:

    # NOTE(review): the offset is never folded into the URL, so every pass
    # downloads the same page and the combined frame holds the same reviews
    # repeated under shifted labels. The site's real paging parameter needs to
    # be confirmed before this loop can fetch distinct pages.
    positive_url = url

    # Request url for reviews
    responses_positive = requestPage(positive_url)
    if responses_positive is None:
        # requestPage prints the error and returns None on failure; skip this
        # pass instead of crashing inside the extractors.
        continue

    # Extract ratings and reviews (both are dicts keyed by row label)
    ratings_positive  = extractRatings(responses_positive,counter)
    reviews_positive  = extractReviews(responses_positive,counter)

    # Two dict rows transposed -> one row per label, column 0 = rating, column 1 = review
    df_p = pd.DataFrame([ratings_positive,reviews_positive]).T
    page_frames.append(df_p)

# Master dataframe for text analytics (row labels are kept as produced per page)
imdb = pd.concat(page_frames) if page_frames else pd.DataFrame()

print ("\n Extracted ratings & reviews")


 Extracted ratings & reviews


In [5]:
# Sanity check: (rows, columns) of the combined review frame
imdb.shape

(275, 2)

In [32]:
# Name the two positional columns produced by the scraping cell
imdb.columns = ['ratings', 'reviews']
# Order rows by label; labels are not unique (the preview below shows repeated values)
imdb = imdb.sort_index()
# Plain-text preview of the first ten rows
print(imdb.head(10))


    ratings                                            reviews
1  b'10/10'  b"Very beautiful and cinematic movie with lots...
2  b'10/10'  b'Titanic is one of my all time favourite film...
3  b'10/10'  b'You can watch this movie in 1997, you can wa...
4  b'10/10'  b"Ah, yes, the film that propelled Leonardi Di...
5  b'10/10'  b'Good Lord. This movie right here, it\'s a ma...
6  b'10/10'  b"Very beautiful and cinematic movie with lots...
6  b'10/10'  b'Every once in a while the conversation will ...
7  b'10/10'  b'Back in 1997, do I remember that year: Clint...
7  b'10/10'  b'Titanic is one of my all time favourite film...
8  b'10/10'  b"To all the miserable people who have done ev...


In [7]:
# Rich-display preview of the first rows of the review frame
imdb.head()

Unnamed: 0,ratings,reviews
1,b'10/10',"b""Very beautiful and cinematic movie with lots..."
2,b'10/10',b'Titanic is one of my all time favourite film...
3,b'10/10',"b'You can watch this movie in 1997, you can wa..."
4,b'10/10',"b""Ah, yes, the film that propelled Leonardi Di..."
5,b'10/10',"b'Good Lord. This movie right here, it\'s a ma..."


In [33]:
# Decode the scraped UTF-8 byte strings into text.
# Ratings stay as 'x/10' strings; no numeric conversion is performed in this cell.
def _bytes_to_text(value):
    """Return ``value`` decoded as UTF-8 when it is ``bytes``; pass anything else through."""
    return value.decode('utf-8') if isinstance(value, bytes) else value

# Only bytes are decoded, so re-running this cell leaves already-decoded text
# (and missing values) untouched instead of pushing them through a decode again.
imdb['ratings'] = imdb['ratings'].map(_bytes_to_text)
imdb['reviews'] = imdb['reviews'].map(_bytes_to_text)

In [34]:
# Preview the decoded frame; show a slice rather than the whole frame
imdb.head(10)

Unnamed: 0,ratings,reviews
1,10/10,Very beautiful and cinematic movie with lots o...
2,10/10,Titanic is one of my all time favourite films....
3,10/10,"You can watch this movie in 1997, you can watc..."
4,10/10,"Ah, yes, the film that propelled Leonardi DiCa..."
5,10/10,"Good Lord. This movie right here, it's a maste..."
6,10/10,Very beautiful and cinematic movie with lots o...
6,10/10,Every once in a while the conversation will tu...
7,10/10,"Back in 1997, do I remember that year: Clinton..."
7,10/10,Titanic is one of my all time favourite films....
8,10/10,To all the miserable people who have done ever...


In [35]:
# English stop-word list from the NLTK corpus
stop_words = nltk.corpus.stopwords.words('english')

# Extra entries: apostrophe fragments such as 'll / n't plus common review words
extended_stopwords = [
    '\'ll','\'d','\'m','\'re','\'s','\'ve','ca n\'t','r','n\'t','ca',
    'see','get','movies','movie','go','say','come','many','another','could',
    'would','made','really','want','even','odd','films','plot','ever','actually',
    'also','movie','film',
]

# NOTE: a later cell rebuilds `stops` from scratch (stopwords.words('english') + newstops)
stops = stop_words + extended_stopwords

In [39]:
# Sentiment analysis: attach TextBlob's polarity score to every review.
# (A subjectivity column could be added the same way via `.sentiment.subjectivity`.)
imdb['polarity'] = imdb.reviews.apply(lambda s: TextBlob(s).sentiment.polarity)

# Only the last expression of a cell is displayed, so a single preview is kept
# (the earlier `imdb.head(15)` result was silently discarded).
imdb.tail(10)

Unnamed: 0,ratings,reviews,polarity
68,10/10,It's really quite odd. When Titanic first came...,0.183944
69,,Such a beautiful love story. one of the most l...,0.308854
69,10/10,The best romance I've ever seen A very strong ...,0.787778
70,10/10,It's a crying freaking shame that this outstan...,0.168599
70,,"When you have a film this big and successful, ...",0.168932
71,10/10,It really physically hurts me when i don't see...,0.285317
72,,I am still crying as I am writing this review ...,0.310338
73,,"I don't know how to explain my feelings, but t...",0.0
74,,Such a beautiful love story. one of the most l...,0.308854
75,,"When you have a film this big and successful, ...",0.168932


In [118]:
# Additional stop words: domain terms (titanic, film, ...) and apostrophe-free
# contraction spellings such as "dont" / "youre"
newstops = [
    'ive','u','one','many','every','titanic','film', 'movie', 'cinema',
    'doe', 'ha', 'might', 'must', 'need', 'sha', 'wa', 'wo',
    'arent', 'couldnt', 'didnt', 'doesnt', 'dont', 'hadnt', 'hasnt', 'havent',
    'isnt', 'mightnt', 'mustnt', 'neednt', 'shant', 'shes', 'shouldnt', 'shouldve',
    'thatll', 'wasnt', 'werent', 'wont', 'wouldnt', 'youd', 'youll', 'youre', 'youve',
]

# Start again from NLTK's English list (replacing any earlier `stops`) and grow it in place
stops = stopwords.words('english')
stops += newstops

In [40]:
def tokenize(text):
    """Split ``text`` into lower-cased, lemmatised tokens that contain a letter.

    The text is split into sentences and then words (so punctuation becomes
    its own token), every word is lower-cased and passed through a
    WordNetLemmatizer, and results without any ASCII letter (numbers, bare
    punctuation) are dropped.
    """
    lemm = WordNetLemmatizer()
    lemmas = (
        lemm.lemmatize(word.lower())
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    )
    return [lemma for lemma in lemmas if re.search('[a-zA-Z]', lemma)]

In [119]:
def _remove_noise(input_text):
    lemm = WordNetLemmatizer()
    #input_text = str(input_text).encode('ascii', 'ignore')
    #input_text = str(input_text).replace(",", "")
   # input_text = str(input_text).replace("\'", "")
   # input_text = str(input_text).replace("\\", "")
    input_text = str(input_text).replace("-", "")
   # input_text = str(input_text).replace(".", "")
    input_text = re.sub("\d+","", input_text)
    input_text = re.sub("[^a-zA-Z0-9 ]","", input_text)
    words = str(input_text).split()
    noise_free_words = [word for word in words if word.lower() not in stops]
    lower_words = [word.lower() for word in noise_free_words]
    lower_words = [lemm.lemmatize(w) for w in lower_words]
    return lower_words

In [114]:
def print_top_words(model, feature_names, n_top_words, sep=","):
    """Print the highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_`` (one row of term weights per topic).
    feature_names : sequence of str
        Term for each column of ``components_``.
    n_top_words : int
        Number of terms to list per topic, largest weight first.
    sep : str, optional
        Separator placed between terms. Defaults to "," so existing calls
        print exactly as before; pass " " for space-separated output.

    Prints a "Topic #k:" header and the joined terms for each topic, then one
    blank line after the last topic. Returns None.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending: walk backwards from the end to get the largest weights first
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(sep.join(feature_names[i] for i in top_indices))
    print()

In [141]:
term_idf_vectorizer       = TfidfVectorizer(max_df=0.95, max_features=2000,min_df=0.05, stop_words=stops, tokenizer=_remove_noise, ngram_range=(2,3))
%time term_idf_matrix     = term_idf_vectorizer.fit_transform(imdb.reviews) 
term_idf_feature_names    = term_idf_vectorizer.get_feature_names()
term_idf_matrix.shape

Wall time: 1.03 s


(275, 102)

In [142]:
print(term_idf_feature_names)

# Summed TF-IDF weight of each phrase across all reviews, heaviest first.
# The names collected when the vectorizer was fitted are reused here instead of
# calling get_feature_names() again (that method was removed in scikit-learn 1.2).
tf = zip(term_idf_feature_names,np.asarray(term_idf_matrix.sum(axis=0)).ravel())
term_frequency = sorted(tf,key=lambda x: x[1],reverse=True)
term_frequency[0:10]

['academy award', 'actual sinking', 'amazing even', 'back remember', 'bad celine', 'best ever', 'best ever seen', 'best picture', 'billy zane', 'blue diamond', 'cal hockley', 'called heart', 'called heart ocean', 'cameron show', 'camerons gigantic', 'celine dions', 'class passenger', 'dewitt bukater', 'diamond called', 'diamond called heart', 'dicaprio kate', 'dicaprio kate winslet', 'director james', 'director james cameron', 'ever made', 'ever seen', 'fall love', 'first class', 'first class passenger', 'first time', 'full life', 'give look', 'great acting', 'great great', 'heart ocean', 'hit iceberg', 'horners music', 'ill give', 'im king', 'im king world', 'jack dawson', 'jack rose', 'james cameron', 'james camerons', 'james horners', 'james horners music', 'kate leo', 'kate winslet', 'kathy bates', 'key ingredient', 'king world', 'leonardo dicaprio', 'let face', 'look like', 'lost life', 'love even', 'love hate', 'love story', 'maiden voyage', 'make even', 'molly brown', 'much mone

[('ever seen', 24.069198371706104),
 ('love story', 21.176958297492686),
 ('first time', 18.49613172390732),
 ('kate winslet', 17.61579704530124),
 ('james cameron', 16.125410366061356),
 ('much money', 14.84640744065326),
 ('james camerons', 14.263652525374415),
 ('story ever', 14.110020974373924),
 ('ever made', 13.554799839550212),
 ('first class', 13.266682936426198)]

In [143]:
## generating topics from LDA algorithm 
lda = LatentDirichletAllocation(n_components=2, max_iter=20,learning_method='online',learning_offset=20,random_state=1)
%time lda.fit(term_idf_matrix)
print("\nTopics using Latent Dirichlet Allocation model with Term frequencies: \n")
print_top_words(lda, term_idf_feature_names, 10)

Wall time: 1.31 s

Topics using Latent Dirichlet Allocation model with Term frequencies: 

Topic #0:
ever seen,ever made,much money,love story,first time,story ever,let face,year later,great great,kate winslet
Topic #1:
first class,james camerons,billy zane,james horners,molly brown,special effect,james cameron,though know,class passenger,kathy bates



In [144]:
## Document-topic matrix: one row per review, one column per topic
# `lda` was already fitted on this matrix in the previous cell, so it is only
# projected here; fit_transform would retrain the model a second time.
lda_Z = lda.transform(term_idf_matrix)
print(lda_Z.shape)

(275, 2)


In [145]:
## Feature extraction vs labelling 
# Topic weights of the first five reviews (one row per review, one column per topic)
lda_Z[0:5]

array([[0.5       , 0.5       ],
       [0.80770119, 0.19229881],
       [0.83388867, 0.16611133],
       [0.2389188 , 0.7610812 ],
       [0.85284795, 0.14715205]])

In [147]:
## future classification of documents
# Project the corpus with the already-fitted model. The former chained assignment
# (`lda_Z = lda_modellda_Z = ...`) created a stray variable and retrained the model.
lda_Z = lda.transform(term_idf_matrix)
print(lda_Z.shape)

# Score an unseen review: vectorise it with the fitted TF-IDF vocabulary, then
# read its topic weights (first and only row of the result).
text = "The movie is a boring and full of artificail characters not worth watching, bizzare love story "
x = lda.transform(term_idf_vectorizer.transform([text]))[0]
print(x, x.sum())

(275, 2)
[0.71359744 0.28640256] 1.0
