## Analysis of Train to Busan (2020) Reviews

There are a few things I want to achieve through this analysis

I would like to 
* find keywords from the reviews
* get the average of reviews (/10)
* find similarities between reviews

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import bs4 as bs
import time

In [2]:
path = 'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(path)
driver.get('https://www.imdb.com/title/tt5700672/reviews?ref_=tt_ov_rt')

# on IMDb website, there aren't pages but load more buttons, so to grab the source code of all reviews, need to use selenium
while True:
    try:
        # load more until there is no more
        load_more = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'load-more-trigger'))
        )
        time.sleep(2)
        ActionChains(driver).move_to_element(load_more).click(load_more).perform()
    except:
        # store the source code of all reviews
        html = driver.page_source
        driver.quit()
        break

In [3]:
titles = []
ratings = []
reviews = []
reviews = []

# web scraping using BeautifulSoup
soup = bs.BeautifulSoup(html, 'lxml')
all_containers = soup.findAll('div', {'class':'lister-item-content'})
for container in all_containers:
    # title cannot be empty
    titles.append(container.a.text.strip())
    if container.div.span.span is None:
        ratings.append(0)
    else:
        ratings.append(container.div.span.span.text)
    if container.findChildren('div')[2].div is None:
        reviews.append('None')
    else:
        reviews.append(container.findChildren('div')[2].div.text)

In [4]:
frame = {'Title':titles, 'Rating':ratings, 'Review':reviews}
review_df = pd.DataFrame(frame)
pd.set_option('display.max_rows', None)
review_df.head(15)

Unnamed: 0,Title,Rating,Review
0,Zombie Revival,8,Train to Busan was a treat. A look into Selfis...
1,As Promised. Action-Packed Film!,0,
2,"Cheerleader. zombies, knocked up",8,The film has it all - greedy CEO capitalist pi...
3,Packed with action and populated by both good ...,0,
4,Excellent,8,This film has it all.ActionMoralityA lesson fo...
5,"Zombies, train, babes",9,I own it it really made me think a lot i enjoy...
6,Schoolgirl And Intrepid Infected Masses,10,You got it. This schoolgirl isn't from Texas a...
7,Good action film but...,8,Is this really a zombie film? I am not a genre...
8,I Love Kpop Girls,8,Amazing Film!!!\nWe used to Listen to Wonder G...
9,This is better than the last twenty films of A...,10,I'm just a civilian of the US but I do follow ...


In [5]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import gensim
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec

cleaned_review = []

for review in review_df['Review']:
    cleaned_doc = re.sub(r"[^A-Za-z ]+", ' ', review)
    cleaned_doc = cleaned_doc.replace('  ', ' ')
    cleaned_doc = cleaned_doc.lower()
    cleaned_review.append(cleaned_doc)

review_df['Cleaned Review'] = cleaned_review
review_df.shape

(701, 4)

In [6]:
review_df.head()

Unnamed: 0,Title,Rating,Review,Cleaned Review
0,Zombie Revival,8,Train to Busan was a treat. A look into Selfis...,train to busan was a treat a look into selfish...
1,As Promised. Action-Packed Film!,0,,none
2,"Cheerleader. zombies, knocked up",8,The film has it all - greedy CEO capitalist pi...,the film has it all greedy ceo capitalist pig...
3,Packed with action and populated by both good ...,0,,none
4,Excellent,8,This film has it all.ActionMoralityA lesson fo...,this film has it all actionmoralitya lesson fo...


In [7]:
useless_rows = review_df.loc[review_df['Review'] == 'None'].index
review_df.drop(useless_rows, inplace=True)
review_df.reset_index(drop=True, inplace=True)
review_df.shape

(569, 4)

In [8]:
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(review_df['Cleaned Review'])
length = tfidf_matrix.shape[0]
similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix)

for i in range(tfidf_matrix.shape[0]):
    similarity = cosine_similarity(tfidf_matrix[i], tfidf_matrix)
    print(f'{round(similarity.mean()*100, 4)}%')

2.8655%
2.0312%
0.8743%
3.1697%
1.3481%
3.1504%
2.1791%
3.782%
3.102%
1.5049%
5.5859%
1.8252%
3.5034%
5.1553%
4.3288%
2.6388%
5.5446%
6.4604%
6.0507%
7.138%
1.8508%
2.0686%
4.4032%
4.0105%
4.9315%
4.1225%
2.6501%
7.0889%
5.3218%
2.2069%
3.638%
7.4558%
7.5331%
4.8172%
1.5755%
4.1716%
5.0909%
8.26%
5.7034%
3.4309%
5.6852%
4.4742%
3.672%
4.485%
5.4114%
3.073%
4.6009%
4.6463%
6.7517%
3.5011%
5.239%
7.0986%
9.1599%
6.1616%
6.9562%
4.9002%
3.8133%
5.8163%
3.9272%
5.7167%
9.8141%
5.3937%
5.1778%
5.4559%
5.1524%
6.0019%
7.6753%
6.8475%
3.9263%
5.891%
4.8371%
7.5097%
7.2217%
9.0933%
5.9929%
8.6203%
5.2672%
7.0884%
5.8937%
6.9844%
5.1397%
8.9858%
4.8793%
7.5655%
1.7251%
4.0611%
4.1906%
4.1164%
5.2286%
3.6753%
4.7274%
5.1278%
3.1912%
4.1226%
6.6297%
4.9423%
4.4599%
6.1339%
6.4077%
4.5522%
4.8224%
3.9105%
4.5571%
3.7003%
7.1898%
3.8428%
6.5597%
5.8413%
7.1163%
4.4728%
4.1929%
3.7954%
5.3666%
3.4587%
4.7583%
4.8338%
4.969%
5.766%
1.8196%
4.6709%
4.7734%
4.3889%
5.0196%
5.7496%
4.1099%
3.2977%
4.896

In [13]:
review_df.head()

Unnamed: 0,Title,Rating,Review,Cleaned Review
0,Zombie Revival,8,Train to Busan was a treat. A look into Selfis...,train to busan was a treat a look into selfish...
1,"Cheerleader. zombies, knocked up",8,The film has it all - greedy CEO capitalist pi...,the film has it all greedy ceo capitalist pig...
2,Excellent,8,This film has it all.ActionMoralityA lesson fo...,this film has it all actionmoralitya lesson fo...
3,"Zombies, train, babes",9,I own it it really made me think a lot i enjoy...,i own it it really made me think a lot i enjoy...
4,Schoolgirl And Intrepid Infected Masses,10,You got it. This schoolgirl isn't from Texas a...,you got it this schoolgirl isn t from texas an...


In [26]:
bags_of_words = [TaggedDocument(word_tokenize(doc), [i]) for i, doc in enumerate(review_df['Cleaned Review'])]
model = Doc2Vec(vector_size=30, min_count=1, negative=10, workers=4)
model.build_vocab([x for x in bags_of_words])
model.train(bags_of_words, total_examples=len(bags_of_words), epochs=30)
model.wv.most_similar('zombies')

[('skinless', 0.8112742900848389),
 ('damned', 0.7764367461204529),
 ('sped', 0.7639884948730469),
 ('crowded', 0.7616419196128845),
 ('ones', 0.7357720136642456),
 ('move', 0.729682445526123),
 ('steroids', 0.7291717529296875),
 ('uninfected', 0.7210514545440674),
 ('things', 0.7060855627059937),
 ('deprived', 0.7056760191917419)]

In [27]:
for doc in review_df['Cleaned Review']:
    print(model.wv.n_similarity(review_df['Cleaned Review'][0].split(), doc.split()))

1.0
0.88639575
0.89808774
0.763178
0.7110362
0.9311526
0.8637819
0.797712
0.8806666
0.8148199
0.8702615
0.83659023
0.8745178
0.8471904
0.8544904
0.8350518
0.905361
0.8634278
0.88385236
0.88549036
0.8649215
0.8375974
0.8847565
0.94594634
0.9086208
0.9363015
0.8099438
0.9371197
0.88766974


KeyError: "word 'cannot' not in vocabulary"