# **Hubungkan ke Drive**

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs used by this notebook
# are read from and written to /content/drive/MyDrive/Dataset.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Scraping Review Film Interstellar**

In [None]:
!pip install beautifulsoup4
!pip install pandas
!pip install selenium
!pip install --upgrade selenium


In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
!pip install selenium
!apt-get update
!apt-get install chromium-browser
!apt install chromium-chromedriver

In [None]:
from selenium import webdriver

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from concurrent.futures import ThreadPoolExecutor
import time

def web_driver():
    """Create and return a Chrome WebDriver configured to run headless.

    The driver is started with a fixed set of command-line flags (listed
    below, applied in order). The caller owns the returned driver and is
    responsible for calling ``quit()`` on it.
    """
    chrome_flags = (
        "--verbose",
        '--no-sandbox',
        '--headless',
        '--disable-gpu',
        # NOTE(review): Chrome documents this value as "width,height" with no
        # space -- confirm the embedded space is tolerated.
        "--window-size=1920, 1200",
        '--disable-dev-shm-usage',
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    return webdriver.Chrome(options=chrome_options)

def _text_or_none(container, tag, css_class):
    """Return the text of the first ``tag`` with ``css_class`` inside ``container``, or None if absent."""
    element = container.find(tag, {"class": css_class})
    return element.text if element else None


def scrape_reviews(url, max_load_more_clicks=10):
    """Scrape user reviews from a review listing page.

    Opens ``url`` in a headless browser, clicks the "load more" button up to
    ``max_load_more_clicks`` times to expand the list, then parses every
    ``div.review-container`` found in the resulting page.

    Args:
        url: Address of the review page to load.
        max_load_more_clicks: Upper bound on "load more" clicks (default 10).

    Returns:
        A tuple ``(usernames, titles, reviews, dates)`` of four equally long
        lists, one entry per review container. An entry is None when the
        corresponding element is missing from that container.
    """
    driver = web_driver()
    try:
        driver.get(url)

        for _ in range(max_load_more_clicks):
            try:
                load_more_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "div.ipl-load-more.ipl-load-more--loaded button.ipl-load-more__button"))
                )

                ActionChains(driver).move_to_element(load_more_button).click().perform()

                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "review-container")))

            except Exception as e:
                # Best effort: a timeout here normally means the button is gone,
                # i.e. everything has been loaded. Stop clicking and parse what we have.
                print(f"No more content to load: {e}")
                break

        page_source = driver.page_source
    finally:
        # Always shut the browser down, even if loading the page failed,
        # so no Chrome process is left running.
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")
    containers = soup.find_all('div', {"class": "review-container"})

    usernames = []
    titles = []
    reviews = []
    dates = []

    for row in containers:
        usernames.append(_text_or_none(row, "span", "display-name-link"))
        titles.append(_text_or_none(row, "a", "title"))
        reviews.append(_text_or_none(row, "div", "text show-more__control"))
        dates.append(_text_or_none(row, "span", "review-date"))

    return usernames, titles, reviews, dates

In [None]:
# IMDb user-review page for Interstellar (title id tt0816692).
url = "https://www.imdb.com/title/tt0816692/reviews/?ref_=tt_ov_rt"
# Four parallel lists, one entry per scraped review container.
usernames, titles, reviews, dates = scrape_reviews(url)

# **Save Dataset to CSV**

In [None]:
import pandas as pd

# Assemble the scraped lists into one table; 'Tanggal' (Indonesian for "date")
# holds the review date.
df = pd.DataFrame({'Title' : titles, 'Username' : usernames, 'Tanggal' : dates, 'Review' : reviews})
# Persist the raw scrape to Drive; index=False keeps the row index out of the file.
df.to_csv('/content/drive/MyDrive/Dataset/IMDB_Review_Interstellar.csv', index = False, encoding = 'utf-8')

# **Open Dataset**

In [None]:
import numpy as np
import string
import re

In [None]:
# Reload the raw scrape from Drive so preprocessing can run without re-scraping.
data_review = pd.read_csv('/content/drive/MyDrive/Dataset/IMDB_Review_Interstellar.csv')
print(f'shape: {data_review.shape}')
# Preview a few rows with rich display instead of printing the whole frame.
data_review.head()

# **Case Folding & Cleaning Text**

In [None]:
# Handle on the raw review text column.
# NOTE(review): in the cells visible here `review` is not read before it is
# reassigned when the cleaned dataset is saved -- possibly a leftover.
review = data_review['Review']

In [None]:
def clean_text(text):
    """Normalise one raw review string for the later NLP steps.

    Applied in order: strip links, delete ASCII punctuation, replace every
    remaining character that is not an ASCII letter (digits, newlines, other
    symbols) with a space, lower-case, and blank out one-letter words.
    Runs of spaces are left as they are.

    Returns:
        The cleaned string, or None when ``text`` is not a ``str``
        (for example a NaN coming from an empty CSV cell).
    """
    if not isinstance(text, str):
        return None

    punctuation_remover = str.maketrans('', '', string.punctuation)

    # Remove links (http / https / www).
    without_links = re.sub(r"http\S+|www\S+|https\S+", "", text)
    # Delete punctuation outright, so "sci-fi" becomes "scifi".
    without_punctuation = without_links.translate(punctuation_remover)
    # Keep letters only: anything else, digits included, becomes a space.
    letters_only = re.sub('[^a-zA-Z]', ' ', without_punctuation)
    # Replace newlines with spaces.
    single_line = re.sub("\n", " ", letters_only)
    lowered = single_line.lower()
    # Blank out single-character words.
    return re.sub(r"\b[a-zA-Z]\b", " ", lowered)

In [None]:
# Clean every review; non-string entries (e.g. NaN) come back as None.
data_review['Cleaned Review'] = data_review['Review'].apply(clean_text)

# Drop rows without usable text. `.copy()` makes the result an independent
# frame, so the column assignments in the following cells do not trigger
# pandas' SettingWithCopyWarning.
data_review = data_review[data_review['Cleaned Review'].notna()].copy()

In [None]:
# Preview the first 20 rows, now including 'Cleaned Review'.
data_review.head(20)

# **Tokenizing**

In [None]:
from nltk.tokenize import RegexpTokenizer

# Token pattern: a run of word characters, a dollar amount such as "$20", or
# any other run of non-whitespace. The "$" has to be escaped -- unescaped it is
# the end-of-string anchor and that alternative can never match.
regexp = RegexpTokenizer(r'\w+|\$[0-9]+|\S+')
data_review['Tokenized'] = data_review['Cleaned Review'].apply(regexp.tokenize)

In [None]:
# Preview the first 20 rows, now including 'Tokenized'.
data_review.head(20)

# **Normalisasi**

In [None]:
import nltk
# Download the WordNet corpus before the lemmatizer is created and used.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance used by the normalization step.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def _lemmatize_tokens(tokens):
    """Lemmatize every token of one review, preserving token order."""
    return [lemmatizer.lemmatize(token) for token in tokens]


# NOTE(review): lemmatize() is called without a part-of-speech argument --
# confirm the default is what is wanted for verbs and adjectives.
data_review['Normalized'] = data_review['Tokenized'].apply(_lemmatize_tokens)

In [None]:
# Preview the first 20 rows, now including 'Normalized'.
data_review.head(20)

# **Remove Stopwords**

In [None]:
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
# The stop-word filter runs on tokens that were already lemmatized, so the
# lemmatized form of each stop word (e.g. "has" -> "ha") must be in the set
# too; otherwise those forms slip through the filter.
stop_words |= {lemmatizer.lemmatize(word) for word in stop_words}


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def _drop_stop_words(tokens):
    """Return the tokens of one review that are not in ``stop_words``, in order."""
    return [token for token in tokens if token not in stop_words]


data_review['Filtered'] = data_review['Normalized'].apply(_drop_stop_words)

In [None]:
# Preview the first 20 rows, now including 'Filtered'.
data_review.head(20)

# **Stemming**

In [None]:
from nltk.stem import PorterStemmer

# Shared Porter stemmer instance used by the stemming step.
porter_stemmer = PorterStemmer()

In [None]:
# Reduce every remaining token to its Porter stem, review by review.
data_review['Stemmed'] = data_review['Filtered'].apply(
    lambda tokens: list(map(porter_stemmer.stem, tokens))
)

In [None]:
# Preview the first 20 rows, now including 'Stemmed'.
data_review.head(20)

# **Join All the Words**

In [None]:
# Re-assemble each review's stemmed tokens into one space-separated string.
data_review['Final Review'] = data_review['Stemmed'].apply(' '.join)

In [None]:
# Preview the first 20 rows, now including 'Final Review'.
data_review.head(20)

# **Save the Cleaned Dataset Review**

In [None]:
# Wrap the processed frame (data_review is already a DataFrame) and write all
# of its columns to Drive; index=False keeps the row index out of the file.
# Note: the token-list columns are stored as text in the CSV, so they come back
# as strings rather than lists when the file is read again.
review = pd.DataFrame(data_review)
review.to_csv('/content/drive/MyDrive/Dataset/IMDB_Review_Interstellar_Cleaned.csv', index = False, encoding = 'utf-8')

# **Labeling Dataset Sentimen Menggunakan Model VADER**

In [None]:
# NOTE(review): neither transformers nor googletrans is imported in the cells
# visible here -- confirm these installs are still needed.
!pip install transformers
!pip install googletrans==3.1.0a0

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the preprocessed reviews written by the cleaning section; the sentiment
# labelling below works on the 'Final Review' column.
data_sentiment = pd.read_csv('/content/drive/MyDrive/Dataset/IMDB_Review_Interstellar_Cleaned.csv')

data_sentiment.head(20)

Unnamed: 0,Title,Username,Tanggal,Review,Cleaned Review,Tokenized,Normalized,Filtered,Stemmed,Final Review
0,Out of this world\n,kosmasp,31 May 2015,A lot has been said and written about Interste...,lot has been said and written about interste...,"['lot', 'has', 'been', 'said', 'and', 'written...","['lot', 'ha', 'been', 'said', 'and', 'written'...","['lot', 'ha', 'said', 'written', 'interstellar...","['lot', 'ha', 'said', 'written', 'interstellar...",lot ha said written interstellar obvious take ...
1,Masterpiece\n,aheaven2005,25 June 2022,A science-fiction masterpiece. Nolan executes ...,sciencefiction masterpiece nolan executes ...,"['sciencefiction', 'masterpiece', 'nolan', 'ex...","['sciencefiction', 'masterpiece', 'nolan', 'ex...","['sciencefiction', 'masterpiece', 'nolan', 'ex...","['sciencefict', 'masterpiec', 'nolan', 'execut...",sciencefict masterpiec nolan execut marvel dir...
2,Possibly the best movie of all time\n,theoledoux,6 April 2021,I think just about everything has been said ab...,think just about everything has been said ab...,"['think', 'just', 'about', 'everything', 'has'...","['think', 'just', 'about', 'everything', 'ha',...","['think', 'everything', 'ha', 'said', 'film', ...","['think', 'everyth', 'ha', 'said', 'film', 'st...",think everyth ha said film still tell masterpi...
3,I waited 5 years to watch it again\n,Ksa-2010,26 June 2019,After watching this insane movie in the theatr...,after watching this insane movie in the theatr...,"['after', 'watching', 'this', 'insane', 'movie...","['after', 'watching', 'this', 'insane', 'movie...","['watching', 'insane', 'movie', 'theatre', 'ba...","['watch', 'insan', 'movi', 'theatr', 'back', '...",watch insan movi theatr back swore god wait ye...
4,Masterpiece\n,e-jackson1985,8 May 2022,Amongst the best movies of all time. The story...,amongst the best movies of all time the story ...,"['amongst', 'the', 'best', 'movies', 'of', 'al...","['amongst', 'the', 'best', 'movie', 'of', 'all...","['amongst', 'best', 'movie', 'time', 'story', ...","['amongst', 'best', 'movi', 'time', 'stori', '...",amongst best movi time stori act script cinema...
5,7 years later\n,ravesch-83770,29 October 2021,Sometimes I just need to see the start. Or end...,sometimes just need to see the start or end ...,"['sometimes', 'just', 'need', 'to', 'see', 'th...","['sometimes', 'just', 'need', 'to', 'see', 'th...","['sometimes', 'need', 'see', 'start', 'end', '...","['sometim', 'need', 'see', 'start', 'end', 'tr...",sometim need see start end trailer music theme...
6,A journey across the galaxy to save humanity\n,Tweekums,27 January 2016,Set in a future where crop species are going e...,set in future where crop species are going e...,"['set', 'in', 'future', 'where', 'crop', 'spec...","['set', 'in', 'future', 'where', 'crop', 'spec...","['set', 'future', 'crop', 'specie', 'going', '...","['set', 'futur', 'crop', 'speci', 'go', 'extin...",set futur crop speci go extinct one anoth form...
7,Excellent Movie\n,frank-ancestor-hunter,6 April 2015,I judge a movie by how long it takes me to rea...,judge movie by how long it takes me to rea...,"['judge', 'movie', 'by', 'how', 'long', 'it', ...","['judge', 'movie', 'by', 'how', 'long', 'it', ...","['judge', 'movie', 'long', 'take', 'realize', ...","['judg', 'movi', 'long', 'take', 'realiz', 'ne...",judg movi long take realiz need bathroom long ...
8,Absolutely Brilliant\n,gavin6942,25 January 2015,A team of explorers travel through a wormhole ...,team of explorers travel through wormhole ...,"['team', 'of', 'explorers', 'travel', 'through...","['team', 'of', 'explorer', 'travel', 'through'...","['team', 'explorer', 'travel', 'wormhole', 'at...","['team', 'explor', 'travel', 'wormhol', 'attem...",team explor travel wormhol attempt ensur human...
9,I would rate 11/10\n,mysteryvoiceman,24 June 2018,I hadn't seen this but movie and caught it on ...,hadnt seen this but movie and caught it on ...,"['hadnt', 'seen', 'this', 'but', 'movie', 'and...","['hadnt', 'seen', 'this', 'but', 'movie', 'and...","['hadnt', 'seen', 'movie', 'caught', 'flight',...","['hadnt', 'seen', 'movi', 'caught', 'flight', ...",hadnt seen movi caught flight back dr one favo...


In [None]:
# Report (rows, columns) of the loaded dataset.
print(f'shape: {data_sentiment.shape}')

shape: (165, 10)


In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the VADER lexicon before the analyzer is instantiated.
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
sentiments = SentimentIntensityAnalyzer()

# A review whose 'Final Review' ended up empty is read back from the CSV as NaN;
# score it as empty text instead of failing on a non-string value.
final_reviews = data_sentiment["Final Review"].fillna("").astype(str)

# Score every review exactly once, then fan the four VADER components out
# into their own columns.
# NOTE(review): VADER is lexicon based and the input here is stemmed text
# (e.g. "masterpiec"); stemmed tokens may not match lexicon entries -- consider
# scoring the raw 'Review' column instead.
polarity = pd.DataFrame(
    [sentiments.polarity_scores(text) for text in final_reviews],
    index=data_sentiment.index,
    columns=["pos", "neg", "neu", "compound"],
)
data_sentiment["Positive"] = polarity["pos"]
data_sentiment["Negative"] = polarity["neg"]
data_sentiment["Neutral"] = polarity["neu"]
data_sentiment["Compound"] = polarity["compound"]
data_sentiment.head()

In [None]:
def _label_from_compound(compound):
    """Map one compound score to a label: >= 0.05 'Positif', <= -0.05 'Negatif', else 'Netral'."""
    if compound >= 0.05:
        return "Positif"
    if compound <= -0.05:
        return "Negatif"
    return "Netral"


score = data_sentiment["Compound"].values
sentiment = [_label_from_compound(value) for value in score]
data_sentiment["Sentiment"] = sentiment
data_sentiment.head(60)

In [None]:
# Install plotly and import its express API, used for the charts below.
!pip install plotly
import plotly.express as px



In [None]:
# Count reviews per sentiment label and expose the result as a two-column frame.
sentiment_counts = data_sentiment['Sentiment'].value_counts().reset_index()
sentiment_counts.columns = ['Sentiment', 'Count']

fig = px.bar(
    sentiment_counts,
    x='Sentiment',
    y='Count',
    color='Sentiment',
    labels={'Count': 'Total Count'},
    title='Sentiment Distribution',
)

# Write each bar's count just above the bar.
annotation_font = dict(color='black', size=12)
for label, count in zip(sentiment_counts['Sentiment'], sentiment_counts['Count']):
    fig.add_annotation(
        text=count,
        x=label,
        y=count + 0.1,
        showarrow=False,
        font=annotation_font,
    )

fig.show()

# **Analisis Emosi Menggunakan NRCLex**

In [None]:
# Load the preprocessed reviews again into a separate frame for the emotion
# analysis, so its added columns stay apart from the sentiment frame.
data_emosi = pd.read_csv('/content/drive/MyDrive/Dataset/IMDB_Review_Interstellar_Cleaned.csv')

data_emosi.head(20)

In [None]:
# Install NRCLex, used below to assign emotions to the reviews.
!pip install NRCLex

In [None]:
from nrclex import NRCLex
# NOTE(review): presumably NRCLex's text handling needs the 'punkt' tokenizer
# models -- confirm this download is required.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Dominant emotion(s) per review. A review whose 'Final Review' is empty is read
# back from the CSV as NaN (a float, not text), so skip those rows; they keep
# NaN in 'Emosi' instead of crashing the analyzer.
data_emosi['Emosi'] = data_emosi['Final Review'].dropna().apply(lambda x: NRCLex(x).top_emotions)

data_emosi.head(60)

In [None]:
# Pool all reviews into one document. NaN entries (empty reviews read back from
# the CSV) are dropped first because str.join only accepts strings.
all_reviews_text = ' '.join(data_emosi['Final Review'].dropna())

# Careful: `EmotionDf` (NRCLex result) and `EmotionDF` (frequency table) differ
# only by the case of the last letter.
EmotionDf = NRCLex(all_reviews_text)

emotion_frequencies = dict(EmotionDf.affect_frequencies)
# The frequencies contain both 'anticipation' and an 'anticip' key that stays at
# zero; drop the empty duplicate so it does not appear as a category in the charts.
if emotion_frequencies.get('anticip') == 0 and 'anticipation' in emotion_frequencies:
    del emotion_frequencies['anticip']

EmotionDF = pd.DataFrame.from_dict(emotion_frequencies, orient='index').sort_values(by=0, ascending=False).reset_index()

EmotionDF.columns = ['Emosi', 'Frequency']

print(EmotionDF.head(40))

           Emosi  Frequency
0       positive   0.233407
1   anticipation   0.149283
2          trust   0.138258
3       negative   0.123594
4            joy   0.105513
5           fear   0.068578
6        sadness   0.065380
7       surprise   0.050717
8          anger   0.037707
9        disgust   0.027563
10       anticip   0.000000


In [None]:
import plotly.express as px

# Pie chart of each emotion's share of the pooled reviews; slices are labelled
# inside with the emotion name and its percentage.
fig = px.pie(EmotionDF, values='Frequency', names='Emosi',
             title='Emotion Frequency For Reviews',
             hover_data=['Emosi'], labels={'Emosi': 'Emotion'})

fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [None]:
# Bar chart of the same emotion frequencies, one coloured bar per emotion.
bar_settings = dict(
    x='Emosi',
    y='Frequency',
    color='Emosi',
    title='Emotion Frequency For Reviews',
)
fig = px.bar(EmotionDF, **bar_settings)

# Print each bar's value (two decimals) above the bar.
fig.update_traces(texttemplate='%{y:.2f}', textposition='outside')

# Axis titles.
for set_axis, axis_title in ((fig.update_xaxes, 'Emosi'), (fig.update_yaxes, 'Frequency')):
    set_axis(title=axis_title)

fig.show()