<a href="https://colab.research.google.com/github/wooihaw/ERA3036_T2310/blob/main/Chapter_3/Chapter_3_Examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Example 1
### Movie Review Sentiment Analysis

In [9]:
# Load movie reviews
import sys
import requests
from io import BytesIO
from zipfile import ZipFile
from sklearn.datasets import load_files


# Download the zipped movie-review corpus. The timeout keeps the cell from
# hanging indefinitely if the server never answers.
response = requests.get(
    "https://raw.githubusercontent.com/wooihaw/datasets/main/movie_reviews.zip",
    timeout=60,
)

# Stop right here on a failed download. Only printing a message would leave
# `moviedir` undefined and the real problem would surface below as a NameError.
if response.status_code != 200:
    raise RuntimeError(
        "Failed to retrieve the file (HTTP status %d)" % response.status_code
    )

# Determine the environment
is_colab = 'google.colab' in sys.modules

# Pick where to unpack the archive and where the extracted dataset will live.
if is_colab:
    # Google Colab environment
    extract_dir = "sample_data/"
    moviedir = 'sample_data/movie_reviews'
else:
    # Local Jupyter Notebook environment
    extract_dir = "../data/"
    moviedir = '../data/movie_reviews/'

# Unpack directly from the in-memory bytes; the context manager closes the
# archive once extraction is done.
with ZipFile(BytesIO(response.content)) as archive:
    archive.extractall(extract_dir)

# Load the extracted reviews as a labelled text dataset (shuffled).
movies = load_files(moviedir, shuffle=True)

In [10]:
# Target names (the class labels) are generated automatically from the
# subfolder names of the dataset directory
print(movies.target_names)

['neg', 'pos']


In [11]:
# Hold out 20% of the reviews as a test set; the fixed random_state keeps the
# split repeatable between runs
from sklearn.model_selection import train_test_split

docs_train, docs_test, y_train, y_test = train_test_split(
    movies.data,
    movies.target,
    test_size=0.20,
    random_state=12,
)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Count unigrams and bigrams (ngram_range=(1, 2)); min_df=2 drops terms that
# occur in fewer than two training documents
movieVzer = CountVectorizer(min_df=2, ngram_range=(1, 2))

# Fit the vocabulary on the training text and transform it into term counts
docs_train_counts = movieVzer.fit_transform(docs_train)

In [13]:
# Convert raw frequency counts into TF-IDF values
from sklearn.feature_extraction.text import TfidfTransformer
movieTfmer = TfidfTransformer(use_idf=True)
# The transformer is fitted on the training counts only
docs_train_tfidf = movieTfmer.fit_transform(docs_train_counts)

In [14]:
# Using the already-fitted vectorizer and transformer, transform the test data
# (transform only, no refitting, so the test set uses the training features)
docs_test_counts = movieVzer.transform(docs_test)
docs_test_tfidf = movieTfmer.transform(docs_test_counts)

In [15]:
# Import the Multinomial Naive Bayes classifier from sklearn
from sklearn.naive_bayes import MultinomialNB

In [16]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF features of the
# training reviews
clf = MultinomialNB()
clf.fit(docs_train_tfidf, y_train)

In [17]:
# Evaluate on the held-out test set; score() predicts the test labels and
# returns the mean accuracy against y_test
clf.score(docs_test_tfidf, y_test)

0.81

In [18]:
# Test with short movie reviews
# Short, hand-written reviews used to try out the trained classifier
reviews_new = [
    'This movie was excellent',
    'Absolute joy ride',
    'Tom Hanks was amazing',
    'Tom Cruise shone through.',
    'This is a huge letdown',
    'Two thumbs up',
    'I fell asleep halfway through',
    "Can't wait for the sequel",
    'I cannot recommend this highly enough',
    'Instant classic.',
    'Tom Hanks performance was Oscar-worthy.',
    'A must-see event for all moviegoers',
    "Endgame isn't a great movie, but there are flashes of greatness in it",
]

# Run the new text through the same fitted pipeline as the training data:
# first term counts, then TF-IDF weights
reviews_new_counts = movieVzer.transform(reviews_new)
reviews_new_tfidf = movieTfmer.transform(reviews_new_counts)

In [19]:
# Have the classifier predict a class for each new review; the predictions are
# used as indices into movies.target_names when the results are printed
pred = clf.predict(reviews_new_tfidf)

In [20]:
# print out results
# Map each predicted class index to its class name, then print every review
# next to its predicted label
predicted_labels = [movies.target_names[category] for category in pred]
for review, label in zip(reviews_new, predicted_labels):
    print('%r => %s' % (review, label))

'This movie was excellent' => pos
'Absolute joy ride' => pos
'Tom Hanks was amazing' => pos
'Tom Cruise shone through.' => pos
'This is a huge letdown' => neg
'Two thumbs up' => neg
'I fell asleep halfway through' => neg
"Can't wait for the sequel" => neg
'I cannot recommend this highly enough' => pos
'Instant classic.' => pos
'Tom Hanks performance was Oscar-worthy.' => pos
'A must-see event for all moviegoers' => pos
"Endgame isn't a great movie, but there are flashes of greatness in it" => pos
