## Example 1
### Movie Review Sentiment Analysis

In [None]:
# Load movie reviews
import sys
from sklearn.datasets import load_files

# Detect whether this notebook is running inside Google Colab
is_colab = 'google.colab' in sys.modules

# The dataset sits at a different relative path in Colab than in a
# local Jupyter Notebook checkout, so pick the directory accordingly.
moviedir = 'data/movie_reviews/' if is_colab else '../data/movie_reviews/'

# Read the reviews from disk; shuffle=True asks load_files to shuffle the samples
movies = load_files(moviedir, shuffle=True)

In [None]:
# Target names ("classes") are automatically generated from subfolder names;
# print them to see which labels the dataset contains.
print(movies.target_names)

In [None]:
# Hold out 20% of the reviews as a test set; the fixed random_state keeps the split reproducible
from sklearn.model_selection import train_test_split
docs_train, docs_test, y_train, y_test = train_test_split(
    movies.data,
    movies.target,
    test_size=0.20,
    random_state=12,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Count unigrams and bigrams (ngram_range=(1, 2)); min_df=2 drops terms that
# appear in fewer than two training documents.
movieVzer = CountVectorizer(min_df=2, ngram_range=(1, 2))

# Fit the vocabulary on the training text and transform it into a count matrix
docs_train_counts = movieVzer.fit_transform(docs_train)

In [None]:
# Convert raw frequency counts into TF-IDF values.
# The transformer is fitted on the training counts only, so the IDF weights
# come from the training set.
from sklearn.feature_extraction.text import TfidfTransformer
movieTfmer = TfidfTransformer(use_idf=True)
docs_train_tfidf = movieTfmer.fit_transform(docs_train_counts)

In [None]:
# Using the already-fitted vectorizer and transformer, transform the test data.
# Only transform() is called here (no refit), so the test documents do not
# influence the vocabulary or the IDF weights.
docs_test_counts = movieVzer.transform(docs_test)
docs_test_tfidf = movieTfmer.transform(docs_test_counts)

In [None]:
# Load the Multinomial Naive Bayes classifier from sklearn
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF features
# of the training documents and their labels.
clf = MultinomialNB()
clf.fit(docs_train_tfidf, y_train)

In [None]:
# Predict the test set results and report the accuracy.
# The test features come from the vectorizer/transformer fitted on the training data.
clf.score(docs_test_tfidf, y_test)

In [None]:
# A handful of short, hand-written reviews to try the classifier on
reviews_new = [
    'This movie was excellent',
    'Absolute joy ride',
    'Tom Hanks was amazing',
    'Tom Cruise shone through.',
    'This is a huge letdown',
    'Two thumbs up',
    'I fell asleep halfway through',
    "Can't wait for the sequel",
    'I cannot recommend this highly enough',
    'Instant classic.',
    'Tom Hanks performance was Oscar-worthy.',
    'A must-see event for all moviegoers',
    "Endgame isn't a great movie, but there are flashes of greatness in it",
]

# Reuse the fitted vectorizer, then the fitted TF-IDF transformer,
# so the new text is mapped into the same feature space as the training data.
reviews_new_counts = movieVzer.transform(reviews_new)
reviews_new_tfidf = movieTfmer.transform(reviews_new_counts)

In [None]:
# Have the classifier make a prediction for each new review.
# The predicted values are later used as indices into movies.target_names.
pred = clf.predict(reviews_new_tfidf)

In [None]:
# Show each review next to the name of the class predicted for it
for review, category in zip(reviews_new, pred):
    label = movies.target_names[category]  # map the predicted class index to its name
    print('%r => %s' % (review, label))