In [138]:
import io
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import nltk
import joblib

In [139]:
nltk.download('stopwords')
forest = RandomForestClassifier()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andrey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [140]:
import joblib

model_filename = './new_data/forest.pkl'
loaded_model_forest = joblib.load(model_filename)

In [141]:
test_df = pd.read_csv('./new_data/testData.tsv', sep='\t', header=0)

In [142]:
def review_to_words(raw_review):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text()

    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))
    #
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    #
    # 6. Join the words back into one string separated by space,
    # and return the result.
    return( " ".join( meaningful_words ))

In [143]:
test_df.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [144]:
test_reviews = test_df['review']
test_reviews

0        Naturally in a film who's main themes are of m...
1        This movie is a disaster within a disaster fil...
2        All in all, this is a movie for kids. We saw i...
3        Afraid of the Dark left me with the impression...
4        A very accurate depiction of small time mob li...
                               ...                        
24995    Sony Pictures Classics, I'm looking at you! So...
24996    I always felt that Ms. Merkerson had never got...
24997    I was so disappointed in this movie. I am very...
24998    From the opening sequence, filled with black a...
24999    This is a great horror film for people who don...
Name: review, Length: 25000, dtype: object

In [145]:
num_reviews = test_df["review"].size
clean_test_reviews = []

print(type(test_df["review"][0]))

for i in range( 0, num_reviews ):
    clean_test_reviews.append(review_to_words(test_df["review"][i]))

<class 'str'>


  review_text = BeautifulSoup(raw_review).get_text()


In [146]:
print(clean_test_reviews[0])

naturally film main themes mortality nostalgia loss innocence perhaps surprising rated highly older viewers younger ones however craftsmanship completeness film anyone enjoy pace steady constant characters full engaging relationships interactions natural showing need floods tears show emotion screams show fear shouting show dispute violence show anger naturally joyce short story lends film ready made structure perfect polished diamond small changes huston makes inclusion poem fit neatly truly masterpiece tact subtlety overwhelming beauty


In [147]:
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 5000)

test_features = vectorizer.fit_transform(test_reviews)

In [148]:
test_data_features = vectorizer.fit_transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

In [149]:
vocab = vectorizer.get_feature_names_out()

In [150]:
# Sum up the counts of each vocabulary word
dist = np.sum(test_data_features, axis=0)
# For each, print the vocabulary word and the number of times it
# appears in the training set
for tag, count in zip(vocab, dist):
    print(count, tag)

129 aaron
194 abandoned
92 abc
128 abilities
460 ability
1364 able
85 aboard
98 absence
95 absent
325 absolute
1509 absolutely
249 absurd
203 abuse
82 abused
96 abysmal
271 academy
460 accent
189 accents
315 accept
143 acceptable
184 accepted
343 accident
193 accidentally
114 accompanied
115 accomplished
262 according
184 account
315 accurate
86 accurately
131 accused
156 achieve
142 achieved
111 achievement
1008 across
1227 act
691 acted
6368 acting
3142 action
346 actions
77 active
83 activities
2255 actor
4462 actors
1110 actress
346 actresses
395 acts
693 actual
4229 actually
169 ad
225 adam
92 adams
388 adaptation
88 adaptations
130 adapted
857 add
462 added
76 addict
97 addiction
175 adding
297 addition
369 adds
91 adequate
134 admire
574 admit
119 admittedly
90 adorable
414 adult
324 adults
94 advance
147 advantage
475 adventure
184 adventures
99 advertising
270 advice
108 advise
377 affair
99 affairs
115 affected
103 affection
104 afford
88 afghanistan
95 aforementioned
302 afr

In [151]:
predictions = loaded_model_forest.predict(test_data_features)


In [156]:
result_df_cleaned_reviews = pd.DataFrame({'id': test_df['id'], 'predicted_sentiment': predictions, 'review': clean_test_reviews})

Unnamed: 0,id,predicted_sentiment,review
0,12311_10,0,naturally film main themes mortality nostalgia...
1,8348_2,0,movie disaster within disaster film full great...
2,5828_4,0,movie kids saw tonight child loved one point k...
3,7186_2,0,afraid dark left impression several different ...
4,12128_7,0,accurate depiction small time mob life filmed ...
...,...,...,...
2447,6028_1,0,also wife abusive husband even situation attac...
2448,6000_4,0,thunderbolt probably jackie chan worst movie s...
2449,8712_1,0,nearly fell asleep screening boring story seem...
2450,3969_8,0,accepted teen flick stars guy mac pc ads movie...


In [153]:
result_df_og_reviews = pd.DataFrame({'id': test_df['id'], 'predicted_sentiment': predictions, 'review': test_df['review']})
result_df_og_reviews.head()

Unnamed: 0,id,predicted_sentiment,review
0,12311_10,0,Naturally in a film who's main themes are of m...
1,8348_2,0,This movie is a disaster within a disaster fil...
2,5828_4,0,"All in all, this is a movie for kids. We saw i..."
3,7186_2,0,Afraid of the Dark left me with the impression...
4,12128_7,0,A very accurate depiction of small time mob li...
