In [1]:
# Data-handling libraries used to load and slice the review dataset.
import pandas as pd
import numpy as np
# Text-normalisation helpers, used below through tn.normalize_corpus
# (presumably a project-local module -- confirm it is on the import path).
import text_normalizer as tn
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used in later cells.
warnings.filterwarnings("ignore")

In [2]:
# Number of leading rows used for training; every remaining row is held out
# for testing. Defined once so the four slices below cannot drift apart.
TRAIN_SIZE = 35000

# Load the labelled reviews; the 'review' and 'sentiment' columns are read below.
data = pd.read_csv(r"movie_reviews.csv")

reviews = np.array(data['review'])
sentiments = np.array(data['sentiment'])

# Positional split in file order (no shuffling is applied here).
train_reviews, test_reviews = reviews[:TRAIN_SIZE], reviews[TRAIN_SIZE:]
train_sentiments, test_sentiments = sentiments[:TRAIN_SIZE], sentiments[TRAIN_SIZE:]

print(data.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
# Normalize both splits with the shared text_normalizer pipeline: the training
# split first, then the held-out split. Later cells read norm_train_reviews
# and norm_test_reviews, so this cell (or an equivalent loader) must run first.
norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)

In [None]:
# Disabled alternative to the normalisation cell: the code is wrapped in a bare
# triple-quoted string, so none of it executes. If re-enabled it would read
# train_result.txt and test_result.txt, split each file's text on commas, trim
# wrapper characters from every piece, and bind the results to
# norm_train_reviews / norm_test_reviews.
# NOTE(review): the enclosed code rebinds `data` (the DataFrame loaded earlier)
# to a plain string, and splitting on "," assumes no review contains a comma --
# confirm both before re-enabling.
# NOTE(review): the "# special for the last one" remark next to the
# norm_test_reviews assignment appears to belong to the line above it.
'''
with open('train_result.txt','r') as file:
    data = file.read()

documents = data.split(",")
docs = [review[2:-1] for review in documents]
docs[-1] = docs[-1][:-1] # special for the last one

norm_train_reviews = docs

with open('test_result.txt','r') as file:
    data = file.read()

test_documents = data.split(",")
test_docs = [review[2:-1] for review in test_documents]
test_docs[-1] = test_docs[-1][:-1]

norm_test_reviews = test_docs # special for the last one
'''

## Extract features from positive and negative reviews

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
def _fit_sentiment_tfidf(documents):
    """Fit a unigram TF-IDF vectorizer on ``documents``.

    The same settings are used for every sentiment subset: terms must appear
    in at least 5% and at most 95% of the documents, and term frequencies are
    scaled sub-linearly.

    Parameters
    ----------
    documents : iterable of str
        Normalized review texts belonging to one sentiment class.

    Returns
    -------
    tuple
        ``(vectorizer, features)`` -- the fitted ``TfidfVectorizer`` and the
        document-term matrix returned by ``fit_transform``.
    """
    vectorizer = TfidfVectorizer(use_idf=True,
                                 min_df=0.05,
                                 max_df=0.95,
                                 ngram_range=(1, 1),
                                 sublinear_tf=True)
    return vectorizer, vectorizer.fit_transform(documents)


# consolidate all normalized reviews (train first, then test) so they line up
# with `sentiments`; list() makes `+` concatenate even if a normalizer hands
# back an array-like instead of a list
norm_reviews = list(norm_train_reviews) + list(norm_test_reviews)

# zip() below would silently truncate on a length mismatch, pairing reviews
# with the wrong labels or dropping some, so fail loudly instead
assert len(norm_reviews) == len(sentiments), (
    "normalized reviews and sentiment labels differ in length: "
    "%d vs %d" % (len(norm_reviews), len(sentiments))
)

# get tf-idf features for positive-only reviews
positive_reviews = [review for review, sentiment in zip(norm_reviews, sentiments) if sentiment == 'positive']
ptvf, ptvf_features = _fit_sentiment_tfidf(positive_reviews)

# get tf-idf features for negative-only reviews
negative_reviews = [review for review, sentiment in zip(norm_reviews, sentiments) if sentiment == 'negative']
ntvf, ntvf_features = _fit_sentiment_tfidf(negative_reviews)

# view feature set dimensions
print(ptvf_features.shape, ntvf_features.shape)

(25000, 332) (25000, 334)


## Topic Modeling on Reviews

In [10]:
import pyLDAvis
import pyLDAvis.sklearn
from sklearn.decomposition import NMF
import topic_model_utils as tmu

In [11]:
# Number of topics used by the topic-model cells that follow.
total_topics = 10

# Ask pyLDAvis to render its visualisations inline in the notebook.
pyLDAvis.enable_notebook()

### Display and visualize topics for positive reviews

In [12]:
# Topic model for the positive reviews: factorise their TF-IDF matrix into
# `total_topics` components with a fixed seed for repeatable results.
pos_nmf = NMF(
    n_components=total_topics,
    alpha=0.1,
    l1_ratio=0.2,
    random_state=42,
)
# Kept as the final expression so the fitted estimator is echoed as cell output.
pos_nmf.fit(ptvf_features)

NMF(alpha=0.1, beta_loss='frobenius', init=None, l1_ratio=0.2, max_iter=200,
  n_components=10, random_state=42, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [13]:
# extract feature names and component weights
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the installed version provides it and
# convert its array to a plain list so downstream code sees the same type.
if hasattr(ptvf, "get_feature_names_out"):
    pos_feature_names = ptvf.get_feature_names_out().tolist()
else:
    pos_feature_names = ptvf.get_feature_names()
pos_weights = pos_nmf.components_

# extract and display topics and relevant components
pos_topics = tmu.get_topics_terms_weights(pos_weights, pos_feature_names)
tmu.print_topics_udf(topics=pos_topics,
                     total_topics=total_topics,
                     num_terms=15,
                     display_weights=False)

Topic #1 without weights
['like', 'think', 'not', 'really', 'say', 'get', 'go', 'would', 'know', 'thing', 'people', 'bad', 'want', 'could', 'much']

Topic #2 without weights
['movie', 'good', 'great', 'see', 'watch', 'not', 'one', 'enjoy', 'acting', 'ever', 'recommend', 'make', 'like', 'fan', 'think']

Topic #3 without weights
['year', 'time', 'see', 'dvd', 'first', 'still', 'old', 'remember', 'release', 'would', 'come', 'since', 'watch', 'ever', 'day']

Topic #4 without weights
['show', 'episode', 'series', 'tv', 'good', 'watch', 'great', 'one', 'fan', 'not', 'new', 'character', 'ever', 'original', 'first']

Topic #5 without weights
['play', 'role', 'performance', 'actor', 'cast', 'good', 'star', 'great', 'excellent', 'give', 'well', 'support', 'john', 'also', 'actress']

Topic #6 without weights
['film', 'see', 'not', 'one', 'make', 'great', 'good', 'horror', 'cinema', 'director', 'recommend', 'must', 'ever', 'art', 'highly']

Topic #7 without weights
['character', 'scene', 'story', 

In [14]:
# Build the pyLDAvis view of the positive-review topic model; left as the
# final expression so the notebook displays the result.
# R=15 is forwarded to pyLDAvis (presumably the number of terms shown per
# topic -- confirm against the pyLDAvis documentation).
pyLDAvis.sklearn.prepare(
    pos_nmf,
    ptvf_features,
    ptvf,
    R=15,
)

### Display and visualize topics for negative reviews

In [15]:
# build topic model on negative sentiment review features
# n_components follows total_topics (instead of a hard-coded 10) so the model
# size always matches the number of topics printed below, mirroring the
# positive-review model.
neg_nmf = NMF(n_components=total_topics,
              random_state=42,
              alpha=0.1,
              l1_ratio=0.2)
neg_nmf.fit(ntvf_features)

# extract feature names and component weights
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the installed version provides it and
# convert its array to a plain list so downstream code sees the same type.
if hasattr(ntvf, "get_feature_names_out"):
    neg_feature_names = ntvf.get_feature_names_out().tolist()
else:
    neg_feature_names = ntvf.get_feature_names()
neg_weights = neg_nmf.components_

# extract and display topics and their components
neg_topics = tmu.get_topics_terms_weights(neg_weights, neg_feature_names)
tmu.print_topics_udf(topics=neg_topics,
                     total_topics=total_topics,
                     num_terms=15,
                     display_weights=False)

Topic #1 without weights
['get', 'go', 'man', 'take', 'know', 'guy', 'kill', 'come', 'woman', 'back', 'people', 'girl', 'one', 'thing', 'around']

Topic #2 without weights
['bad', 'ever', 'see', 'acting', 'movie', 'one', 'terrible', 'awful', 'even', 'act', 'horrible', 'plot', 'make', 'no', 'script']

Topic #3 without weights
['film', 'not', 'make', 'see', 'would', 'director', 'time', 'watch', 'many', 'may', 'one', 'good', 'say', 'feel', 'think']

Topic #4 without weights
['book', 'read', 'story', 'version', 'base', 'write', 'change', 'love', 'not', 'would', 'original', 'movie', 'never', 'many', 'see']

Topic #5 without weights
['movie', 'watch', 'not', 'think', 'would', 'good', 'waste', 'time', 'see', 'like', 'could', 'say', 'money', 'make', 'want']

Topic #6 without weights
['funny', 'comedy', 'laugh', 'joke', 'try', 'not', 'stupid', 'suppose', 'fun', 'moment', 'like', 'really', 'black', 'even', 'annoying']

Topic #7 without weights
['play', 'actor', 'role', 'cast', 'good', 'performan

In [16]:
# Build the pyLDAvis view of the negative-review topic model; left as the
# final expression so the notebook displays the result.
# R=15 is forwarded to pyLDAvis (presumably the number of terms shown per
# topic -- confirm against the pyLDAvis documentation).
pyLDAvis.sklearn.prepare(
    neg_nmf,
    ntvf_features,
    ntvf,
    R=15,
)