In [1]:
%run -i "../util/util_simple_classifier.ipynb"

In [2]:
from langdetect import detect # detect language
from nltk import word_tokenize # word tokenizer
from nltk.probability import FreqDist # frequency distribution
from nltk.corpus import stopwords # stopwords
from string import punctuation # punctuation characters

In [3]:
(train_df, test_df) = load_train_test_dataset_pd("train", "test")
print(train_df.head())
print(test_df.head())

                                                text  label
0  the rock is destined to be the 21st century's ...      1
1  the gorgeously elaborate continuation of " the...      1
2                     effective but too-tepid biopic      1
3  if you sometimes like to go to the movies to h...      1
4  emerges as something rare , an issue movie tha...      1
                                                text  label
0  lovingly photographed in the manner of a golde...      1
1              consistently clever and suspenseful .      1
2  it's like a " big chill " reunion of the baade...      1
3  the story gives ample opportunity for large-sc...      1
4                  red dragon " never cuts corners .      1


In [4]:
# Filter out non-English reviews
print(len(train_df))
train_df["lang"] = train_df["text"].apply(detect)
train_df = train_df[train_df["lang"] == "en"]
print(len(train_df))

8530
8350


In [5]:
test_df["lang"] = test_df["text"].apply(detect)
test_df = test_df[test_df["lang"] == "en"]

In [6]:
# Tokenization
train_df["tokenized_text"] = train_df["text"].apply(word_tokenize)
test_df["tokenized_text"] = test_df["text"].apply(word_tokenize)
print(train_df["tokenized_text"].head())

0    [the, rock, is, destined, to, be, the, 21st, c...
1    [the, gorgeously, elaborate, continuation, of,...
2                  [effective, but, too-tepid, biopic]
3    [if, you, sometimes, like, to, go, to, the, mo...
4    [emerges, as, something, rare, ,, an, issue, m...
Name: tokenized_text, dtype: object


In [7]:
# Remove stopwords and punctuation
stop_words = list(stopwords.words('english')) + list(punctuation)
stop_words.append("``")
stop_words.append("'s")
def remove_stopwords(x):
    return [word for word in x if word.lower() not in stop_words]
train_df["tokenized_text"] = train_df["tokenized_text"].apply(remove_stopwords)
test_df["tokenized_text"] = test_df["tokenized_text"].apply(remove_stopwords)
print(train_df["tokenized_text"].head())

0    [rock, destined, 21st, century, new, conan, go...
1    [gorgeously, elaborate, continuation, lord, ri...
2                       [effective, too-tepid, biopic]
3    [sometimes, like, go, movies, fun, wasabi, goo...
4    [emerges, something, rare, issue, movie, hones...
Name: tokenized_text, dtype: object


In [8]:
# Check label balance
print(train_df.groupby('label').count())
print(test_df.groupby('label').count())

       text  lang  tokenized_text
label                            
0      4184  4184            4184
1      4166  4166            4166
       text  lang  tokenized_text
label                            
0       526   526             526
1       522   522             522


In [9]:
# Save cleaned data to disk
# JSON makes your data portable, easy to share, and flexible for future use.
train_df.to_json("../data/rotten_tomatoes_train.json")
test_df.to_json("../data/rotten_tomatoes_test.json")

In [10]:
# Define a function to extract the most common words from a list of tokenized texts
def get_stats(word_list, num_words=200):
    freq_dist = FreqDist(word_list)
    return freq_dist.most_common(num_words)

In [11]:
# Compare the most frequent words in positive and negative reviews
# Sum (concatenate) all tokenized words in positive reviews
positive_train_words = train_df[train_df["label"] == 1]["tokenized_text"].sum() 
negative_train_words = train_df[train_df["label"] == 0]["tokenized_text"].sum()
positive_fd = get_stats(positive_train_words, 200)
negative_fd = get_stats(negative_train_words, 200)
print("Most common words in positive reviews:")
print(positive_fd)

Most common words in positive reviews:
[('film', 684), ('movie', 429), ("n't", 286), ('one', 280), ('--', 271), ('like', 208), ('story', 194), ('comedy', 160), ('good', 150), ('even', 144), ('funny', 137), ('way', 135), ('time', 127), ('best', 126), ('characters', 125), ('make', 124), ('life', 124), ('us', 123), ('much', 122), ('love', 118), ('performances', 117), ('makes', 115), ('may', 113), ('work', 111), ('director', 110), ('enough', 105), ('look', 103), ('still', 96), ('little', 94), ('well', 93), ('new', 92), ('films', 92), ('movies', 89), ('fun', 89), ('great', 88), ('drama', 87), ('two', 85), ('performance', 82), ('never', 81), ('could', 80), ('see', 77), ('world', 77), ('people', 76), ('cast', 75), ('many', 74), ('also', 73), ('though', 73), ('tale', 71), ('first', 70), ('documentary', 69), ('without', 69), ('entertaining', 68), ('big', 68), ('made', 67), ('heart', 66), ('ever', 65), ('family', 65), ('often', 64), ('would', 64), ('humor', 64), ("'re", 63), ('sense', 63), ('hum

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

In [13]:
# Load JSON files
train_df = pd.read_json("../data/rotten_tomatoes_train.json")
test_df = pd.read_json("../data/rotten_tomatoes_test.json")

In [14]:
train_df.head()

Unnamed: 0,text,label,lang,tokenized_text
0,the rock is destined to be the 21st century's ...,1,en,"[rock, destined, 21st, century, new, conan, go..."
1,"the gorgeously elaborate continuation of "" the...",1,en,"[gorgeously, elaborate, continuation, lord, ri..."
2,effective but too-tepid biopic,1,en,"[effective, too-tepid, biopic]"
3,if you sometimes like to go to the movies to h...,1,en,"[sometimes, like, go, movies, fun, wasabi, goo..."
4,"emerges as something rare , an issue movie tha...",1,en,"[emerges, something, rare, issue, movie, hones..."


In [15]:
len(train_df), len(test_df)

(8350, 1048)

In [16]:
# Raw text version (without vectorization) of positive and negative reviews
positive_train_words = train_df[train_df["label"] == 1].tokenized_text.sum()
negative_train_words = train_df[train_df["label"] == 0].tokenized_text.sum()
word_intersection = set(positive_train_words) & set(negative_train_words)
positive_filtered = list(set(positive_train_words) - word_intersection)
negative_filtered = list(set(negative_train_words) - word_intersection)

In [17]:
len(positive_filtered), len(negative_filtered)

(5486, 5977)

In [18]:
positive_filtered

["'blue",
 'creep',
 'dwarfs',
 'milestone',
 'embers',
 'onstage',
 'confronted',
 'jeff',
 'tully',
 'substances',
 'low-down',
 'bigotry',
 'intently',
 'annoyances',
 'fore',
 'herbivore',
 'telescope',
 'rise-and-fall',
 'usurp',
 'satisfyingly',
 'polson',
 'quotient',
 'cahill',
 'gerbosi',
 "'laugh",
 'prefer',
 'optimistic',
 'divisions',
 'speeds/',
 'locals',
 'tact',
 'exposure',
 'effusion',
 'worship',
 'temptations',
 'fury',
 'homosexuality',
 'foreboding',
 'fosters',
 'illustrating',
 'fillm',
 'rapt',
 'whipping',
 'melted',
 'judicious',
 'mending',
 'sanders',
 'overt',
 'supportive',
 'excites',
 'flames',
 'illustrated',
 'wiseman',
 'disregard',
 'avis',
 'natural-seeming',
 'bellini',
 'households',
 'swiftly',
 'dared',
 'scandals',
 'imparted',
 'holistic',
 'coral',
 'steadily',
 'withering',
 'wen',
 'dude',
 'fiascoes',
 'merrily',
 'canadians',
 'relief',
 'lately',
 'painterly',
 'model',
 'junior-high',
 'seacoast',
 'small-scale',
 'aristocratic',
 're

In [19]:
# Vectorizer function
def create_vectorizers(word_lists):
    vectorizers = []
    for word_list in word_lists:
        vectorizer = CountVectorizer(vocabulary=word_list)
        vectorizers.append(vectorizer)
    return vectorizers

In [20]:
vectorizers = create_vectorizers([positive_filtered, negative_filtered])

In [None]:
# Defines a function to count words from different vocabularies in a tokenized text.
def vectorize(text_list, vectorizers):
    # text_list: list of tokens
    # vectorizers: list of fitted CountVectorizer objects
    text = " ".join(text_list) # joint the list of tokens into a space-separated string
    scores = [] # list of scores from each vectorizer
    for vectorizer in vectorizers:
        output = vectorizer.transform([text]) # sparse matrix (1,, n_vocab)
        output_sum = sum(output.todense().tolist()[0])
        scores.append(output_sum)
    return scores # total number of matched words from each vectorizer

In [None]:
# Simple classification rule based on the highest score
def classify(score_list):
    return max(enumerate(score_list), key=lambda x: x[1])[0]

In [23]:
train_df["prediction"] = train_df["tokenized_text"].apply(lambda x: classify(vectorize(x, vectorizers)))
test_df["prediction"] = test_df["tokenized_text"].apply(lambda x: classify(vectorize(x, vectorizers)))

In [24]:
train_df.head()

Unnamed: 0,text,label,lang,tokenized_text,prediction
0,the rock is destined to be the 21st century's ...,1,en,"[rock, destined, 21st, century, new, conan, go...",0
1,"the gorgeously elaborate continuation of "" the...",1,en,"[gorgeously, elaborate, continuation, lord, ri...",0
2,effective but too-tepid biopic,1,en,"[effective, too-tepid, biopic]",0
3,if you sometimes like to go to the movies to h...,1,en,"[sometimes, like, go, movies, fun, wasabi, goo...",0
4,"emerges as something rare , an issue movie tha...",1,en,"[emerges, something, rare, issue, movie, hones...",0


In [25]:
print(classification_report(train_df["label"], train_df["prediction"]))

              precision    recall  f1-score   support

           0       0.19      0.23      0.21      4184
           1       0.01      0.01      0.01      4166

    accuracy                           0.12      8350
   macro avg       0.10      0.12      0.11      8350
weighted avg       0.10      0.12      0.11      8350



In [26]:
print(classification_report(test_df["label"], test_df["prediction"]))

              precision    recall  f1-score   support

           0       0.40      0.56      0.47       526
           1       0.27      0.16      0.20       522

    accuracy                           0.36      1048
   macro avg       0.34      0.36      0.34      1048
weighted avg       0.34      0.36      0.34      1048

