# Spam Detector

## Data Retrieval

In [None]:
from google.colab import files
# Upload op_spam_v1.4.zip here; the next cell unzips it.
files.upload()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Extract the uploaded archive quietly (-q) into datasets/ under the current working directory.
!unzip -q op_spam_v1.4.zip -d datasets/

In [None]:
import os

# Location of the extracted archive.
# Assumes the notebook's working directory is /content (the unzip cell
# extracts into the relative path datasets/) -- TODO confirm outside Colab.
DATA_PATH = "/content/datasets/"
FILENAME = "op_spam_v1.4"
FULL_DATA_PATH = os.path.join(DATA_PATH, FILENAME)

# One directory per (polarity, provenance) pair. The trailing slash matters:
# the loader builds paths by plain string concatenation with the fold name.
neg_dec_dir = f"{FULL_DATA_PATH}/negative_polarity/deceptive_from_MTurk/"
neg_tru_dir = f"{FULL_DATA_PATH}/negative_polarity/truthful_from_Web/"
pos_dec_dir = f"{FULL_DATA_PATH}/positive_polarity/deceptive_from_MTurk/"
pos_tru_dir = f"{FULL_DATA_PATH}/positive_polarity/truthful_from_TripAdvisor/"

# Names of the five fold sub-directories inside each of the directories above.
folds = [f"fold{index}" for index in range(1, 6)]

In [None]:
import pandas as pd

# Read every review file into one record per row.
#   text        : file content with surrounding whitespace stripped
#   is_truthful : 1 when the file name starts with 't', else 0
#   hotel_name  : second '_'-separated part of the file name
#   polarity / source : taken from the directory the file lives in
#   fold        : trailing digit of the fold directory name
records = []

review_sets = zip([neg_dec_dir, neg_tru_dir, pos_dec_dir, pos_tru_dir],
                  ['negative', 'negative', 'positive', 'positive'],
                  ['MTurk', 'Web', 'MTurk', 'TripAdvisor'])

for p, polar, src in review_sets:
    for fold in folds:
        fold_dir = p + fold
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # row order (and every seeded train/test split built on it) reproducible.
        for filename in sorted(os.listdir(fold_dir)):
            # Ignore stray non-review entries (e.g. OS metadata files) that
            # would otherwise break the file-name parsing below.
            if not filename.endswith('.txt'):
                continue
            with open(os.path.join(fold_dir, filename), encoding='utf-8') as f:
                msg = f.read()
            records.append({
                'text': msg.strip(),
                'is_truthful': int(filename.startswith('t')),
                'polarity': polar,
                'hotel_name': filename.split('_')[1],
                'source': src,
                'fold': int(fold[-1]),
                'filename': filename,
            })

# Explicit column list keeps the column order stable even if no file was found.
reviews = pd.DataFrame(records, columns=[
    'text', 'is_truthful', 'polarity', 'hotel_name', 'source', 'fold', 'filename',
])

In [None]:
# Preview the first rows to sanity-check the parsed columns.
reviews.head()

Unnamed: 0,text,is_truthful,polarity,hotel_name,source,fold,filename
0,When we got checked and arrived at our room th...,0,negative,monaco,MTurk,1,d_monaco_2.txt
1,"The James Chicago is a stuffy, uninviting hote...",0,negative,james,MTurk,1,d_james_9.txt
2,We booked a room at the Hilton Chicago for two...,0,negative,hilton,MTurk,1,d_hilton_17.txt
3,"For a hotel rated with four diamonds by AAA, o...",0,negative,hilton,MTurk,1,d_hilton_15.txt
4,I was very disappointed with this hotel. The f...,0,negative,hilton,MTurk,1,d_hilton_9.txt


In [None]:
# Raw review texts (features) and 0/1 truthfulness labels (targets) as plain lists.
X = list(reviews['text'].copy())
print(type(X))
y = list(reviews['is_truthful'].copy())

<class 'list'>


## Transformer

#### Stemmer

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
from nltk.stem import PorterStemmer
nltk.download('punkt')

class TextStemmer(BaseEstimator, TransformerMixin):
  '''
  Normalise raw review strings and reduce each token to its Porter stem.

  Every step can be switched off through its constructor flag:
    - lowercaseConversion: convert the text to lowercase
    - punctuationRemoval:  delete . , ? ! ( ) and the possessive 's
    - numberReplacement:   replace purely numeric tokens with 'NUMBER'
    - stemming:            stem each token with nltk's PorterStemmer

  transform() returns a list holding one space-joined token string per
  input review.
  '''
  # Substrings deleted from the text when punctuationRemoval is enabled.
  _PUNCTUATION = ('.', ',', '?', '!', '(', ')', '\'s')
  # Placeholder substituted for numeric tokens when numberReplacement is enabled.
  _NUMBER_TOKEN = 'NUMBER'

  def __init__(self, 
               lowercaseConversion=True,
               punctuationRemoval=True,
               numberReplacement=True,
               stemming=True
               ):
    self.lowercaseConversion = lowercaseConversion
    self.punctuationRemoval = punctuationRemoval
    self.numberReplacement = numberReplacement
    self.stemming = stemming
    self.stemmer = PorterStemmer()
  
  def fit(self, X, y=None):
    '''Nothing is learned from the data; returns self.'''
    return self

  @staticmethod
  def _is_number(token):
    '''True for digit-only tokens, allowing ',' separators and one '.'.'''
    return token.replace(',', '').replace('.', '', 1).isdigit()
  
  def transform(self, X, y=None):
    '''
    Args:
      X: iterable of review strings; a None entry is treated as the text 'Null'.
      y: ignored.
    Returns:
      list of str, one processed string per review, tokens joined by one space.
    '''
    X_stemmed = []

    for review in X:
      text = 'Null' if review is None else review

      if self.lowercaseConversion:
        text = text.lower()

      if self.punctuationRemoval:
        for mark in self._PUNCTUATION:
          text = text.replace(mark, '')

      processed = []
      for word in nltk.word_tokenize(text):
        if self.numberReplacement and self._is_number(word):
          # Insert the placeholder as-is; it is deliberately not stemmed.
          processed.append(self._NUMBER_TOKEN)
        elif self.stemming:
          processed.append(self.stemmer.stem(word))
        else:
          processed.append(word)

      # Single-space join (no trailing space), same format as TextLemmatizer.
      X_stemmed.append(" ".join(processed))

    return X_stemmed

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Smoke test: run the stemmer on the first three reviews and print the first result.
stem_transformer = TextStemmer()

out = stem_transformer.fit_transform(X[0:3])
print(out[0])

when we got check and arriv at our room the first thing we notic wa the light did n't come on when we flip the switch upon enter the room wa not veri clean at all dust had collect on the back of the tv and on the stand behind it on the window sill and stain on the carpet We had request wireless internet servic but it wa down the entir first night the mattress wa veri hard make for an uncomfort night the jacuzzi wa out of order also when ask to be move to a differ room we were inform that there were not other avail 


#### Lemmatization

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')

class TextLemmatizer(BaseEstimator, TransformerMixin):
  '''
  Normalise raw review strings and lemmatize each token as a verb.

  Every step can be switched off through its constructor flag:
    - lowercaseConversion: convert the text to lowercase
    - punctuationRemoval:  delete . , ? ! ( )
    - numberReplacement:   replace purely numeric tokens with 'NUMBER'
    - lemmatizing:         WordNet-lemmatize each token with pos='v'
    - removeStopwords:     drop tokens listed in self.stopwords

  transform() returns a list holding one space-joined token string per
  input review; the same list is also stored on self.result.
  '''
  # Substrings deleted from the text when punctuationRemoval is enabled.
  _PUNCTUATION = ('.', ',', '?', '!', '(', ')')
  # Placeholder substituted for numeric tokens when numberReplacement is enabled.
  _NUMBER_TOKEN = 'NUMBER'

  def __init__(self, 
               lowercaseConversion=True,
               punctuationRemoval=True,
               numberReplacement=True,
               lemmatizing=True,
               removeStopwords=True
               ):
    self.lowercaseConversion = lowercaseConversion
    self.punctuationRemoval = punctuationRemoval
    self.numberReplacement = numberReplacement
    self.lemmatizing = lemmatizing
    self.lemmatizer = nltk.WordNetLemmatizer()
    
    self.removeStopwords = removeStopwords
    # English stop words plus punctuation / tokenizer artefacts to discard.
    # The '2' entry only has an effect when numberReplacement is disabled.
    self.stopwords = nltk.corpus.stopwords.words('english')
    self.stopwords.extend(['!', ',', '.', '?', '-s', '-ly', '</s>',
                        's', '(', ')', '\'s', 'n\'t', '$', '2',
                        ':', '\'\'', '``', '-', '--'])
  
  def fit(self, X, y=None):
    '''Nothing is learned from the data; returns self.'''
    return self

  @staticmethod
  def _is_number(token):
    '''True for digit-only tokens, allowing ',' separators and one '.'.'''
    return token.replace(',', '').replace('.', '', 1).isdigit()
  
  def transform(self, X, y=None):
    '''
    Args:
      X: iterable of review strings; a None entry is treated as the text 'Null'.
      y: ignored.
    Returns:
      list of str, one processed string per review, tokens joined by one space.
    '''
    X_lemmatized = []
    # Built once per call so membership tests do not scan the list per token;
    # rebuilt on every call so edits to self.stopwords are respected.
    stopword_set = set(self.stopwords)

    for review in X:
      text = 'Null' if review is None else review

      if self.lowercaseConversion:
        text = text.lower()

      if self.punctuationRemoval:
        for mark in self._PUNCTUATION:
          text = text.replace(mark, '')

      processed = []
      for word in nltk.word_tokenize(text):
        if self.numberReplacement and self._is_number(word):
          # Insert the placeholder as-is; it is deliberately not lemmatized.
          processed.append(self._NUMBER_TOKEN)
        elif self.lemmatizing:
          # notice that we have to give a context for lemmatizer
          processed.append(self.lemmatizer.lemmatize(word, pos='v'))
        else:
          processed.append(word)

      # With removeStopwords=False the tokens pass through unfiltered
      # (this path previously failed with a NameError).
      if self.removeStopwords:
        processed = [w for w in processed if w not in stopword_set]

      X_lemmatized.append(" ".join(processed))

    # Kept so the last transform output can still be read from the instance.
    self.result = X_lemmatized

    return X_lemmatized

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from collections import Counter

# Smoke test: lemmatize the first three reviews.
lemma_transformer = TextLemmatizer()

# `out` is reused as input by the vectorizer demo further down.
out = lemma_transformer.fit_transform(X[0:3])
print(out[0])
# Token frequencies of the first lemmatized review.
token_words = nltk.word_tokenize(out[0])
print(Counter(token_words))

get check arrive room first thing notice light come flip switch upon enter room clean dust collect back tv stand behind window sill stain carpet request wireless internet service entire first night mattress hard make uncomfortable night jacuzzi order also ask move different room inform others available
Counter({'room': 3, 'first': 2, 'night': 2, 'get': 1, 'check': 1, 'arrive': 1, 'thing': 1, 'notice': 1, 'light': 1, 'come': 1, 'flip': 1, 'switch': 1, 'upon': 1, 'enter': 1, 'clean': 1, 'dust': 1, 'collect': 1, 'back': 1, 'tv': 1, 'stand': 1, 'behind': 1, 'window': 1, 'sill': 1, 'stain': 1, 'carpet': 1, 'request': 1, 'wireless': 1, 'internet': 1, 'service': 1, 'entire': 1, 'mattress': 1, 'hard': 1, 'make': 1, 'uncomfortable': 1, 'jacuzzi': 1, 'order': 1, 'also': 1, 'ask': 1, 'move': 1, 'different': 1, 'inform': 1, 'others': 1, 'available': 1})


#### Vectorize

In [None]:
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords
nltk.download('stopwords')

class TextVectorizer(BaseEstimator, TransformerMixin):
  '''
  Bag-of-words vectorizer limited to the most frequent tokens.

  fit() counts tokens over all texts and keeps the `sizeOfVocabulary` most
  frequent ones. transform() returns a tuple (most_common, sparse_matrix):
    - most_common:   list of (word, total_count) pairs learned by fit()
    - sparse_matrix: csr_matrix of shape (n_texts, sizeOfVocabulary + 1);
                     the last column sums the counts of all words that are
                     not in the vocabulary.
  Returning a tuple is unusual for a transformer, but callers in this
  notebook unpack both values.
  '''

  def __init__(self, sizeOfVocabulary=1000):
    self.sizeOfVocabulary = sizeOfVocabulary
  
  def fit(self, X, y=None):
    '''Learn the vocabulary from the texts in X; returns self.'''
    # per-text token counts (also kept on the instance)
    self.wordCnt = [Counter(nltk.word_tokenize(text)) for text in X]

    # corpus-wide totals
    corpus_counts = Counter()
    for per_text in self.wordCnt:
      corpus_counts.update(per_text)

    self.most_common = corpus_counts.most_common()[:self.sizeOfVocabulary]
    # word -> column index, ordered by descending frequency
    self.vocabulary = {
      pair[0]: position for position, pair in enumerate(self.most_common)
    }

    return self

  def transform(self, X, y=None):
    '''Count vocabulary words per text; returns (most_common, csr_matrix).'''
    per_text_counts = [Counter(nltk.word_tokenize(text)) for text in X]

    # the last column is the sum of the count of words not in the vocabulary
    overflow_column = self.sizeOfVocabulary

    rows, cols, data = [], [], []
    for row_index, token_counts in enumerate(per_text_counts):
      for token, occurrences in token_counts.items():
        rows.append(row_index)
        cols.append(self.vocabulary.get(token, overflow_column))
        data.append(occurrences)

    # Entries sharing the same (row, col) -- every out-of-vocabulary word of a
    # text -- are summed when the matrix is built.
    matrix_shape = (len(X), self.sizeOfVocabulary + 1)
    csrMat = csr_matrix((data, (rows, cols)), shape=matrix_shape)
    return (self.most_common, csrMat)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Fit a tiny 10-word vocabulary on `out`, the lemmatized sample reviews from the demo above.
vectorizer = TextVectorizer(sizeOfVocabulary=10)
out_tup = vectorizer.fit_transform(out)

# out_tup[1] is the sparse count matrix; toarray() converts it to a dense
# NumPy array (last column = out-of-vocabulary count).
print(out_tup[1].toarray())
# out_tup[0] is the list of (word, total_count) pairs forming the vocabulary.
print(out_tup[0])

[[ 3  0  2  1  0  0  0  2  1  1 37]
 [ 1  3  0  0  1  1  0  0  0  0 25]
 [ 2  2  2  2  2  2  3  0  1  1 46]]
[('room', 6), ('hotel', 5), ('night', 4), ('available', 3), ('chicago', 3), ('stay', 3), ('hilton', 3), ('first', 2), ('come', 2), ('upon', 2)]


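Two preprocessing routes are available: stemming followed by vectorization, or lemmatization followed by vectorization. Both feed the vectorizer in the same way, but lemmatization produces real dictionary words (e.g. `arrive` instead of `arriv`), so its features are easier to interpret.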

## Bag of Words Analysis

#### Mix Reviews Analysis

In [None]:
from sklearn.pipeline import Pipeline

# Lemmatize, then count the 100 most frequent words (plus one out-of-vocabulary column).
preprocessPipeline = Pipeline([
  ("lemmatize text", TextLemmatizer()),
  ("text to feature", TextVectorizer(sizeOfVocabulary=100)),
])

# TextVectorizer.transform returns a (vocabulary, matrix) tuple, so unpack both.
most_common_voc, X_preprocessed = preprocessPipeline.fit_transform(X)

# Densifies the whole matrix just for display.
print(X_preprocessed.toarray())

[[ 3  0  0 ...  0  0 30]
 [ 1  3  1 ...  0  0 19]
 [ 2  2  2 ...  0  0 37]
 ...
 [ 1  0  0 ...  0  0 13]
 [ 4  0  3 ...  0  0 47]
 [ 2  0  1 ...  0  0 35]]


In [None]:
import numpy as np

# TODO: construct dataframe
# Dense word counts with the truthfulness label appended as the last column.
data = np.c_[X_preprocessed.toarray(), y]

# Column names: vocabulary words in matrix order, then the
# out-of-vocabulary bucket and the label.
cols = [name for name, count in most_common_voc]
cols.extend(['others', 'is_truthful'])

data_df = pd.DataFrame(data, columns=cols)

In [None]:
# Preview the word-count table (one column per vocabulary word, then 'others' and the label).
data_df.head()

Unnamed: 0,room,hotel,stay,chicago,get,would,service,great,staff,go,one,bed,time,make,like,could,us,night,clean,even,desk,check,nice,location,look,place,take,say,call,also,find,good,experience,back,front,come,view,first,give,recommend,...,work,much,need,city,wait,think,visit,leave,better,breakfast,area,seem,food,park,definitely,best,trip,weekend,bar,expect,restaurant,everything,small,helpful,free,offer,downtown,another,love,right,beautiful,way,husband,know,enjoy,ever,minutes,shop,others,is_truthful
0,3,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,2,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,30,0
1,1,3,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,0
2,2,2,2,2,0,0,0,0,0,0,1,0,0,0,0,1,1,2,1,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,37,0
3,8,6,1,1,10,3,0,0,0,8,3,3,2,4,3,6,3,1,3,4,3,1,0,0,2,1,3,1,3,0,3,0,0,3,1,0,1,1,3,0,...,0,1,0,0,0,1,0,2,0,0,1,1,0,0,0,0,2,0,0,0,0,1,0,0,1,0,0,3,0,0,0,1,0,4,0,1,0,0,234,0
4,7,1,1,0,2,0,0,0,0,2,0,1,0,2,4,0,0,0,1,0,2,1,0,0,3,0,1,0,2,0,1,0,1,1,2,2,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,79,0


In [None]:
# Save the preprocessed table without the index column; the path is relative to the working directory.
data_df.to_csv('opspam_preprocessed.csv', index=False)

In [None]:
# Correlation of every word-count column with the truthfulness label,
# strongest positive first (DataFrame.corr defaults to Pearson).
corr_matrix = data_df.corr()
corr_matrix["is_truthful"].sort_values(ascending=False)

is_truthful    1.000000
location       0.246782
floor          0.190081
small          0.170599
great          0.156438
                 ...   
look          -0.121371
hotel         -0.130839
visit         -0.143806
experience    -0.146691
chicago       -0.337182
Name: is_truthful, Length: 102, dtype: float64

#### Sentiment Analysis

##### Positive Reviews

In [None]:
# Restrict to positive-polarity reviews and refit the shared pipeline on them.
positive_reviews = reviews.loc[reviews['polarity'] == 'positive']
X_pos = list(positive_reviews['text'].copy())
y_pos = list(positive_reviews['is_truthful'].copy())
most_common_voc, X_pos_preprocessed = preprocessPipeline.fit_transform(X_pos)

# TODO: construct dataframe
# Dense word counts with the truthfulness label appended as the last column.
data = np.c_[X_pos_preprocessed.toarray(), y_pos]

# Vocabulary words in matrix order, then the out-of-vocabulary bucket and the label.
cols = [name for name, count in most_common_voc] + ['others', 'is_truthful']

data_df_pos = pd.DataFrame(data, columns=cols)

# print most common words
print(cols)

['hotel', 'room', 'stay', 'chicago', 'great', 'staff', 'would', 'service', 'get', 'bed', 'location', 'clean', 'nice', 'time', 'go', 'one', 'like', 'comfortable', 'place', 'friendly', 'recommend', 'make', 'walk', 'view', 'us', 'also', 'could', 'even', 'night', 'city', 'good', 'well', 'definitely', 'really', 'best', 'love', 'business', 'visit', 'wonderful', 'beautiful', 'enjoy', 'shop', 'helpful', 'area', 'experience', 'back', 'excellent', 'everything', 'take', 'breakfast', 'look', 'next', 'hotels', 'restaurant', 'downtown', 'michigan', 'need', 'day', 'want', 'trip', 'weekend', 'find', 'desk', 'perfect', 'amaze', 'check', 'park', 'food', 'say', 'come', 'bathroom', 'first', 'feel', 'lobby', 'bar', 'husband', 'right', 'book', 'free', 'two', 'floor', 'close', 'nights', 'price', 'front', 'ever', 'spacious', 'away', 'home', 'every', 'suite', 'return', 'large', 'highly', 'give', 'wife', 'much', 'travel', 'see', 'use', 'others', 'is_truthful']


In [None]:
# Correlation of each word count with truthfulness, positive reviews only.
corr_matrix = data_df_pos.corr()
corr_matrix["is_truthful"].sort_values(ascending=False)

is_truthful    1.000000
location       0.283261
floor          0.247632
bathroom       0.219070
michigan       0.216349
                 ...   
experience    -0.148978
husband       -0.160058
amaze         -0.181791
visit         -0.193109
chicago       -0.333119
Name: is_truthful, Length: 102, dtype: float64

##### Negative Reviews

In [None]:
# Restrict to negative-polarity reviews and refit the shared pipeline on them.
negative_reviews = reviews.loc[reviews['polarity'] == 'negative']
X_neg = list(negative_reviews['text'].copy())
y_neg = list(negative_reviews['is_truthful'].copy())
most_common_voc, X_neg_preprocessed = preprocessPipeline.fit_transform(X_neg)

# TODO: construct dataframe
# Dense word counts with the truthfulness label appended as the last column.
data = np.c_[X_neg_preprocessed.toarray(), y_neg]

# Vocabulary words in matrix order, then the out-of-vocabulary bucket and the label.
cols = [name for name, count in most_common_voc] + ['others', 'is_truthful']

data_df_neg = pd.DataFrame(data, columns=cols)

# print most common words
print(cols)

['room', 'hotel', 'stay', 'get', 'chicago', 'would', 'service', 'go', 'one', 'make', 'call', 'bed', 'desk', 'could', 'staff', 'check', 'us', 'time', 'like', 'night', 'look', 'even', 'say', 'take', 'front', 'find', 'tell', 'give', 'clean', 'experience', 'come', 'back', 'ask', 'never', 'first', 'nice', 'arrive', 'bathroom', 'good', 'also', 'wait', 'pay', 'place', 'two', 'seem', 'book', 'price', 'great', 'next', 'day', 'floor', 'use', 'better', 'expect', 'work', 'think', 'much', 'location', 'leave', 'want', 'reservation', 'another', 'hotels', 'lobby', 'minutes', 'finally', 'charge', 'try', 'really', 'small', 'disappoint', 'rude', 'request', 'well', 'door', 'smell', 'people', 'need', 'way', 'water', 'view', 'know', 'offer', 'food', 'morning', 'business', 'phone', 'nothing', 'bad', 'bar', 'wall', 'park', 'money', 'though', 'around', 'recommend', 'walk', 'many', 'hear', 'hilton', 'others', 'is_truthful']


In [None]:
# Correlation of each word count with truthfulness, negative reviews only.
corr_matrix = data_df_neg.corr()
corr_matrix["is_truthful"].sort_values(ascending=False)

is_truthful    1.000000
location       0.211325
great          0.170688
floor          0.163444
small          0.152139
                 ...   
finally       -0.165084
expect        -0.169879
seem          -0.189681
smell         -0.238975
chicago       -0.353530
Name: is_truthful, Length: 102, dtype: float64

The feature words for positive reviews differ from those for negative reviews. Here, "feature words" are the words whose counts correlate most strongly (positively or negatively) with the truthfulness label of the review.

Positive: `location, floor, bathroom, michigan, experience, husband, amaze, visit, chicago`

Negative: `location, great, floor, small, finally, expect, seem, smell, chicago`

## Bigrams, Trigrams and Quadgrams

#### Positive

Find the most common word sequences (n-grams) across all positive reviews.

In [None]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# Combine all positive reviews into one text
X_pos = list((reviews.loc[reviews['polarity'] == 'positive'])['text'].copy())

# Join with a space: the reviews were stripped when loaded, so plain
# concatenation would fuse the last word of one review with the first word of
# the next. Note that n-grams counted on this stream can still span two reviews.
text_pos = ' '.join(X_pos)

# get rid of punctuation and tokenize
tokenizer = nltk.RegexpTokenizer(r"\w+") 
token_words = tokenizer.tokenize(text_pos.lower())

# remove stopwords (set lookup avoids scanning the list for every token)
stopwords = nltk.corpus.stopwords.words("english")
stopword_set = set(stopwords)
token_words = [w for w in token_words if w not in stopword_set]

In [None]:
# Count word pairs in the stop-word-filtered positive token stream and show the 10 most frequent.
finder = nltk.collocations.BigramCollocationFinder.from_words(token_words)
finder.ngram_fd.most_common(10)

[(('front', 'desk'), 85),
 (('room', 'service'), 80),
 (('recommend', 'hotel'), 80),
 (('downtown', 'chicago'), 77),
 (('staff', 'friendly'), 77),
 (('chicago', 'hotel'), 75),
 (('walking', 'distance'), 68),
 (('highly', 'recommend'), 65),
 (('would', 'definitely'), 62),
 (('hotel', 'chicago'), 60)]

In [None]:
# Same as above for word triples.
finder = nltk.collocations.TrigramCollocationFinder.from_words(token_words)
finder.ngram_fd.most_common(10)

[(('within', 'walking', 'distance'), 34),
 (('hard', 'rock', 'hotel'), 34),
 (('ambassador', 'east', 'hotel'), 29),
 (('would', 'definitely', 'stay'), 28),
 (('recommend', 'hotel', 'anyone'), 27),
 (('highly', 'recommend', 'hotel'), 23),
 (('staff', 'friendly', 'helpful'), 23),
 (('flat', 'screen', 'tv'), 22),
 (('would', 'definitely', 'recommend'), 20),
 (('rock', 'hotel', 'chicago'), 20)]

#### Negative

In [None]:
# Combine all negative reviews into one text
X_neg = list((reviews.loc[reviews['polarity'] == 'negative'])['text'].copy())

# Join with a space: the reviews were stripped when loaded, so plain
# concatenation would fuse the last word of one review with the first word of
# the next. Note that n-grams counted on this stream can still span two reviews.
text_neg = ' '.join(X_neg)

tokenizer = nltk.RegexpTokenizer(r"\w+") # get rid of punctuation
token_words = tokenizer.tokenize(text_neg.lower())

# remove stopwords (set lookup avoids scanning the list for every token)
stopwords = nltk.corpus.stopwords.words("english")
stopword_set = set(stopwords)
token_words = [w for w in token_words if w not in stopword_set]

In [None]:
# Count word pairs in the stop-word-filtered negative token stream and show the 10 most frequent.
finder = nltk.collocations.BigramCollocationFinder.from_words(token_words)
finder.ngram_fd.most_common(10)

[(('front', 'desk'), 255),
 (('room', 'service'), 141),
 (('hotel', 'chicago'), 80),
 (('stay', 'hotel'), 72),
 (('customer', 'service'), 64),
 (('chicago', 'hotel'), 62),
 (('got', 'room'), 54),
 (('even', 'though'), 54),
 (('would', 'recommend'), 52),
 (('hard', 'rock'), 49)]

In [None]:
# Same as above for word triples.
finder = nltk.collocations.TrigramCollocationFinder.from_words(token_words)
finder.ngram_fd.most_common(10)

[(('called', 'front', 'desk'), 43),
 (('front', 'desk', 'staff'), 29),
 (('hard', 'rock', 'hotel'), 29),
 (('non', 'smoking', 'room'), 28),
 (('would', 'recommend', 'hotel'), 25),
 (('fairmont', 'chicago', 'millennium'), 21),
 (('chicago', 'millennium', 'park'), 20),
 (('4', 'star', 'hotel'), 19),
 (('rock', 'hotel', 'chicago'), 19),
 (('front', 'desk', 'clerk'), 17)]

#### Sentiment Analysis

##### Default Analyzer

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
# Shared analyzer instance; is_positive() and feature_extractor() below read this global.
sia = SentimentIntensityAnalyzer()

# test if default analyzer works
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' entries.
print(sia.polarity_scores(X_pos[0]))
print(sia.polarity_scores(X_neg[0]))

{'neg': 0.006, 'neu': 0.821, 'pos': 0.172, 'compound': 0.9823}
{'neg': 0.065, 'neu': 0.935, 'pos': 0.0, 'compound': -0.7333}




In [None]:
from statistics import mean

def is_positive(text):
  '''
  Classify a review as positive using the module-level analyzer `sia`.

  The text is split into sentences, each sentence gets a 'compound' score,
  and the review counts as positive when the mean of those scores is > 0.

  Args:
    text: review string.
  Returns:
    bool; False when the text contains no sentences.
  '''
  # score every sentence
  scores_list = [ sia.polarity_scores(sentence)['compound'] 
            for sentence in nltk.sent_tokenize(text) ]

  # mean() raises StatisticsError on an empty sequence, so text without any
  # sentence is treated as not positive.
  if not scores_list:
    return False

  # compound value > 0 if positive
  return mean(scores_list) > 0

# Spot check on one positive and one negative review.
print(is_positive(X_pos[0]))
print(is_positive(X_neg[0]))

True
False


In [None]:
# test for positive reviews: a prediction is correct when is_positive() is True
total = len(X_pos)
cnt_error = sum(1 for text in X_pos if not is_positive(text))

print(f"Positive accuracy: {(total - cnt_error) / total:.2%}")

# test for negative reviews: a prediction is correct when is_positive() is False.
# Divide by the number of negative reviews; reusing the positive total only
# gave the right figure when both sets happen to have the same size.
total = len(X_neg)
cnt_error = sum(1 for text in X_neg if is_positive(text))

print(f"Negative accuracy: {(total - cnt_error) / total:.2%}")

Positive accuracy: 99.50%
Negative accuracy: 49.38%


##### Customized Analyzer

In [None]:
nltk.download('averaged_perceptron_tagger') # for pos tag

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# TODO: Find top 100 feature words for positive and negative

X_pos = list((reviews.loc[reviews['polarity'] == 'positive'])['text'].copy())
# Join with a space: the reviews were stripped when loaded, so plain
# concatenation would fuse the last word of one review with the first word of
# the next.
text_pos = ' '.join(X_pos)

# get rid of punctuation and tokenize
tokenizer = nltk.RegexpTokenizer(r"\w+") 
token_words = tokenizer.tokenize(text_pos.lower())

# remove stopwords (set lookup avoids scanning the list for every token)
stopwords = nltk.corpus.stopwords.words("english")
stopword_set = set(stopwords)
token_words = [w for w in token_words if w not in stopword_set]

# print(nltk.pos_tag(token_words))

def skip_unwanted_word(pos_tag_tuple):
  '''
  Predicate for filter() over (word, tag) pairs.

  Despite the name, True means "keep this pair": the word must be purely
  alphabetic and its tag must not start with 'NN'.
  '''
  token, pos = pos_tag_tuple
  return token.isalpha() and not pos.startswith('NN')

# POS-tag the filtered tokens and keep the words accepted by skip_unwanted_word
# (alphabetic, tag not starting with 'NN').
# NOTE(review): tagging runs on lowercased, stop-word-stripped tokens, which may
# reduce tagger accuracy -- confirm this is intended.
positive_words = [word for word, tag in filter(skip_unwanted_word, nltk.pos_tag(token_words))]
# print(positive_words)         

In [None]:
X_neg = list((reviews.loc[reviews['polarity'] == 'negative'])['text'].copy())
# Join with a space: the reviews were stripped when loaded, so plain
# concatenation would fuse the last word of one review with the first word of
# the next.
text_neg = ' '.join(X_neg)

# get rid of punctuation and tokenize
tokenizer = nltk.RegexpTokenizer(r"\w+") 
token_words = tokenizer.tokenize(text_neg.lower())

# remove stopwords (set lookup avoids scanning the list for every token)
stopwords = nltk.corpus.stopwords.words("english")
stopword_set = set(stopwords)
token_words = [w for w in token_words if w not in stopword_set]

# keep the words accepted by skip_unwanted_word (alphabetic, tag not starting with 'NN')
negative_words = [word for word, tag in filter(skip_unwanted_word, nltk.pos_tag(token_words))]
# print(negative_words)   

In [None]:
# Frequency tables of the candidate words for each polarity.
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

# Words seen under both polarities do not discriminate, so remove them from
# both tables.
common_set = set(positive_fd).intersection(negative_fd)

for freq_dist in (positive_fd, negative_fd):
  for shared_word in common_set:
    del freq_dist[shared_word]

# The 100 most frequent polarity-exclusive words on each side.
top_positive_words = set(entry[0] for entry in positive_fd.most_common(100))
top_negative_words = set(entry[0] for entry in negative_fd.most_common(100))

In [None]:
# Words that occur only in positive reviews (set, so the print order is arbitrary).
print(top_positive_words)

{'flawless', 'award', 'alternative', 'definetly', 'pump', 'recomend', 'rex', 'dear', 'mag', 'asian', 'soldier', 'gourmet', 'warmly', 'touring', 'flew', 'spotlessly', 'outing', 'spotless', 'surpassed', 'shedd', 'minus', 'pricey', 'boyfriend', 'assist', 'bedroom', 'historical', 'heaven', 'clubs', 'boarding', 'hare', 'delight', 'virtual', 'gold', 'delivers', 'shea', 'speedy', 'cultural', 'notch', 'winning', 'rested', 'unbeatable', 'hustle', 'tastefully', 'breathtaking', 'stunned', 'knowledgable', 'immaculately', 'painless', 'efficient', 'phenomenal', 'ihome', 'superb', 'mall', 'robes', 'coordinate', 'everyday', 'experince', 'ultra', 'unbelievably', 'bartender', 'booth', 'commented', 'aveda', 'remarkable', 'soothing', 'n', 'attentiveness', 'weekends', 'impeccably', 'bathrooms', 'museums', 'crisp', 'magnificient', 'stunning', 'largely', 'social', 'starwood', 'rose', 'penthouse', 'die', 'lets', 'unwind', 'ive', 'bacon', 'breath', 'scheduling', 'cocktail', 'heated', 'suburbs', 'moderately', '

In [None]:
# Words that occur only in negative reviews (set, so the print order is arbitrary).
print(top_negative_words)

{'mold', 'ignored', 'calls', 'deliver', 'refused', 'okay', 'standing', 'peeling', 'claimed', 'poorly', 'refund', 'demanded', 'specifically', 'complaining', 'mistake', 'receptionist', 'worse', 'holiday', 'attempted', 'appears', 'constantly', 'settled', 'switch', 'card', 'carpet', 'hoping', 'luckily', 'yellow', 'musty', 'incompetent', 'unfriendly', 'miserable', 'appear', 'sub', 'imagined', 'removed', 'ugly', 'sad', 'lying', 'counter', 'stale', 'dated', 'correct', 'argued', 'appalled', 'girl', 'unfortunate', 'pictures', 'resolve', 'explain', 'worst', 'neither', 'eventually', 'unpleasant', 'broken', 'wonder', 'smelly', 'carry', 'severely', 'outrageous', 'ruined', 'unreasonable', 'unacceptable', 'needless', 'walls', 'serious', 'wallpaper', 'visible', 'washed', 'weak', 'stains', 'hearing', 'rolled', 'reached', 'sort', 'sorry', 'apologize', 'remote', 'assume', 'apologized', 'clerk', 'agreed', 'confused', 'hung', 'mediocre', 'guess', 'disgusting', 'avoid', 'packed', 'acted', 'odor', 'lukewarm'

In [None]:
# TODO: Define feature
from statistics import mean

def feature_extractor(text):
  '''
  Text feature is defined as ('mean_positive', 'mean_negative', 
  'count_of_positive_words', 'count_of_negative_words').

  Reads the module-level `sia`, `top_positive_words` and `top_negative_words`.

  mean_positive / mean_negative are means of the per-sentence 'pos' / 'neg'
  scores in which each sentence contributes once per token, so longer
  sentences weigh more. The two counts are the numbers of tokens whose
  lowercase form is in top_positive_words / top_negative_words.

  Args:
    text: review string.
  Returns:
    list [mean_positive, mean_negative, count_positive, count_negative];
    both means are 0.0 when the text yields no tokens.
  '''
  positive_scores = []
  negative_scores = []
  wordcnt_neg = 0
  wordcnt_pos = 0

  for sentence in nltk.sent_tokenize(text):
    words = nltk.word_tokenize(sentence)
    # The sentence score does not depend on the word, so compute it once per
    # sentence instead of twice per token.
    scores = sia.polarity_scores(sentence)

    for word in words:
      lowered = word.lower()
      if lowered in top_positive_words:
        wordcnt_pos += 1
      if lowered in top_negative_words:
        wordcnt_neg += 1

    # One entry per token keeps the token-weighted mean unchanged.
    positive_scores.extend([scores["pos"]] * len(words))
    negative_scores.extend([scores["neg"]] * len(words))

  # mean() raises StatisticsError on an empty sequence (e.g. empty text).
  if not positive_scores:
    return [0.0, 0.0, wordcnt_pos, wordcnt_neg]

  return [mean(positive_scores), mean(negative_scores), wordcnt_pos, wordcnt_neg]

In [None]:
# Spot check: [mean_positive, mean_negative, count_positive, count_negative] for one review of each polarity.
print(feature_extractor(X_pos[0]))
print(feature_extractor(X_neg[0]))

[0.16575238095238096, 0.0070857142857142855, 0, 0]
[0.0, 0.05288888888888889, 0, 3]


In [None]:
# TODO: Extract feature
# One 4-element feature list per review, in the same order as X.
X_sentiment = [feature_extractor(text) for text in X]

In [None]:
# The target here is the polarity label ('positive' / 'negative'), not truthfulness.
y_sentiment = list(reviews['polarity'].copy())

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the data, then run 4-fold cross-validation on the remaining 80%.
# The held-out X_test / y_test are not evaluated in this cell.
X_train, X_test, y_train, y_test = train_test_split(X_sentiment, y_sentiment, test_size=0.2, random_state=40)
sentiment_clf = LogisticRegression(solver="liblinear", random_state=40)
score = cross_val_score(sentiment_clf, X_train, y_train, cv=4)
print('Cross Validation Score: ', score.mean())

Cross Validation Score:  0.9085937500000001


## Fine-Tune Model

#### Fine-tune Model Parameter

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# max_features is searched over 1..4 because each sample has exactly the four
# features produced by feature_extractor.
param_grid = [
    {'n_estimators': [20, 30, 40, 50], 'max_features': [1, 2, 3, 4]},
]

# Seed the forest (same seed as the other models in this notebook) so the
# scores, and therefore the selected parameters, are reproducible across runs.
forest_cls = RandomForestClassifier(random_state=40)
grid_search = GridSearchCV(forest_cls, param_grid, cv=5,
                           scoring='accuracy')
# The search runs on the full sentiment data set, not on the X_train split.
grid_search.fit(X_sentiment, y_sentiment)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [None]:
# Parameter combination with the highest mean cross-validated accuracy.
print(grid_search.best_params_)

{'max_features': 2, 'n_estimators': 20}


In [None]:
# Mean cross-validated accuracy for every parameter combination tried.
cvres = grid_search.cv_results_
for accuracy, params in zip(cvres['mean_test_score'], cvres['params']):
  print(accuracy, params)

0.9293750000000001 {'max_features': 1, 'n_estimators': 20}
0.9318750000000001 {'max_features': 1, 'n_estimators': 30}
0.93125 {'max_features': 1, 'n_estimators': 40}
0.9331250000000001 {'max_features': 1, 'n_estimators': 50}
0.9337499999999999 {'max_features': 2, 'n_estimators': 20}
0.930625 {'max_features': 2, 'n_estimators': 30}
0.928125 {'max_features': 2, 'n_estimators': 40}
0.928125 {'max_features': 2, 'n_estimators': 50}
0.928125 {'max_features': 3, 'n_estimators': 20}
0.9262499999999999 {'max_features': 3, 'n_estimators': 30}
0.9293750000000001 {'max_features': 3, 'n_estimators': 40}
0.9237500000000001 {'max_features': 3, 'n_estimators': 50}
0.9262499999999999 {'max_features': 4, 'n_estimators': 20}
0.921875 {'max_features': 4, 'n_estimators': 30}
0.9231250000000001 {'max_features': 4, 'n_estimators': 40}
0.9275 {'max_features': 4, 'n_estimators': 50}


#### Fine-tune Transformer Parameter

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split

# Model parameters used for the sweep: {'max_features': 20, 'n_estimators': 50}
# NOTE(review): these values do not come from the grid search above, which
# tuned the 4-feature sentiment model and reported max_features=2,
# n_estimators=20 -- confirm where they were chosen.
# The forest is seeded so that differences between vocabulary sizes are not
# just run-to-run noise.
forest_cls = RandomForestClassifier(max_features=20, n_estimators=50, random_state=40)

param_grid = [100, 200, 300, 400, 500, 1000, 1500, 2000, 4000]

# Lemmatization does not depend on the vocabulary size, so do it once
# instead of once per candidate.
lemmatizer = TextLemmatizer()
X_lemma = lemmatizer.fit_transform(X)

best_accuracy = 0
best_params = 0
for sizeOfVoc in param_grid:
  # transform manually
  # NOTE: the vocabulary is fitted on all reviews before the split below.
  vectorizer = TextVectorizer(sizeOfVocabulary=sizeOfVoc)
  most_common, X_preprocessed = vectorizer.fit_transform(X_lemma)

  # cross validation
  X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=40)
  accuracy = cross_val_score(forest_cls, X_train, y_train, cv=5, scoring='accuracy')
  mean_accuracy = accuracy.mean()
  print('size of vocabulary: {}, accuracy: {}'.format(sizeOfVoc, mean_accuracy))
  if mean_accuracy > best_accuracy:
    best_accuracy = mean_accuracy
    best_params = sizeOfVoc

print('best accuracy: {}, best params: {}'.format(best_accuracy, best_params))

size of vocabulary: 100, accuracy: 0.746875
size of vocabulary: 200, accuracy: 0.7734375
size of vocabulary: 300, accuracy: 0.80546875
size of vocabulary: 400, accuracy: 0.8203125
size of vocabulary: 500, accuracy: 0.81484375
size of vocabulary: 1000, accuracy: 0.8265625
size of vocabulary: 1500, accuracy: 0.8375
size of vocabulary: 2000, accuracy: 0.825
size of vocabulary: 4000, accuracy: 0.81875
best accuracy: 0.8375, best params: 1500


The best vocabulary size lies between 500 and 2000; in this run, accuracy peaked at 1500.

## Training Example

##### Mix Test

In [None]:
from sklearn.pipeline import Pipeline

# Retrieve texts and labels from the in-memory `reviews` DataFrame
X = list(reviews['text'].copy())
y = list(reviews['is_truthful'].copy())

# Define pipeline: lemmatize, then count the 100 most frequent words
preprocessPipeline = Pipeline([
  ("lemmatize text", TextLemmatizer()),
  ("text to feature", TextVectorizer(sizeOfVocabulary=100)),
])

# Preprocess data (the vectorizer returns a (vocabulary, matrix) tuple)
most_common_voc, X_preprocessed_mat = preprocessPipeline.fit_transform(X)

# X_preprocessed_mat is a sparse matrix; toarray() converts it to a dense
# NumPy array
X_preprocessed = X_preprocessed_mat.toarray()

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20%, then 4-fold cross-validate logistic regression on the remaining 80%.
# The held-out X_test / y_test are not evaluated in this cell.
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=40)
log_clf = LogisticRegression(solver="liblinear", random_state=40)
score = cross_val_score(log_clf, X_train, y_train, cv=4)
print('Cross Validation Score: ', score.mean())

Cross Validation Score:  0.7796875


##### Positive Review Test

In [None]:
from sklearn.pipeline import Pipeline

# Retrieve positive-polarity texts and labels from the in-memory `reviews` DataFrame
X_pos = list((reviews.loc[reviews['polarity'] == 'positive'])['text'].copy())
y_pos = list((reviews.loc[reviews['polarity'] == 'positive'])['is_truthful'].copy())

# Define pipeline. TextVectorizer() uses its default vocabulary size here,
# whereas the mixed test above passes sizeOfVocabulary=100, so the two
# scores are not directly comparable.
preprocessPipeline = Pipeline([
  ("lemmatize text", TextLemmatizer()),
  ("text to feature", TextVectorizer()),
])

# Preprocess data (the vectorizer returns a (vocabulary, matrix) tuple)
most_common_voc, X_preprocessed_mat = preprocessPipeline.fit_transform(X_pos)

# X_preprocessed_mat is a sparse matrix; toarray() converts it to a dense
# NumPy array
X_pos_preprocessed = X_preprocessed_mat.toarray()

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20%, then 4-fold cross-validate logistic regression on the remaining 80%.
# The held-out X_test / y_test are not evaluated in this cell.
X_train, X_test, y_train, y_test = train_test_split(X_pos_preprocessed, y_pos, test_size=0.2, random_state=40)
log_clf = LogisticRegression(solver="liblinear", random_state=40)
score = cross_val_score(log_clf, X_train, y_train, cv=4)
print('Cross Validation Score: ', score.mean())

Cross Validation Score:  0.8390624999999999


##### Negative Review Test

In [None]:
from sklearn.pipeline import Pipeline

# Retrieve negative-polarity texts and labels from the in-memory `reviews` DataFrame
X_neg = list((reviews.loc[reviews['polarity'] == 'negative'])['text'].copy())
y_neg = list((reviews.loc[reviews['polarity'] == 'negative'])['is_truthful'].copy())

# Preprocess data. No pipeline is defined in this cell: it reuses whichever
# preprocessPipeline was defined last (in notebook order, the one from the
# positive-review test) and refits it on the negative reviews.
most_common_voc, X_preprocessed_mat = preprocessPipeline.fit_transform(X_neg)

# X_preprocessed_mat is a sparse matrix; toarray() converts it to a dense
# NumPy array
X_neg_preprocessed = X_preprocessed_mat.toarray()

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20%, then 4-fold cross-validate logistic regression on the remaining 80%.
# The held-out X_test / y_test are not evaluated in this cell.
X_train, X_test, y_train, y_test = train_test_split(X_neg_preprocessed, y_neg, test_size=0.2, random_state=40)
log_clf = LogisticRegression(solver="liblinear", random_state=40)
score = cross_val_score(log_clf, X_train, y_train, cv=4)
print('Cross Validation Score: ', score.mean())

Cross Validation Score:  0.8546875
