# Preprocess

## Data Analysis

#### Retrieve Data

In [1]:
from google.colab import files
files.upload()

Output hidden; open in https://colab.research.google.com to view.

In [2]:
!unzip -q op_spam_v1.4.zip -d datasets/

In [3]:
import os

DATA_PATH = "/content/datasets/"
FILENAME = "op_spam_v1.4"
FULL_DATA_PATH = os.path.join(DATA_PATH, FILENAME)

neg_dec_dir = FULL_DATA_PATH + '/negative_polarity' + '/deceptive_from_MTurk/'
neg_tru_dir = FULL_DATA_PATH + '/negative_polarity' + '/truthful_from_Web/'
pos_dec_dir = FULL_DATA_PATH + '/positive_polarity' + '/deceptive_from_MTurk/'
pos_tru_dir = FULL_DATA_PATH + '/positive_polarity' + '/truthful_from_TripAdvisor/'

folds = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']

In [4]:
import pandas as pd

_review = []
_is_truthful = []
_filename = []
_hotel_name = []
_polarity = []
_source = []
_fold = []

for p, polar, src in zip([neg_dec_dir, neg_tru_dir, pos_dec_dir, pos_tru_dir],
                         ['negative', 'negative', 'positive', 'positive'],
                         ['MTurk', 'Web', 'MTurk', 'TripAdvisor']):
    for fold in folds:
        for filename in os.listdir(p + fold):
            with open(p + fold + '/' + filename) as f:
                msg = f.read()
            _review.append(msg.strip())
            _is_truthful.append(int(filename.startswith('t')))
            _filename.append(filename)
            _hotel_name.append(filename.split('_')[1])
            _polarity.append(polar)
            _source.append(src)
            _fold.append(int(fold[-1]))
            
reviews = pd.DataFrame({
    'text': _review,
    'is_truthful': _is_truthful,
    'polarity': _polarity,
    'hotel_name': _hotel_name,
    'source': _source,
    'fold': _fold,
    'filename': _filename,
})

In [5]:
reviews.head()

Unnamed: 0,text,is_truthful,polarity,hotel_name,source,fold,filename
0,When we got checked and arrived at our room th...,0,negative,monaco,MTurk,1,d_monaco_2.txt
1,"The James Chicago is a stuffy, uninviting hote...",0,negative,james,MTurk,1,d_james_9.txt
2,We booked a room at the Hilton Chicago for two...,0,negative,hilton,MTurk,1,d_hilton_17.txt
3,"For a hotel rated with four diamonds by AAA, o...",0,negative,hilton,MTurk,1,d_hilton_15.txt
4,I was very disappointed with this hotel. The f...,0,negative,hilton,MTurk,1,d_hilton_9.txt


#### Raw Data Analysis

In [36]:
import plotly.express as px

fig = px.histogram(reviews, x='hotel_name', y='is_truthful', histfunc='sum', color='polarity')
fig.show()

It shows that the data is evenly distributed in different hotels, and each hotel has an equal number of positive and negative, truthful and deceptive data. 

## Preprocess

#### TextStemmer

In [42]:
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
from nltk.stem import PorterStemmer
nltk.download('punkt')

'''
TextStemmer:
  The class will do following things:
    - convert to lowercase
    - remove punctuation
    - replace numbers with the string 'NUMBER'
    - perform stemming (trim word ending with library)
  Return a list with stemmed text
'''
class TextStemmer(BaseEstimator, TransformerMixin):
  def __init__(self, 
               lowercaseConversion=True,
               punctuationRemoval=True,
               numberReplacement=True,
               stemming=True
               ):
    self.lowercaseConversion = lowercaseConversion
    self.punctuationRemoval = punctuationRemoval
    self.numberReplacement = numberReplacement
    self.stemming = stemming
    self.stemmer = PorterStemmer()
  
  def fit(self, X, y=None):
    return self
  
  def transform(self, X, y=None):
    X_stemmed = []

    for review in X:
      text = review
      if text is None:
        text = 'Null'

      if self.punctuationRemoval:
        text = text.replace('.', '')
        text = text.replace(',', '')
        text = text.replace('?', '')
        text = text.replace('!', '')
        text = text.replace('(', '')
        text = text.replace(')', '')
        text = text.replace('\'s', '')
    
      token_words = nltk.word_tokenize(text)
      stem_text_list = []

      for word in token_words:
        stem_text_list.append(self.stemmer.stem(word))
        stem_text_list.append(" ")
      
      text_stemmed = "".join(stem_text_list)
      
      X_stemmed.append(text_stemmed)
    
    return X_stemmed

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [43]:
X = list(reviews['text'].copy())
y = list(reviews['is_truthful'].copy())

stemmer = TextStemmer()
X_stem = stemmer.fit_transform(X)

stem_df = pd.DataFrame({
  'text': X_stem,
  'is_truthful': y,
})

stem_df.head()

Unnamed: 0,text,is_truthful
0,when we got check and arriv at our room the fi...,0
1,the jame chicago is a stuffi uninvit hotel If ...,0
2,We book a room at the hilton chicago for two n...,0
3,for a hotel rate with four diamond by aaa one ...,0
4,I wa veri disappoint with thi hotel the front ...,0


#### TextLemmatizer

In [39]:
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

'''
TextLemmatizer:
  The class will do following things:
    - convert to lowercase
    - remove punctuation
    - replace numbers with the string 'NUMBER'
    - perform lemmatizing (transform into root form)
    - remove stop words
  Return a list with stemmed text
'''
class TextLemmatizer(BaseEstimator, TransformerMixin):
  def __init__(self, 
               lowercaseConversion=True,
               punctuationRemoval=True,
               numberReplacement=True,
               lemmatizing=True,
               removeStopwords=True
               ):
    self.lowercaseConversion = lowercaseConversion
    self.punctuationRemoval = punctuationRemoval
    self.numberReplacement = numberReplacement
    self.lemmatizing = lemmatizing
    self.lemmatizer = nltk.WordNetLemmatizer()
    
    self.removeStopwords = removeStopwords
    self.stopwords = nltk.corpus.stopwords.words('english')
    self.stopwords.extend(['!', ',', '.', '?', '-s', '-ly', '</s>',
                        's', '(', ')', '\'s', 'n\'t', '$', '2',
                        ':', '\'\'', '``', '-', '--'])
  
  def fit(self, X, y=None):
    return self
  
  def transform(self, X, y=None):
    X_lemmatized = []

    for review in X:
      text = review
      if text is None:
        text = 'Null'

      if self.lowercaseConversion:
        text = text.lower()

      if self.punctuationRemoval:
        text = text.replace('.', '')
        text = text.replace(',', '')
        text = text.replace('?', '')
        text = text.replace('!', '')
        text = text.replace('(', '')
        text = text.replace(')', '')
    
      token_words = nltk.word_tokenize(text)
      lemma_text_list = []

      for word in token_words:
        # notice that we have to give a context for lemmatizer
        lemma_text_list.append(self.lemmatizer.lemmatize(word, pos='v'))
      
      if self.removeStopwords:
        lemma_text_list_filtered = [ w for w in lemma_text_list if not w in self.stopwords ]

      text_lemmatized = " ".join(lemma_text_list_filtered)
      
      X_lemmatized.append(text_lemmatized)
    
    self.result = X_lemmatized
    
    return X_lemmatized

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [41]:
X = list(reviews['text'].copy())
y = list(reviews['is_truthful'].copy())

lemmatizer = TextLemmatizer()
X_lemma = lemmatizer.fit_transform(X)

lemma_df = pd.DataFrame({
  'text': X_lemma,
  'is_truthful': y,
})

lemma_df.head()

Unnamed: 0,text,is_truthful
0,get check arrive room first thing notice light...,0
1,jam chicago stuffy uninviting hotel look comfo...,0
2,book room hilton chicago two nights stay weeke...,0
3,hotel rat four diamonds aaa one would think hi...,0
4,disappoint hotel front desk clerk rude ; perso...,0


#### TextVectorizer

In [40]:
from scipy.sparse import csr_matrix
from collections import Counter

'''
TextVectorizer:
  The class will do following things:
    - count word frequency
    - generate feature vector consisting of top n frequent words
  Return a tuple (most_common_voc, voc_cnt, sparse_matrix)
'''

class TextVectorizer(BaseEstimator, TransformerMixin):
  def __init__(self, sizeOfVocabulary=1000):
    self.sizeOfVocabulary = sizeOfVocabulary
  
  def fit(self, X, y=None):
    # statistic of filtered words, after removing stop words
    wordCnt = []
    for text in X:
      token_words = nltk.word_tokenize(text)
      wordCnt.append(Counter(token_words))
    self.wordCnt = wordCnt

    WordCnt_all = Counter()
    for counter in wordCnt:
      for word, count in counter.items():
        WordCnt_all[word] += count
    self.most_common = WordCnt_all.most_common()[:self.sizeOfVocabulary]
    self.vocabulary = { word: index for index, (word, count) in enumerate(self.most_common) }

    return self

  def transform(self, X, y=None):
    wordCnt = []
    for text in X:
      token_words = nltk.word_tokenize(text)
      wordCnt.append(Counter(token_words))

    rows = []
    cols = []
    data = []
    for row, counter in enumerate(wordCnt):
      for word, count in counter.items():
        rows.append(row)
        # the last column is the sum of the count of words not in the vocabulary
        cols.append(self.vocabulary.get(word, self.sizeOfVocabulary))
        data.append(count)
    
    csrMat = csr_matrix((data, (rows, cols)), shape=(len(X), self.sizeOfVocabulary + 1))
    return (self.most_common, csrMat)

In [45]:
from sklearn.pipeline import Pipeline

X = list(reviews['text'].copy())
y = list(reviews['is_truthful'].copy())

preprocessPipeline = Pipeline([
  ("lemmatize text", TextLemmatizer()),
  ("text to feature", TextVectorizer(sizeOfVocabulary=100)),
])

most_common_voc, X_preprocessed = preprocessPipeline.fit_transform(X)

print(X_preprocessed.toarray())
print(most_common_voc)

[[ 3  0  0 ...  0  0 30]
 [ 1  3  1 ...  0  0 19]
 [ 2  2  2 ...  0  0 37]
 ...
 [ 1  0  0 ...  0  0 13]
 [ 4  0  3 ...  0  0 47]
 [ 2  0  1 ...  0  0 35]]
[('room', 3451), ('hotel', 3324), ('stay', 2259), ('chicago', 1519), ('get', 1031), ('would', 983), ('service', 861), ('great', 857), ('staff', 848), ('go', 754), ('one', 707), ('bed', 681), ('time', 622), ('make', 618), ('like', 603), ('could', 578), ('us', 578), ('night', 542), ('clean', 538), ('even', 522), ('desk', 501), ('check', 490), ('nice', 485), ('location', 479), ('look', 472), ('place', 457), ('take', 450), ('say', 434), ('call', 432), ('also', 425), ('find', 412), ('good', 399), ('experience', 393), ('back', 389), ('front', 389), ('come', 361), ('view', 352), ('first', 349), ('give', 345), ('recommend', 345), ('walk', 341), ('bathroom', 339), ('next', 332), ('day', 326), ('really', 321), ('well', 318), ('two', 316), ('book', 312), ('friendly', 310), ('never', 309), ('comfortable', 308), ('ask', 304), ('price', 304), ('h

In [11]:
import pandas as pd
import numpy as np
import plotly.express as px

words = [ word for word, count in most_common_voc ]
counts = [ count for word, count in most_common_voc ]

most_common_df = pd.DataFrame({
    'words': words,
    'counts': counts,
})

fig = px.scatter(most_common_df, x=words, y=counts)
fig.show()

In [12]:
import numpy as np

# TODO: construct dataframe
data = np.c_[X_preprocessed.toarray(), y]

cols = []
for name, count in most_common_voc:
  cols.append(name)
cols.append('others')
cols.append('is_truthful')

data_df = pd.DataFrame(data, columns=cols)

In [13]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Columns: 102 entries, room to is_truthful
dtypes: int64(102)
memory usage: 1.2 MB


In [14]:
corr_matrix = data_df.corr()
corr_matrix["is_truthful"].sort_values(ascending=False)

is_truthful    1.000000
location       0.246782
floor          0.190081
small          0.170599
great          0.156438
                 ...   
look          -0.121371
hotel         -0.130839
visit         -0.143806
experience    -0.146691
chicago       -0.337182
Name: is_truthful, Length: 102, dtype: float64

In [15]:
def corr_visual(df, word1, word2):
  # TODO: count by (word1, word2, is_truthful)
  word1s = list(df[word1].copy())
  word2s = list(df[word2].copy())
  truth = list(df['is_truthful'].copy())

  word_tuples = [ (word1s[i], word2s[i], truth[i]) for i in range(len(truth))]

  word_tuple_counter = dict()

  for word_tuple in word_tuples:
    if word_tuple not in word_tuple_counter:
      word_tuple_counter[word_tuple] = 1
    else:
      word_tuple_counter[word_tuple] += 1
  
  print(word_tuple_counter)

  # construct new dataframe
  word1s_ = [ word1 for (word1, word2, truth) in word_tuple_counter.keys() ]
  word2s_ = [ word2 for (word1, word2, truth) in word_tuple_counter.keys() ]
  is_truthful_ = [ truth for (word1, word2, truth) in word_tuple_counter.keys() ]
  counts_ = [ count for key, count in word_tuple_counter.items() ]

  words_df = pd.DataFrame({
      word1: word1s_,
      word2: word2s_,
      'count': counts_,
      'is_truthful': is_truthful_
  })

  fig = px.scatter(words_df, x=word1, y=word2, color='is_truthful', size='count', size_max=60)
  fig.show()

  return words_df

new_df = corr_visual(data_df, 'location', 'chicago')

{(0, 0, 0): 158, (0, 1, 0): 282, (0, 2, 0): 163, (0, 3, 0): 58, (1, 2, 0): 32, (1, 1, 0): 39, (0, 4, 0): 13, (5, 1, 0): 1, (1, 0, 0): 25, (2, 0, 0): 2, (0, 5, 0): 5, (0, 7, 0): 1, (1, 3, 0): 8, (2, 1, 0): 4, (2, 2, 0): 2, (0, 0, 1): 290, (0, 1, 1): 147, (1, 1, 1): 84, (1, 0, 1): 145, (0, 2, 1): 46, (3, 0, 1): 2, (0, 6, 1): 2, (0, 4, 1): 2, (0, 3, 1): 9, (2, 1, 1): 14, (2, 0, 1): 13, (1, 2, 1): 31, (1, 4, 1): 2, (1, 4, 0): 2, (2, 3, 0): 2, (1, 5, 0): 1, (0, 6, 0): 1, (1, 7, 0): 1, (3, 1, 1): 1, (2, 2, 1): 6, (1, 3, 1): 3, (2, 3, 1): 3}


In [24]:
reviews.head()

Unnamed: 0,text,is_truthful,polarity,hotel_name,source,fold,filename
0,When we got checked and arrived at our room th...,0,negative,monaco,MTurk,1,d_monaco_2.txt
1,"The James Chicago is a stuffy, uninviting hote...",0,negative,james,MTurk,1,d_james_9.txt
2,We booked a room at the Hilton Chicago for two...,0,negative,hilton,MTurk,1,d_hilton_17.txt
3,"For a hotel rated with four diamonds by AAA, o...",0,negative,hilton,MTurk,1,d_hilton_15.txt
4,I was very disappointed with this hotel. The f...,0,negative,hilton,MTurk,1,d_hilton_9.txt


In [28]:
import plotly.express as px
df = px.data.tips()
# Here we use a column with categorical data
fig = px.histogram(df, x="day")
fig.show()

In [29]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
