In [1]:
from __future__ import print_function  # needed for Python 2
from __future__ import division        # needed for Python 2
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('always') # always show warnings, useful for SK Learning warnings

In [2]:
# Pick the id, review text and label out of one parsed row of the input file.
# Column layout of a row:
# DOC_ID	LABEL	RATING	VERIFIED_PURCHASE	PRODUCT_CATEGORY	PRODUCT_ID	PRODUCT_TITLE	REVIEW_TITLE	REVIEW_TEXT
def parseReview(reviewLine):
    """Return (DOC_ID, REVIEW_TEXT, LABEL) taken from one row.

    reviewLine is an indexable row with at least 9 columns. The three fields
    are returned exactly as stored in the row; no type conversion is done.
    """
    docId, reviewText, label = reviewLine[0], reviewLine[8], reviewLine[1]
    return (docId, reviewText, label)

In [3]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer 

# preProcess - process one piece of text and return a list of processed tokens
# includes various normalisation options
#
# parameters:
# @lower - convert all text to lowercase
# @punc - remove punctuation (and the '<br />' / '&#34;' markup)
# @stem - simplify into stems using NLTK
# @stop - remove stop words using either SKLearn = 1 or NLTK = 2
# @lemma - lemmatize with NLTK

def preProcess(text, lower=1, punc=1, stem=1, stop=2, lemma=0):
    """Normalise text and split it into a list of tokens.

    Each option is switched on by passing 1 (or 1/2 for stop); any other value
    leaves that step out. Steps run in this order: markup/punctuation removal,
    lowercasing, whitespace split, stemming, stop-word removal, lemmatization.

    Returns the list of tokens that remain.
    """
    if punc == 1:
        # Swap markup for a space rather than an empty string so the words on
        # either side of a tag are not fused into one token.
        text = text.replace('<br />', ' ')  # HTML newline tags
        text = text.replace('&#34;', ' ')   # encoded quotation mark
        # Brackets and full stops often sit directly between two words, so they
        # become spaces; all remaining punctuation is simply deleted.
        text = text.replace(')', ' ')
        text = text.replace('(', ' ')
        text = text.replace('.', ' ')
        text = text.translate(str.maketrans('', '', string.punctuation))

    if lower == 1:
        text = text.lower()

    tokens = text.split()  # split on any run of whitespace

    # reduce words to their stems using NLTK
    # NOTE(review): stemming runs before stop-word removal, so a stop word whose
    # stem differs from its listed form may survive - confirm this order is intended.
    if stem == 1:
        tokens = stemTokens(tokens)

    # remove stop words: 1 = SKLearn list, 2 = NLTK list
    if stop in (1, 2):
        tokens = stopTokens(tokens, stop)

    if lemma == 1:
        tokens = lemmaTokens(tokens)

    return tokens

# reduce every token to its stem with NLTK's English SnowballStemmer
def stemTokens(tokens):
    """Return a new list holding the Snowball stem of each token, in order."""
    snowball = SnowballStemmer("english")
    return [snowball.stem(word) for word in tokens]

# remove stop words
# lib = 1, use the SKLearn English stop word list
# lib = 2, use the NLTK English stop word list
def stopTokens(tokens, lib):
    """Return the tokens that are not stop words, keeping order and repeats.

    tokens: iterable of token strings.
    lib:    1 to filter with scikit-learn's English stop word list,
            2 to filter with NLTK's English stop word list.

    Raises ValueError for any other value of lib.
    """
    if lib == 1:
        # Same word list that TfidfVectorizer(stop_words="english") applies.
        # Filtering against it directly (instead of fitting a vectorizer on the
        # tokens) keeps token order and repeated tokens, and does not fail when
        # every token is a stop word.
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
        stop_words = ENGLISH_STOP_WORDS
    elif lib == 2:
        # type set(stopwords.words('english')) in interpreter to see list of words
        stop_words = set(stopwords.words('english'))
    else:
        raise ValueError("lib must be 1 (SKLearn) or 2 (NLTK), got %r" % (lib,))

    return [w for w in tokens if w not in stop_words]

# Lemmatize tokens with NLTK's WordNetLemmatizer
def lemmaTokens(tokens):
    """Return a new list with every token lemmatized as a verb (pos="v")."""
    wordnet = WordNetLemmatizer()
    # every token is treated as a verb, e.g. running to run
    return [wordnet.lemmatize(word, pos="v") for word in tokens]

In [4]:
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    """Count the tokens of one sample.

    Returns a dict mapping each distinct token to the number of times it occurs
    in tokens. As a side effect the same occurrences are added to the global
    featureDict, which accumulates counts across calls.
    """
    featureVector = {}
    for word in tokens:
        # per-sample count (count weighting; for binary weighting each present
        # token would be stored as 1 instead of being incremented)
        featureVector[word] = featureVector.get(word, 0) + 1

        # running count across all samples
        featureDict[word] = featureDict.get(word, 0) + 1

    return featureVector

In [5]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    """Classify every sample and return what classifier.classify_many gives back.

    Only the first element of each sample is handed to the classifier; anything
    after it (such as a label) is ignored.
    """
    featureVectors = (sample[0] for sample in reviewSamples)
    return classifier.classify_many(featureVectors)

In [6]:
# read a tab-separated review file and append its rows to rawData / preprocessedData
def loadData(path, Text=None):
    """Load every review row of the file at path into the global lists.

    For each row after the header line, (id, text, label) is appended to
    rawData and (id, preProcess(text), label) to preprocessedData.
    The Text argument is not used.
    """
    # latin-1 is used to get around Unicode decode errors when reading the file
    with open(path, newline='', encoding="latin-1") as reviewFile:
        next(reviewFile)  # the first line holds the column headers
        for row in csv.reader(reviewFile, delimiter='\t'):
            docId, reviewText, label = parseReview(row)
            rawData.append((docId, reviewText, label))
            preprocessedData.append((docId, preProcess(reviewText), label))
        print("All data loaded from " + path)
        
def processVector():
    """Append a (featureVector, label) pair to vectoredProcessed for every rawData row.

    The feature vector is built by running toFeatureVector on the preprocessed
    review text of the row.
    """
    vectoredProcessed.extend(
        (toFeatureVector(preProcess(reviewText)), label)
        for (_, reviewText, label) in rawData
    )

In [7]:
# MAIN
# file to read the reviews from
#reviewPath = '100_reviews.txt' # a smaller set of the data for fast iteration
reviewPath = 'amazon_reviews.txt'

# start from an empty feature dictionary and empty data containers
featureDict = {}
rawData = []            # rows as read from the dataset file (should be 21000 samples)
preprocessedData = []   # the preprocessed reviews (just to see how the preprocessing is doing)
vectoredProcessed = []  # the finished data: (feature vector, label) pairs

## Do the actual stuff
# parse the dataset into the raw data list
print("Now %d rawData" % (len(rawData)))
print("Preparing the dataset...")
loadData(reviewPath)

Now 0 rawData
Preparing the dataset...
All data loaded from amazon_reviews.txt


In [8]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    """Train a LinearSVC (max_iter=10000) wrapped in an NLTK SklearnClassifier.

    trainData is handed unchanged to SklearnClassifier.train; the value that
    call returns is returned.
    """
    print("Training Classifier...")
    svmPipeline = Pipeline([('svc', LinearSVC(max_iter=10000))], verbose=False)
    wrapped = SklearnClassifier(svmPipeline)
    return wrapped.train(trainData)

In [9]:
# 10-FOLD CROSS VALIDATION

from sklearn.metrics import precision_recall_fscore_support

def crossValidate(dataset, folds):
    """Run k-fold cross validation and return one result per fold.

    dataset: list of (featureVector, label) pairs. It is shuffled IN PLACE once
             before the folds are cut, so the caller's list order changes.
    folds:   number of folds (at least 2, at most len(dataset)).

    The data is cut into exactly `folds` contiguous held-out slices that cover
    every sample; when len(dataset) is not a multiple of folds the slice sizes
    differ by at most one. For each slice a classifier is trained on the
    remaining samples and scored on the slice.

    Returns a list with one precision_recall_fscore_support(..., average='weighted')
    result per fold.

    Raises ValueError if folds < 2 or if there are fewer samples than folds.
    """
    sampleCount = len(dataset)
    if folds < 2:
        raise ValueError("folds must be at least 2, got %r" % (folds,))
    if sampleCount < folds:
        raise ValueError("need at least %d samples for %d folds, got %d"
                         % (folds, folds, sampleCount))

    shuffle(dataset) # shuffle only once before implementing k-folds to ensure distribution of fake/real in dataset
    cv_results = []

    for fold in range(folds):
        # integer boundaries of the held-out slice; consecutive slices tile the whole dataset
        start = (sampleCount * fold) // folds
        stop = (sampleCount * (fold + 1)) // folds

        crossTest = dataset[start:stop]
        crossTrain = dataset[:start] + dataset[stop:] # everything outside the held-out slice

        # train on the trainData
        classifier = trainClassifier(crossTrain)

        # predict on the testData with the classifier
        crossPredictions = predictLabels(crossTest, classifier)

        # the actual labels for the testData
        crossActual = [sample[1] for sample in crossTest]

        # use sklearn.metrics to see how our predictions did
        # provide a weighted score to account for imbalance between labels
        result = precision_recall_fscore_support(crossActual, crossPredictions, average='weighted')

        cv_results.append(result)
    return cv_results

In [10]:
# after changing the settings of toFeatureVector()/preProcess(), the processed data and
# featureDict must be emptied before they are rebuilt
featureDict, vectoredProcessed = {}, []

# run preProcess() and toFeatureVector() over rawData to fill vectoredProcessed for cross validation
processVector()

In [11]:
cv_results = crossValidate(vectoredProcessed, 10)
# print(cv_results)

# sum the first three entries of every fold result: precision, recall, f score
total = [0, 0, 0]
for foldResult in cv_results:
    for metric in range(3):
        total[metric] += foldResult[metric]

# averages over the folds
total = [metricSum / len(cv_results) for metricSum in total]

print("Average: ", total)

Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Average:  [0.610302720332182, 0.6099047619047618, 0.6096648959279517]


# Average results over 10-fold cross validation
` Average: [precision, recall, fscore]`

 1. with basic split() based on whitespace (features: 89115)

> Average:  [0.6055228430471471, 0.6049404761904762, 0.6048509385677797]

 2. converted to lowercase, punctuation removed (features: 41397)
 
> Average:  [0.6063646761606878, 0.6058333333333333, 0.6056341871532889]


 3. converted to lowercase, punctuation removed, removing stop words with NLTK, stemming words with NLTK (features: 30015)
 
> Average:  [0.6112074377868859, 0.6105357142857143, 0.6102143961844734]


 4. converted to lowercase, punctuation removed, removing stop words with SKLearn, stemming words with NLTK (features: 29760)

> Average:  [0.6040157712422838, 0.6033928571428572, 0.6030488493264483]
 
 
 5. converted to lowercase, punctuation removed, removing stop words with NLTK, lemmatizing with NLTK (verb)
 
> Average:  [0.6041187396998227, 0.60375, 0.6034261476908608]

We tidied up the words by removing punctuation and various tokens in the text such as `<br />` and `&#34;`. This reduced the feature count and gave a very minor improvement.

We had improved results using stemming with NLTK over lemmatization. We were unable to fully lemmatize since we did not have morphological information for each word (e.g. noun, verb). As such we lemmatized each word assuming it was a verb, so "running" would produce "run" for example. Removing stop words with NLTK performed better than removing them with SKLearn.

Removing stop words and stemming with NLTK had the best effect overall, although still a relatively minor improvement from the baseline whitespace.

# Feature weighting

Feature weighting was done based on the count of tokens in a given sentence. We compared our best result above with the same preProcess settings, however with feature weighting as a binary option.

Binary feature weighting: 
> Average: [0.6011458453150506, 0.6008928571428572, 0.6006165656925394]

vs feature count weighting: 
> Average: [0.6112074377868859, 0.6105357142857143, 0.6102143961844734]

Feature count weighting does improve the performance although we are still only talking about an improvement of 1/100th so relatively minor again.

# Including other features

The next stage is to include other features such as rating and verified purchase from the data to see if this improves prediction.

We will add RATING, VERIFIED_PURCHASE and PRODUCT_TITLE. I posit verified purchases may be more likely to be true since there is a cost involved. I would expect a large number of fake reviews to be non-verified. I have also chosen rating as I suspect most fake reviews will be 5 star, so this should have good predictive power. Intuitively I would suspect certain categories of product are more likely to have fake reviews posted for them, and so we could use that to predict more accurately. However Amazon has provided us with equal distributions of fake/real (350 each) in the same categories, so while this may have some predictive power combined with other features, alone it is unlikely to predict much. As such I have chosen the product title as the final feature, as this can have a similar predictive power. I expect certain types of products attract more fake reviews than others, and key words in the title may highlight these.

We will use toFeatureVectorQ5() to include these features in the information passed to predictLabels(). Rating takes a value from 1 to 5 and verified purchase is either Y or N; both are passed through as the strings read into rawData (e.g. `'rating': '4'`, `'verified': 'N'`). The product title will be pre-processed and its tokens included.

In [12]:
def parseReviewQ5(reviewLine):
    """Return (DOC_ID, REVIEW_TEXT, LABEL, RATING, VERIFIED_PURCHASE, PRODUCT_TITLE).

    The fields are read from columns 0, 8, 1, 2, 3 and 6 of reviewLine and
    returned exactly as stored; no type conversion is done.
    """
    # column index of each returned field, in output order:
    # DOC_ID, REVIEW_TEXT, LABEL, RATING, VERIFIED_PURCHASE, PRODUCT_TITLE
    wantedColumns = (0, 8, 1, 2, 3, 6)
    return tuple(reviewLine[column] for column in wantedColumns)

In [13]:
# read a tab-separated review file and append its rows (with the extra fields) to rawData / preprocessedData
def loadDataQ5(path, Text=None):
    """Load every review row of the file at path into the global lists.

    For each row after the header line, (id, text, label, rating, verified, title)
    is appended to rawData, and the same tuple with text and title run through
    preProcess is appended to preprocessedData. The Text argument is not used.
    """
    # latin-1 is used to get around Unicode decode errors when reading the file
    with open(path, newline='', encoding="latin-1") as reviewFile:
        next(reviewFile)  # the first line holds the column headers
        for row in csv.reader(reviewFile, delimiter='\t'):
            docId, reviewText, label, rating, verified, title = parseReviewQ5(row)
            rawData.append((docId, reviewText, label, rating, verified, title))
            preprocessedData.append(
                (docId, preProcess(reviewText), label, rating, verified, preProcess(title))
            )
        print("All data loaded from " + path)

def processVectorQ5():
    """Append a (featureVector, label) pair to vectoredProcessed for every rawData row.

    The feature vector comes from toFeatureVectorQ5, fed with the preprocessed
    review text, the rating, the verified flag and the preprocessed title.
    """
    vectoredProcessed.extend(
        (toFeatureVectorQ5(preProcess(reviewText), rating, verified, preProcess(title)), label)
        for (_, reviewText, label, rating, verified, title) in rawData
    )

In [14]:
featureDict = {} # A global dictionary of features

def toFeatureVectorQ5(text, rating, verified, title):
    """Build the feature dict for one review from its tokens and metadata.

    text and title are token lists; their tokens are counted into the same
    dict, so a token that occurs in both has its occurrences added together.
    rating and verified are stored unchanged under the keys 'rating' and
    'verified'; these are written last, so they replace any token count stored
    under the same key.

    Side effect: only the review-text tokens are added to the global featureDict.
    """
    featureVector = {}

    for word in text:
        featureVector[word] = featureVector.get(word, 0) + 1  # per-sample count
        featureDict[word] = featureDict.get(word, 0) + 1      # count across all samples

    for word in title:
        featureVector[word] = featureVector.get(word, 0) + 1

    featureVector.update({'rating': rating, 'verified': verified})

    return featureVector

In [23]:
# MAIN
# file to read the reviews from
#reviewPath = '100_reviews.txt'
reviewPath = 'amazon_reviews.txt'

# start from an empty feature dictionary and empty data containers
featureDict = {}
rawData = []            # rows as read from the dataset file (should be 21000 samples)
preprocessedData = []   # the preprocessed reviews (just to see how the preprocessing is doing)
vectoredProcessed = []  # the finished data: (feature vector, label) pairs

## Do the actual stuff
# parse the dataset (including the extra fields) into the raw data list
print("Now %d rawData" % (len(rawData)))
print("Preparing the dataset...")
loadDataQ5(reviewPath)

Now 0 rawData
Preparing the dataset...
All data loaded from amazon_reviews.txt


In [24]:
# after changing the settings of toFeatureVectorQ5()/preProcess(), the processed data and
# featureDict must be emptied before they are rebuilt
featureDict, vectoredProcessed = {}, []

# run preProcess() and toFeatureVectorQ5() over rawData to fill vectoredProcessed for cross validation
processVectorQ5()

In [25]:
# show the first (featureVector, label) pair as a check on what the classifier will receive
print(vectoredProcessed[0])

({'least': 1, 'think': 1, 'product': 1, 'save': 1, 'day': 1, 'keep': 1, 'around': 1, 'case': 1, 'need': 1, 'someth': 1, 'targus': 1, 'pauk10u': 1, 'ultra': 1, 'mini': 1, 'usb': 1, 'keypad': 1, 'black': 1, 'rating': '4', 'verified': 'N'}, '__label1__')


In [26]:
cv_results = crossValidate(vectoredProcessed, 10)
# print(cv_results)

# sum the first three entries of every fold result: precision, recall, f score
total = [0, 0, 0]
for foldResult in cv_results:
    for metric in range(3):
        total[metric] += foldResult[metric]

# averages over the folds
total = [metricSum / len(cv_results) for metricSum in total]

print("Average: ", total)

Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Average:  [0.8007384140998066, 0.8005714285714285, 0.8005548081465619]


Example review encoding before being passed to predictLabel:

`({'least': 1, 'think': 1, 'product': 1, 'save': 1, 'day': 1, 'keep': 1, 'around': 1, 'case': 1, 'need': 1, 'someth': 1, 'targus': 1, 'pauk10u': 1, 'ultra': 1, 'mini': 1, 'usb': 1, 'keypad': 1, 'black': 1, 'rating': '4', 'verified': 'N'}, '__label1__')`

# Results

> Average:  [0.8007384140998066, 0.8005714285714285, 0.8005548081465619]

Here we saw a huge improvement in the fscore and predictive power. The conclusion from this is that the words alone do not have much predictive power in terms of predicting whether a review is fake. Whilst we still can predict better than chance with words alone (0.6 vs 0.5) it seems the majority of the predictive power comes from the other categories such as rating and verified purchase. This is likely because the fake reviews are generally quite good (although not perfect) at producing realistic textual features. The fake reviews are likely written mostly by real people hired to do so, rather than computer generated.

I was interested to check my intuitions that fake reviews would have a disproportionately high share of 5 star ratings, and also that they were most likely non-verified purchases. As such I looked at these features as counts in the data provided by Amazon.

In [44]:
# layout of each rawData row (as appended by loadDataQ5):
# row[2] = label
# row[3] = rating
# row[4] = verified
# row[5] = product title

# rating string -> number of reviews with that rating, per label
ratingsFake = {}
ratingsReal = {}

# verified flag ("Y"/"N") -> number of reviews, per label
verifiedFake = {"Y": 0, "N": 0}
verifiedReal = {"N": 0, "Y": 0}

for line in rawData:
    # pick the pair of counters that belongs to this row's label
    if line[2] == "__label2__": # real
        ratingCounts, verifiedCounts = ratingsReal, verifiedReal
    elif line[2] == "__label1__": # fake
        ratingCounts, verifiedCounts = ratingsFake, verifiedFake
    else:
        continue # rows with any other label are not counted

    # only the flags "Y" and "N" are counted
    if line[4] in verifiedCounts:
        verifiedCounts[line[4]] += 1
    ratingCounts[line[3]] = ratingCounts.get(line[3], 0) + 1

# .get(..., 0) so that a rating which never occurs for a label prints 0 instead of raising KeyError
print("5 star reviews        Fake: ", ratingsFake.get("5", 0), " Real: ", ratingsReal.get("5", 0))
print("4 star reviews        Fake: ", ratingsFake.get("4", 0), " Real: ", ratingsReal.get("4", 0))
print("3 star reviews        Fake: ", ratingsFake.get("3", 0), "  Real: ", ratingsReal.get("3", 0))
print("2 star reviews        Fake: ", ratingsFake.get("2", 0), "  Real: ", ratingsReal.get("2", 0))
print("1 star reviews        Fake: ", ratingsFake.get("1", 0), "  Real: ", ratingsReal.get("1", 0))
print("Unverified purchases  Fake: ", verifiedFake["N"], " Real: ", verifiedReal["N"])
print("Verified purchases    Fake: ", verifiedFake["Y"], " Real: ", verifiedReal["Y"])
    

5 star reviews        Fake:  6059  Real:  6151
4 star reviews        Fake:  1999  Real:  1974
3 star reviews        Fake:  926   Real:  942
2 star reviews        Fake:  627   Real:  565
1 star reviews        Fake:  889   Real:  868
Unverified purchases  Fake:  7623  Real:  1679
Verified purchases    Fake:  2877  Real:  8821


|  | Fake | Real |
|--|------|------|
|5 star reviews|6059|6151|
|4 star reviews|1999|1974|
|3 star reviews|926|942|
|2 star reviews|627|565|
|1 star reviews|889|868|
|Unverified purchases|7623|1679|
|Verified purchases|2877|8821|

Contrary to our intuition, the number of 5 star reviews is roughly equivalent in both the fake and real labels, and the counts are also roughly equivalent across all other ratings. It seems like this alone may not predict much, although there may be some interactions with other features.

We can see clearly that the fake reviews contain a much larger number of unverified purchases. Similarly, the real reviews are much more likely to have verified purchases. This is likely the best indicator in the dataset.