In [1]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer # Need to run nltk.download('wordnet') from python console first
from IPython.display import display
from nltk.corpus import stopwords # Need to run nltk.download('stopwords') from python console first
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from collections import Counter
import numpy
import string

In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path, encoding="utf8") as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)
        for line in reader:
            (Id, Text, Rating, Verified, ProductID, Label) = parseReview(line)
            rawData.append((Id, Text, Rating, Verified, ProductID, Label))
            preprocessedData.append((Id, preProcess(Text, Rating, Verified, ProductID), Label))
        
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Rating, Verified, ProductID, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text, Rating, Verified, ProductID)),Label))
    for (_, Text, Rating, Verified, ProductID, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text, Rating, Verified, ProductID)),Label))

In [3]:
# QUESTION 1

def parseReview(reviewLine):
    ID = 0
    TEXT = 8
    LABEL = 1
    RATING = 2
    VERIFIED_PURCHASE = 3
    PRODUCT_ID = 5
    tuple = (int(reviewLine[ID]), reviewLine[TEXT], int(reviewLine[RATING]), True if reviewLine[VERIFIED_PURCHASE] == 'Y' else False, reviewLine[PRODUCT_ID], fakeLabel if reviewLine[LABEL]=='__label1__' else realLabel)
    return tuple


In [4]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION

# Input: a string of one review
def preProcess(text, rating, verified, product_id):
    # Should return a list of tokens
    tokens = text.split(" ")
    
    # Stemming and lemmatisation
    porter_stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    
    
    # Removing the stop word. You will need to run nltk.download("stopwords") in python console.
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in tokens if not w in stop_words]
    filtered_tokens = []
    for w in tokens:
        if w not in stop_words:
            filtered_tokens.append(w)
    
    filtered_tokens = list(map(lambda x : porter_stemmer.stem(wordnet_lemmatizer.lemmatize(x)), filtered_tokens))
    filtered_tokens = list(map(lambda x : x.translate(string.punctuation), filtered_tokens))
    
    return (filtered_tokens, rating, verified, product_id)
#     return filtered_tokens
#     return tokens

In [5]:
# QUESTION 2
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    featurevect = dict(Counter(tokens[0]))
    featurevect['rating'] = tokens[1]
    featurevect['verified'] = int(tokens[2])
    featurevect[tokens[3]] = 1  
#     print(dict(Counter(tokens[0])))
#     print(featurevect)
    return  featurevect

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [7]:
# QUESTION 3

# Get the weight of the category out of all the categories.
def getCatWeight(cat):
    weight = dict(Counter(categories))
    return weight[cat]
    

def crossValidate(dataset, folds): 
    shuffle(dataset)
#     for x in range(0,len(dataset)):
#         cat = ''
#         for key in list(dataset[x][0].keys()):
#             if key in categories:
#                 cat = key
        
#         dataset[x][0][cat] = getCatWeight(cat)
#     print("Weighted assigned.")
    cv_results = []
    foldSize = int(len(dataset)/folds) # How much is the size of one fold?
    #print(foldSize)
    for i in range(0,len(dataset),foldSize):
        testFold = dataset[i:i+foldSize] # This our fold used for testing.
        trainingData = dataset[0:i] + dataset[foldSize:]
        classifier = trainClassifier(trainingData)
        y_pred = predictLabels(testFold, classifier)
        y_true = list(map(lambda t: t[1], testFold))
        results = list(precision_recall_fscore_support(y_true, y_pred, average='weighted'))
        results[3] = accuracy_score(y_true, y_pred) * 100
        cv_results.append(tuple(results))
        
    return cv_results

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [9]:
# MAIN

# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath)
# print(categories)

# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')
cvResults = crossValidate(trainData, 10)
display(cvResults)

# print("More analysis:")
# fakes = list(filter(lambda x: x[5] == "fake", rawData))
# reals = list(filter(lambda x: x[5] == "real", rawData))
# fakesTotalLength = 0
# realsTotalLength = 0

# for x in fakes:
#     fakesTotalLength += len(x[1])
    
# for x in reals:
#     realsTotalLength += len(x[1])

# print("Average fake review length: %f" % float(fakesTotalLength / len(fakes)))
# print("Average real review length: %f" % float(realsTotalLength / len(reals)))

print("Done!")

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
0
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...


[(0.77385042483067235,
  0.77380952380952384,
  0.77382010426013148,
  77.38095238095238),
 (1.0, 1.0, 1.0, 100.0),
 (1.0, 1.0, 1.0, 100.0),
 (1.0, 1.0, 1.0, 100.0),
 (1.0, 1.0, 1.0, 100.0),
 (1.0, 1.0, 1.0, 100.0),
 (1.0, 1.0, 1.0, 100.0),
 (1.0, 1.0, 1.0, 100.0),
 (1.0, 1.0, 1.0, 100.0),
 (1.0, 1.0, 1.0, 100.0)]

Done!
