In [1]:
import csv,re                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

In [2]:
def loadData(path, Text=None):
    with open(path, 'r',encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)
        for line in reader:
            (Id, Text, Rating, Verified, Category, Label) = parseReview(line)
            rawData.append((Id, Text, Rating, Verified, Category, Label))

def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    
    for (_, Text, Rating, Verified, Category, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        other_features = toFeatureVector(preProcess(Text))
        other_features.update({'Rating':Rating, 'Verified':Verified, 'Category':Category}) 
        trainData.append((other_features, Label))
    for (_, Text, Rating, Verified, Category, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        other_features = toFeatureVector(preProcess(Text))
        other_features.update({'Rating':Rating, 'Verified':Verified, 'Category':Category}) 
        testData.append((other_features, Label))

# Question 1

In [3]:
def parseReview(reviewLine):
    
    Id = reviewLine[0]
    Text = reviewLine[8]
    Label = reviewLine[1]
    Rating = reviewLine[2] 
    Verified = reviewLine[3]
    Category = reviewLine[4]
   
    return (Id, Text, Rating, Verified, Category, Label)
# the rating chosen as maybe relation with fake reviews and the rating given
# if the reviewer was verified to help with classification and improve scores
# the category as also maybe higher fake reviews in specific categories
    

In [4]:
def preProcess(text):
    # word tokenisation
    # separate out words and strings of punctuation into separate white spaced words
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text) #remove
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)
    tokens = re.split(r"\s+",text)#split the regular expression by the white space
    text = re.sub(r"(\S)\1\1+",r"\1\1\1", text) #normalisation
    tokens = [t.lower() for t in tokens] 
    tokens = word_tokenize(text)
    tokens = [WordNetLemmatizer().lemmatize(t) for t in tokens] 
    return tokens

# Question 2

In [5]:
featureDict = {} # 
def toFeatureVector(words):     # returns a dictionary where the features as keys, and weights as values
    v = {}
    for w in words:
        try:
            featureDict[w] += 1
        except KeyError:            
            featureDict[w] = 1
        try:
            v[w] += (1.0/len(words))
        except KeyError:
            v[w] = (1.0/len(words))

    return v
# returns a dictionary with the features as keys, and weights as values where the key is not 
#in the dictionary add it and give it a wiehgt of 1, which improved the finalscore from the previous method of 
# just adding to the word to the dictionary with no weight of the actual word

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

# Question 3

In [7]:
def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    foldSize = int(len(dataset)/folds)
    # DESCRIBE YOUR METHOD IN WORDS
    for i in range(0,len(dataset),foldSize):
        trainFolds = dataset[i:i+foldSize]
        validationFold = dataset[:i] + dataset[i+foldSize:]
        classifier = trainClassifier(trainFolds)
        truth = [x[1] for x in validationFold]
        pred = predictLabels(validationFold,classifier)
        cv_results.append(precision_recall_fscore_support(truth, pred, average='weighted'))
    
    return cv_results


In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [9]:
# MAIN
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
# loading reviews
# initialize global lists that will be appended to by the methods below
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
trainData = []        # the pre-processed training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the pre-processed test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

# QUESTION 3 - Make sure there is a function call here to the
# crossValidate function on the training set to get your results
crossValidate(trainData, 10) # perform 10 folds using the crossvalidate method

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
44875
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...


[(0.780747288730837, 0.7780423280423281, 0.7774963285934446, None),
 (0.7790467775429123, 0.7767857142857143, 0.7763492805968027, None),
 (0.7834211952077138, 0.7816137566137566, 0.7812554053542361, None),
 (0.7806212174546636, 0.7773148148148148, 0.77665470019054, None),
 (0.7818267889799343, 0.7786375661375662, 0.7780503524264382, None),
 (0.7810719762436629, 0.779431216931217, 0.7790789901893914, None),
 (0.7815751697760855, 0.7803571428571429, 0.7801338355367099, None),
 (0.7829724289404056, 0.7804232804232805, 0.779965487086853, None),
 (0.7793396312025964, 0.7784391534391535, 0.778222158862791, None),
 (0.7792295943184618, 0.776521164021164, 0.7759860621854467, None)]

# Evaluate on test set

In [10]:
# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0])   # have a look at the first test data instance
    classifier = trainClassifier(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])

({'This': 0.04, 'assortment': 0.04, 'is': 0.04, 'really': 0.04, 'Hershey': 0.04, "'": 0.04, 's': 0.04, 'at': 0.04, 'their': 0.04, 'best': 0.04, '.': 0.08, 'The': 0.04, 'little': 0.04, 'one': 0.04, 'are': 0.04, 'always': 0.04, 'excited': 0.04, 'whenever': 0.04, 'the': 0.04, 'holiday': 0.04, 'come': 0.04, 'because': 0.04, 'of': 0.04, 'this': 0.04, 'Rating': '5', 'Verified': 'N', 'Category': 'Grocery'}, '__label1__')
Training Classifier...
Done training!
Precision: 0.805210
Recall: 0.801190
F Score:0.800534


# Questions 4 and 5
Once you're happy with your functions for Questions 1 to 3, it's advisable you make a copy of this notebook to make a new notebook, and then within it adapt and improve all three functions in the ways asked for in questions 4 and 5.