In [16]:
import csv,re                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
import numpy as np

In [17]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path, 'r',encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "DOC_ID":  # skip the header
                continue
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))


def splitData(percentage):
    # A method to split the data between trainData and testData 
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [18]:
def parseReview(reviewLine):
    
    if reviewLine[1]  == '__label1__':
        reviewLine[1] = 'fake'
    else :
        reviewLine[1] = 'real'
     
    return reviewLine[0], reviewLine[8], reviewLine[1] 

In [19]:
# Use the labels('__label1__' or '__label2__') given in reviewLine[1] to tell if the review is fake.
#Return a triple of an doc id,review and if the review is real or not base on its position in the dataset 

In [20]:
def preProcess(text):
    # word tokenisation
    # separate out words and strings of punctuation into separate white spaced words
    text = re.sub(r"(\w)([.,;:!?\"”\)])", r"\1 \2", text)
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)
    tokens = re.split(r"\s+",text)#split the regular expression by the white space
    text = re.sub(r"(\S)\1\1+",r"\1\1\1", text) 
    tokens = [t.lower() for t in tokens]
    return tokens

In [21]:
# Use regex to tokenise the words, the regex is used to split words based on punctuation after the word
# The second regex is used to split the punctuation before the word
# Split the words based on white space
# Tokenize and convert to lower case

In [22]:
featureDict = {}
def toFeatureVector(words): # turn into feature vectors 
    v = {}
    for w in words:
        try:
            v[w] += (1.0)
        except KeyError:
            v[w] = (1.0)
    return v

In [23]:
#Create a dictionary that gives each word a weight of +1 everytime it is seen. 
#If i word is not seen add it to the dictionary with a weight of +1

In [24]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [25]:
def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    foldSize = int(len(dataset)/folds)
    # DESCRIBE YOUR METHOD IN WORDS
    for i in range(0,len(dataset),foldSize):
        trainFolds = dataset[i:i+foldSize]
        validationFold = dataset[:i] + dataset[i+foldSize:]
        classifier = trainClassifier(trainFolds)
        truth = [x[1] for x in validationFold]
        pred = predictLabels(validationFold,classifier)
        cv_results.append(precision_recall_fscore_support(truth, pred, average='weighted'))
    
    return cv_results


In [26]:
# Foldsize is the created as a integer based on the size of the dataset divided by the fold given as 10 in the main
#using the k*I-fold cross-validation is used with both validation and train set
#create a classifer using a function tainclassifer cased on the trainfold
#create a prediction variable using the predictLabels function on the validationfold  
#append the groundtruth and prediction variables to the fscore

In [27]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(text, classifier):
    return classifier.classify(toFeatureVector(preProcess(text)))


In [28]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# loading reviews
rawData = []         
trainData = []        
testData = []         

fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'


print("Now %d rawData, %d trainData" % (len(rawData), len(trainData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
#print(trainData[0])
print("Now %d rawData, %d trainData" % (len(rawData), len(trainData)),
      "Preparing the dataset...",sep='\n')

splitData(0.8)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData" % (len(rawData), len(trainData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')
crossValidate(trainData, 10) # perform 10 folds using the crossvalidate method

Now 0 rawData, 0 trainData
Preparing the dataset...
Now 21000 rawData, 0 trainData
Preparing the dataset...
Now 21000 rawData, 16800 trainData
Training Samples: 
16800
Features: 
0
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...


[(0.5947052014793951, 0.5944444444444444, 0.59423371092985, None),
 (0.5927283533269315, 0.5927248677248678, 0.5927239555990296, None),
 (0.5837131745073534, 0.5836640211640212, 0.5836559954015762, None),
 (0.5876364051724503, 0.5876322751322751, 0.5876329804071146, None),
 (0.5811965244504679, 0.580952380952381, 0.5806834856188544, None),
 (0.5941764877074379, 0.5941798941798941, 0.5941751864118677, None),
 (0.5878453582805098, 0.5875661375661375, 0.5873263174891926, None),
 (0.5967001121330788, 0.5966931216931217, 0.5966735527054605, None),
 (0.5841444588852223, 0.5841269841269842, 0.5840976870681831, None),
 (0.5831684909080361, 0.5831349206349207, 0.5831079267165358, None)]

In [29]:
# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0])   # have a look at the first test data instance
    classifier = trainClassifier(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])

({'this': 2.0, 'assortment': 1.0, 'is': 1.0, 'really': 1.0, "hershey'": 1.0, 's': 1.0, 'at': 1.0, 'their': 1.0, 'best': 1.0, '.': 2.0, 'the': 2.0, 'little': 1.0, 'ones': 1.0, 'are': 1.0, 'always': 1.0, 'excited': 1.0, 'whenever': 1.0, 'holidays': 1.0, 'come': 1.0, 'because': 1.0, 'of': 1.0}, 'fake')
Training Classifier...
Done training!
Precision: 0.605029
Recall: 0.605000
F Score:0.604973
