In [1]:
import csv,re,nltk                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np

In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path, 'r',encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "DOC_ID":  # skip the header
                continue
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))


def splitData(percentage):
    # A method to split the data between trainData and testData 
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

# Question 1

In [3]:
def parseReview(reviewLine):
    
    if reviewLine[1]  == '__label1__':
        reviewLine[1] = 'fake'
    else :
        reviewLine[1] = 'real'
     
    return reviewLine[0], reviewLine[8], reviewLine[1] #returns a triple of an doc id,review and if the review is real or not base on its position in the dataset 

In [4]:
def preProcess(text):
    # word tokenisation
    # separate out words and strings of punctuation into separate white spaced words
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text) #remove
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)
    tokens = re.split(r"\s+",text)#split the regular expression by the white space
    text = re.sub(r"(\S)\1\1+",r"\1\1\1", text) #normalisation
    tokens = [t.lower() for t in tokens] 
    tokens = word_tokenize(text)
    #tokens = [WordNetLemmatizer().lemmatize(t) for t in tokens] 
    tokens = [nltk.stem.SnowballStemmer('english').stem(t) for t in tokens]   
    tokens = [t for t in tokens if t not in set(stopwords.words('english'))]

    return tokens

# Question 2

In [5]:
featureDict = {}
def toFeatureVector(words): # turn into feature vectors
    v = {}
    for w in words:
        try:
            v[w] += (1.0)
        except KeyError:
            v[w] = (1.0)
    return v

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

# Question 3

In [7]:
def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    foldSize = int(len(dataset)/folds)
    # DESCRIBE YOUR METHOD IN WORDS
    for i in range(0,len(dataset),foldSize):
        trainFolds = dataset[i:i+foldSize]
        validationFold = dataset[:i] + dataset[i+foldSize:]
        classifier = trainClassifier(trainFolds)
        truth = [x[1] for x in validationFold]
        pred = predictLabels(validationFold,classifier)
        cv_results.append(precision_recall_fscore_support(truth, pred, average='weighted'))
    
    return cv_results


In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(text, classifier):
    return classifier.classify(toFeatureVector(preProcess(text)))


In [9]:
# MAIN
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
# loading reviews
# initialize global lists that will be appended to by the methods below
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
trainData = []        # the pre-processed training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the pre-processed test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

# QUESTION 3 - Make sure there is a function call here to the
# crossValidate function on the training set to get your results
cv_results = crossValidate(trainData, 10) 
print ("CV_RESULTS", cv_results)

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
43810
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
CV_RESULTS [(0.6107905166871425, 0.6103174603174604, 0.6099010465298379, None), (0.6087323187072666, 0.6078042328042328, 0.6070118201128015, None), (0.6154145081731172, 0.6111111111111112, 0.607803604804734, None), (0.6077956340944155, 0.6068121693121693, 0.6060532699946742, None), (0.615470909374095, 0.6154100529100529, 0.615342242752715, None), (0.6130612448850602, 0.6130291005291005, 0.6130109288977214, None), (0.6095573329519944, 0.6060846560846561, 0.6031428855193177, None), (0.6187950105244348, 0.6177248677248677, 

# Evaluate on test set

In [10]:
# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0])   # have a look at the first test data instance
    classifier = trainClassifier(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])

({'this': 0.08, 'assortment': 0.04, 'is': 0.04, 'really': 0.04, 'hershey': 0.04, "'": 0.04, 's': 0.04, 'at': 0.04, 'their': 0.04, 'best': 0.04, '.': 0.08, 'the': 0.08, 'little': 0.04, 'ones': 0.04, 'are': 0.04, 'always': 0.04, 'excited': 0.04, 'whenever': 0.04, 'holidays': 0.04, 'come': 0.04, 'because': 0.04, 'of': 0.04}, 'fake')
Training Classifier...
Done training!
Precision: 0.637710
Recall: 0.637619
F Score:0.637559


# Questions 4 and 5
Once you're happy with your functions for Questions 1 to 3, it's advisable you make a copy of this notebook to make a new notebook, and then within it adapt and improve all three functions in the ways asked for in questions 4 and 5.