In [1]:
import csv,re                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
import numpy as np

In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path, 'r',encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "DOC_ID":  # skip the header
                continue
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))


def splitData(percentage):
    # A method to split the data between trainData and testData 
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [3]:
def parseReview(reviewLine):
    
    if reviewLine[1]  == '__label1__':
        reviewLine[1] = 'fake'
    else :
        reviewLine[1] = 'real'
     
    #print("Doc,id,true :",reviewLine[0], reviewLine[8], reviewLine[1])
    return reviewLine[0], reviewLine[8], reviewLine[1] #returns a triple of an doc id,review and if the review is real or not base on its position in the dataset 

In [4]:
def preProcess(text):
    # word tokenisation
    # separate out words and strings of punctuation into separate white spaced words
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)
    tokens = re.split(r"\s+",text)#split the regular expression by the white space
    text = re.sub(r"(\S)\1\1+",r"\1\1\1", text) #normalisation
    tokens = [t.lower() for t in tokens]
    return tokens

In [5]:
featureDict = {}
def toFeatureVector(words): # turn into feature vectors
    v = {}
    for w in words:
#         try:
#             i = featureDict[w]
#         except KeyError:
#             i = len(featureDict) + 1
#             featureDict[w] = i
        try:
            v[w] += (1.0)
        except KeyError:
            v[w] = (1.0)
    return v

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [7]:
def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    foldSize = int(len(dataset)/folds)
    # DESCRIBE YOUR METHOD IN WORDS
    for i in range(0,len(dataset),foldSize):
        trainFolds = dataset[i:i+foldSize]
        validationFold = dataset[:i] + dataset[i+foldSize:]
        classifier = trainClassifier(trainFolds)
        truth = [x[1] for x in validationFold]
        pred = predictLabels(validationFold,classifier)
        cv_results.append(precision_recall_fscore_support(truth, pred, average='weighted'))
    
    return cv_results


In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(text, classifier):
    return classifier.classify(toFeatureVector(preProcess(text)))


In [9]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# loading reviews
rawData = []         
trainData = []        
testData = []         

fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'


print("Now %d rawData, %d trainData" % (len(rawData), len(trainData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
#print(trainData[0])
print("Now %d rawData, %d trainData" % (len(rawData), len(trainData)),
      "Preparing the dataset...",sep='\n')

splitData(0.8)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData" % (len(rawData), len(trainData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')
print(trainData[0])

Now 0 rawData, 0 trainData
Preparing the dataset...
Now 21000 rawData, 0 trainData
Preparing the dataset...
Now 21000 rawData, 16800 trainData
Training Samples: 
16800
Features: 
0
({'when': 1.0, 'least': 1.0, 'you': 2.0, 'think': 1.0, 'so': 1.0, ',': 1.0, 'this': 1.0, 'product': 1.0, 'will': 1.0, 'save': 1.0, 'the': 1.0, 'day': 1.0, '.': 2.0, 'just': 2.0, 'keep': 1.0, 'it': 2.0, 'around': 1.0, 'in': 1.0, 'case': 1.0, 'need': 1.0, 'for': 1.0, 'something': 1.0}, 'fake')


In [10]:
crossValidate(trainData, 10) 

Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...


[(0.5758686804241684, 0.5757936507936507, 0.5757477184065353, None),
 (0.579949251220374, 0.5798280423280423, 0.5796647668416823, None),
 (0.5910946735777177, 0.5909391534391535, 0.5908352893446083, None),
 (0.5960887986209036, 0.596031746031746, 0.5959794920698382, None),
 (0.5906387603575606, 0.5905423280423281, 0.5904688606149482, None),
 (0.5839593516033366, 0.5839285714285715, 0.5838825845333073, None),
 (0.5798502698505095, 0.5796957671957672, 0.579610634690216, None),
 (0.5866602971175876, 0.5866402116402116, 0.5866376802829733, None),
 (0.5970496902336017, 0.5968915343915344, 0.5967401324931128, None),
 (0.59346253772726, 0.593452380952381, 0.5933934593524084, None)]