## Bigram Implementation

To use Bigrams instead of Unigrams I had to make some changes to how the vocabulary was built. In the cell below it can be seen that the tokens are now appended as pairs of strings instead of single strings

In [4]:
# test "get_documents()"

def get_document_preview(document, max_length = 72):
    s = []
    count = 0
    reached_limit = False
    for sentence in document:
        i = 0
        
        # This while loop will ensure that we append pairs of strings as our tokens rather than single strings
        while (i < len(sentence) - 1):
            token = sentence[i] + ' ' + sentence[i+1]
            if count + len(token) + len(s) > max_length:
                reached_limit = True
                break

            s.append(token)
            count += len(token)
            i+=1
        if reached_limit:
            break
    return '|'.join(s)
    
for label in 'pos neg'.split():
    print(f'== {label} ==')
    print('doc sentences start of first sentence')
    for index, document in enumerate(data_loader.get_documents(
        label = label
    )):
        print('%3d %7d   %s' %(
            index, len(document), get_document_preview(document)
        ))
        if index == 4:
            break

== pos ==
doc sentences start of first sentence
  0      25   films adapted|adapted from|from comic|comic books|books have|have had
  1      39   every now|now and|and then|then a|a movie|movie comes|comes along
  2      19   you've got|got mail|mail works|works alot|alot better|better than
  3      42   " jaws|jaws "|" is|is a|a rare|rare film|film that|that grabs|grabs your
  4      25   moviemaking is|is a|a lot|lot like|like being|being the|the general
== neg ==
doc sentences start of first sentence
  0      35   plot :|: two|two teen|teen couples|couples go|go to|to a|a church
  1      13   the happy|happy bastard's|bastard's quick|quick movie|movie review
  2      23   it is|is movies|movies like|like these|these that|that make|make a
  3      19   " quest|quest for|for camelot|camelot "|" is|is warner|warner bros
  4      37   synopsis :|: a|a mentally|mentally unstable|unstable man|man undergoing


In [5]:
# test "get_xval_splits()"

splits = data_loader.get_xval_splits()

print('tr-size te-size (number of documents)')
for xval_tr_data, xval_te_data in splits:
    print('%7d %7d' %(len(xval_tr_data), len(xval_te_data)))

tr-size te-size (number of documents)
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200


In [6]:
class PolarityPredictorInterface:

    def train(self, data_with_labels):
        raise NotImplementedError
        
    def predict(self, data):
        raise NotImplementedError

Again, this version of the PolarityPredictorWithVocabulary class will now append pairs of strings instead of single strings

In [7]:
class PolarityPredictorWithVocabulary(PolarityPredictorInterface):
    
    def train(self, data_with_labels):
        self.reset_vocab()
        self.add_to_vocab_from_data(data_with_labels)
        self.finalise_vocab()
        tr_features = self.extract_features(
            data_with_labels
        )
        tr_targets = self.get_targets(data_with_labels)
        self.train_model_on_features(tr_features, tr_targets)
        
    def reset_vocab(self):
        self.vocab = set()
        
    def add_to_vocab_from_data(self, data):
        for document, label in data:
            for sentence in document:
                i = 0
                while (i < len(sentence) - 1):
                    token = sentence[i] + ' ' + sentence[i+1]
                    self.vocab.add(token)
                    i+=1

    def finalise_vocab(self):
        self.vocab = list(self.vocab)
        # create reverse map for fast token lookup
        self.token2index = {}
        for index, token in enumerate(self.vocab):
            self.token2index[token] = index
        
    def extract_features(self, data):
        raise NotImplementedError
    
    def get_targets(self, data, label2index = None):
        raise NotImplementedError
        
    def train_model_on_features(self, tr_features, tr_targets):
        raise NotImplementedError

In [8]:
import numpy

class PolarityPredictorWithBagOfWords_01(PolarityPredictorWithVocabulary):
    
    def __init__(self, clip_counts = True):
        self.clip_counts = clip_counts
        
    def extract_features(self, data):
        # create numpy array of required size
        columns = len(self.vocab)
        rows = len(data)
        features = numpy.zeros((rows, columns), dtype=numpy.int32)        
        # populate feature matrix
        for row, item in enumerate(data):
            document, _ = item
            for sentence in document:

                i = 0
                while (i < len(sentence)-1):
                    token = sentence[i] + ' ' + sentence[i+1]
                    i+=1

                    try:
                        index = self.token2index[token]
                    except KeyError:
                        # token not in vocab
                        # --> skip this token
                        # --> continue with next token
                        continue
                    if self.clip_counts:
                        features[row, index] = 1
                    else:
                        features[row, index] += 1

        return features

In [9]:
class PolarityPredictorWithBagOfWords(PolarityPredictorWithBagOfWords_01):
 
    def get_targets(self, data):
        ''' create column vector with target labels
        '''
        # prepare target vector
        targets = numpy.zeros(len(data), dtype=numpy.int8)
        index = 0
        for _, label in data:
            if label == 'pos':
                targets[index] = 1
            index += 1
        return targets

    def train_model_on_features(self, tr_features, tr_targets):
        raise NotImplementedError

## Naive Bayes

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

class PolarityPredictorBowNB(PolarityPredictorWithBagOfWords):

    def train_model_on_features(self, tr_features, tr_targets):
        # pass numpy array to sklearn to train NB
        self.model = MultinomialNB()
        self.model.fit(tr_features, tr_targets)
        
    def predict(
        self, data, get_accuracy = False,
        get_confusion_matrix = False
    ):
        features = self.extract_features(data)
        # use numpy to get predictions
        y_pred = self.model.predict(features)
        # restore labels
        labels = []
        for is_positive in y_pred:
            if is_positive:
                labels.append('pos')
            else:
                labels.append('neg')
        if get_accuracy or get_confusion_matrix:
            retval = []
            retval.append(labels)
            y_true = self.get_targets(data)
            if get_accuracy:
                retval.append(
                    metrics.accuracy_score(y_true, y_pred)
                )
            if get_confusion_matrix:
                retval.append(
                    metrics.confusion_matrix(y_true, y_pred)
                )
            return retval
        else:
            return labels

In [11]:
# first functionality test

model = PolarityPredictorBowNB()
model.train(splits[0][0]) 

In [12]:
def print_first_predictions(model, te_data, n = 12):
    predictions = model.predict(te_data)
    for i in range(n):
        document, label = te_data[i]
        prediction = predictions[i]
        print('%4d %s %s %s' %(
            i, label, prediction,
            get_document_preview(document),
        ))
    
print_first_predictions(model, splits[0][1])

   0 pos pos films adapted|adapted from|from comic|comic books|books have|have had
   1 pos pos every now|now and|and then|then a|a movie|movie comes|comes along
   2 pos pos you've got|got mail|mail works|works alot|alot better|better than
   3 pos pos " jaws|jaws "|" is|is a|a rare|rare film|film that|that grabs|grabs your
   4 pos neg moviemaking is|is a|a lot|lot like|like being|being the|the general
   5 pos pos on june|june 30|30 ,|, 1960|1960 ,|, a|a self-taught|self-taught ,
   6 pos pos apparently ,|, director|director tony|tony kaye|kaye had|had a|a major
   7 pos pos one of|of my|my colleagues|colleagues was|was surprised|surprised when
   8 pos pos after bloody|bloody clashes|clashes and|and independence
   9 pos pos the american|american action|action film|film has|has been|been slowly
  10 pos pos after watching|watching "|" rat|rat race|race "|" last|last week|week ,
  11 pos pos i've noticed|noticed something|something lately|lately that|that i've


In [13]:
labels, accuracy, confusion_matrix = model.predict(
    splits[0][1], get_accuracy = True, get_confusion_matrix = True
)

print(accuracy)
print(confusion_matrix)

0.815
[[78 22]
 [15 85]]


In [14]:
def evaluate_model(model, splits, verbose = False):
    accuracies = []
    f1s = []
    fold = 0
    for tr_data, te_data in splits:
        if verbose:
            print('Evaluating fold %d of %d' %(fold+1, len(splits)))
            fold += 1
        model.train(tr_data)
        _, accuracy, confusion_matrix = model.predict(te_data, get_accuracy = True, get_confusion_matrix = True)
        
        tp, fp, fn, tn = confusion_matrix[0][0], confusion_matrix[0][1], confusion_matrix[1][0], confusion_matrix[1][1]
        prec = tp/(tp + fp)
        rec = tp/(tp + fn)
        f1 = (2*prec*rec)/(prec+rec)
        
        accuracies.append(accuracy)
        f1s.append(f1)
        if verbose:
            print('Accuracy -->', accuracy)
            print('Precision -->', prec)
            print('Recall -->', rec)
            print('F1 -->', f1)
            print()
    n = float(len(accuracies))
    avg = sum(f1s) / n
    mse = sum([(x-avg)**2 for x in accuracies]) / n
    return (avg, mse**0.5, min(f1s),
            max(f1s))

# this takes about 3 minutes
print(evaluate_model(model, splits, verbose = True))

Evaluating fold 1 of 10
Accuracy --> 0.815
Precision --> 0.78
Recall --> 0.8387096774193549
F1 --> 0.8082901554404146

Evaluating fold 2 of 10
Accuracy --> 0.88
Precision --> 0.87
Recall --> 0.8877551020408163
F1 --> 0.8787878787878789

Evaluating fold 3 of 10
Accuracy --> 0.855
Precision --> 0.85
Recall --> 0.8585858585858586
F1 --> 0.8542713567839195

Evaluating fold 4 of 10
Accuracy --> 0.87
Precision --> 0.84
Recall --> 0.8936170212765957
F1 --> 0.8659793814432989

Evaluating fold 5 of 10
Accuracy --> 0.83
Precision --> 0.8
Recall --> 0.851063829787234
F1 --> 0.8247422680412372

Evaluating fold 6 of 10
Accuracy --> 0.845
Precision --> 0.78
Recall --> 0.896551724137931
F1 --> 0.8342245989304813

Evaluating fold 7 of 10
Accuracy --> 0.865
Precision --> 0.82
Recall --> 0.9010989010989011
F1 --> 0.8586387434554974

Evaluating fold 8 of 10
Accuracy --> 0.855
Precision --> 0.79
Recall --> 0.9080459770114943
F1 --> 0.8449197860962566

Evaluating fold 9 of 10
Accuracy --> 0.84
Precision --

This F1 score of **0.845** was an improvement over the Baseline NB implementation

## Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

class PolarityPredictorBowLR(PolarityPredictorWithBagOfWords):

    def train_model_on_features(self, tr_features, tr_targets):
        # pass numpy array to sklearn to train Logistic Regression
        # iterations set to 1000 as default of 100 didn't guarantee convergence with our data
        self.model = LogisticRegression(max_iter=1000)
        self.model.fit(tr_features, tr_targets)
        
    def predict(
        self, data, get_accuracy = False,
        get_confusion_matrix = False
    ):
        features = self.extract_features(data)
        # use numpy to get predictions
        y_pred = self.model.predict(features)
        # restore labels
        labels = []
        for is_positive in y_pred:
            if is_positive:
                labels.append('pos')
            else:
                labels.append('neg')
        if get_accuracy or get_confusion_matrix:
            retval = []
            retval.append(labels)
            y_true = self.get_targets(data)
            if get_accuracy:
                retval.append(
                    metrics.accuracy_score(y_true, y_pred)
                )
            if get_confusion_matrix:
                retval.append(
                    metrics.confusion_matrix(y_true, y_pred)
                )
            return retval
        else:
            return labels

In [16]:
model = PolarityPredictorBowLR()
model.train(splits[0][0]) 

In [17]:
print_first_predictions(model, splits[0][1])

   0 pos pos films adapted|adapted from|from comic|comic books|books have|have had
   1 pos pos every now|now and|and then|then a|a movie|movie comes|comes along
   2 pos neg you've got|got mail|mail works|works alot|alot better|better than
   3 pos pos " jaws|jaws "|" is|is a|a rare|rare film|film that|that grabs|grabs your
   4 pos neg moviemaking is|is a|a lot|lot like|like being|being the|the general
   5 pos pos on june|june 30|30 ,|, 1960|1960 ,|, a|a self-taught|self-taught ,
   6 pos pos apparently ,|, director|director tony|tony kaye|kaye had|had a|a major
   7 pos pos one of|of my|my colleagues|colleagues was|was surprised|surprised when
   8 pos pos after bloody|bloody clashes|clashes and|and independence
   9 pos pos the american|american action|action film|film has|has been|been slowly
  10 pos pos after watching|watching "|" rat|rat race|race "|" last|last week|week ,
  11 pos pos i've noticed|noticed something|something lately|lately that|that i've


In [18]:
labels, accuracy, confusion_matrix = model.predict(
    splits[0][1], get_accuracy = True, get_confusion_matrix = True
)

print(accuracy)
print(confusion_matrix)

0.84
[[87 13]
 [19 81]]


In [19]:
print(evaluate_model(model, splits, verbose = True))

Evaluating fold 1 of 10
Accuracy --> 0.84
Precision --> 0.87
Recall --> 0.8207547169811321
F1 --> 0.8446601941747572

Evaluating fold 2 of 10
Accuracy --> 0.865
Precision --> 0.89
Recall --> 0.8476190476190476
F1 --> 0.8682926829268293

Evaluating fold 3 of 10
Accuracy --> 0.81
Precision --> 0.91
Recall --> 0.7583333333333333
F1 --> 0.8272727272727273

Evaluating fold 4 of 10
Accuracy --> 0.84
Precision --> 0.85
Recall --> 0.8333333333333334
F1 --> 0.8415841584158417

Evaluating fold 5 of 10
Accuracy --> 0.79
Precision --> 0.8
Recall --> 0.7843137254901961
F1 --> 0.792079207920792

Evaluating fold 6 of 10
Accuracy --> 0.83
Precision --> 0.88
Recall --> 0.8
F1 --> 0.8380952380952381

Evaluating fold 7 of 10
Accuracy --> 0.84
Precision --> 0.81
Recall --> 0.8617021276595744
F1 --> 0.8350515463917526

Evaluating fold 8 of 10
Accuracy --> 0.835
Precision --> 0.83
Recall --> 0.8383838383838383
F1 --> 0.8341708542713568

Evaluating fold 9 of 10
Accuracy --> 0.855
Precision --> 0.89
Recall --

This F1 score of **0.84** was worse than the Baseline Logistic Regression

## Decision Tree

In [20]:
from sklearn.tree import DecisionTreeClassifier

class PolarityPredictorBowDT(PolarityPredictorWithBagOfWords):

    def train_model_on_features(self, tr_features, tr_targets):
        # pass numpy array to sklearn to train Logistic Regression
        # iterations set to 1000 as default of 100 didn't guarantee convergence with our data
        self.model = DecisionTreeClassifier()
        self.model.fit(tr_features, tr_targets)
        
    def predict(
        self, data, get_accuracy = False,
        get_confusion_matrix = False
    ):
        features = self.extract_features(data)
        # use numpy to get predictions
        y_pred = self.model.predict(features)
        # restore labels
        labels = []
        for is_positive in y_pred:
            if is_positive:
                labels.append('pos')
            else:
                labels.append('neg')
        if get_accuracy or get_confusion_matrix:
            retval = []
            retval.append(labels)
            y_true = self.get_targets(data)
            if get_accuracy:
                retval.append(
                    metrics.accuracy_score(y_true, y_pred)
                )
            if get_confusion_matrix:
                retval.append(
                    metrics.confusion_matrix(y_true, y_pred)
                )
            return retval
        else:
            return labels

In [21]:
model = PolarityPredictorBowDT()
model.train(splits[0][0])

In [22]:
labels, accuracy, confusion_matrix = model.predict(
    splits[0][1], get_accuracy = True, get_confusion_matrix = True
)

print(accuracy)
print(confusion_matrix)

0.6
[[55 45]
 [35 65]]


In [23]:
print(evaluate_model(model, splits, verbose = True))

Evaluating fold 1 of 10
Accuracy --> 0.59
Precision --> 0.53
Recall --> 0.6022727272727273
F1 --> 0.5638297872340425

Evaluating fold 2 of 10
Accuracy --> 0.56
Precision --> 0.47
Recall --> 0.573170731707317
F1 --> 0.5164835164835165

Evaluating fold 3 of 10
Accuracy --> 0.61
Precision --> 0.71
Recall --> 0.5916666666666667
F1 --> 0.6454545454545454

Evaluating fold 4 of 10
Accuracy --> 0.61
Precision --> 0.57
Recall --> 0.6195652173913043
F1 --> 0.59375

Evaluating fold 5 of 10
Accuracy --> 0.565
Precision --> 0.6
Recall --> 0.5607476635514018
F1 --> 0.5797101449275363

Evaluating fold 6 of 10
Accuracy --> 0.57
Precision --> 0.59
Recall --> 0.5673076923076923
F1 --> 0.5784313725490196

Evaluating fold 7 of 10
Accuracy --> 0.65
Precision --> 0.67
Recall --> 0.6442307692307693
F1 --> 0.6568627450980393

Evaluating fold 8 of 10
Accuracy --> 0.57
Precision --> 0.54
Recall --> 0.574468085106383
F1 --> 0.5567010309278351

Evaluating fold 9 of 10
Accuracy --> 0.565
Precision --> 0.6
Recall -

## Support Vector Machine

In [15]:
from sklearn import svm

class PolarityPredictorBowSVM(PolarityPredictorWithBagOfWords):

    def train_model_on_features(self, tr_features, tr_targets):
        # pass numpy array to sklearn to train Logistic Regression
        # iterations set to 1000 as default of 100 didn't guarantee convergence with our data
        self.model = svm.SVC()
        self.model.fit(tr_features, tr_targets)
        
    def predict(
        self, data, get_accuracy = False,
        get_confusion_matrix = False
    ):
        features = self.extract_features(data)
        # use numpy to get predictions
        y_pred = self.model.predict(features)
        # restore labels
        labels = []
        for is_positive in y_pred:
            if is_positive:
                labels.append('pos')
            else:
                labels.append('neg')
        if get_accuracy or get_confusion_matrix:
            retval = []
            retval.append(labels)
            y_true = self.get_targets(data)
            if get_accuracy:
                retval.append(
                    metrics.accuracy_score(y_true, y_pred)
                )
            if get_confusion_matrix:
                retval.append(
                    metrics.confusion_matrix(y_true, y_pred)
                )
            return retval
        else:
            return labels

In [16]:
model = PolarityPredictorBowSVM()
model.train(splits[0][0])

In [17]:
labels, accuracy, confusion_matrix = model.predict(
    splits[0][1], get_accuracy = True, get_confusion_matrix = True
)

print(accuracy)
print(confusion_matrix)

0.73
[[96  4]
 [50 50]]


In [18]:
print(evaluate_model(model, splits, verbose = True))

Evaluating fold 1 of 10
Accuracy --> 0.73
Precision --> 0.96
Recall --> 0.6575342465753424
F1 --> 0.7804878048780488

Evaluating fold 2 of 10
Accuracy --> 0.755
Precision --> 0.98
Recall --> 0.6758620689655173
F1 --> 0.8

Evaluating fold 3 of 10
Accuracy --> 0.69
Precision --> 0.96
Recall --> 0.6233766233766234
F1 --> 0.7559055118110236

Evaluating fold 4 of 10
Accuracy --> 0.705
Precision --> 0.94
Recall --> 0.6394557823129252
F1 --> 0.7611336032388665

Evaluating fold 5 of 10
Accuracy --> 0.69
Precision --> 0.94
Recall --> 0.6266666666666667
F1 --> 0.752

Evaluating fold 6 of 10
Accuracy --> 0.73
Precision --> 0.93
Recall --> 0.6642857142857143
F1 --> 0.775

Evaluating fold 7 of 10
Accuracy --> 0.74
Precision --> 0.95
Recall --> 0.6690140845070423
F1 --> 0.7851239669421488

Evaluating fold 8 of 10
Accuracy --> 0.715
Precision --> 0.93
Recall --> 0.6503496503496503
F1 --> 0.7654320987654321

Evaluating fold 9 of 10
Accuracy --> 0.77
Precision --> 0.96
Recall --> 0.6956521739130435
F1 

This F1 Score for SVM of **0.77** was worse than Baseline SVM

Once again, implementing Bigrams didn't guarantee an improvment in the performance of any individual algorithm but it did improve some, at the cost of a slower run time.

The next step was to examine what a move to Trigrams might do.