In [1]:
import csv
import string
import re
import math 
import random
from random import shuffle

from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVR
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import NuSVC

from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

from nltk.tag import CRFTagger
from nltk import pos_tag
from nltk.classify import SklearnClassifier
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings
warnings.filterwarnings('always') # always show warnings, useful for SK Learning warnings

In [2]:
# import our n-gram language module code from ngrammodel.py
import ngrammodel as ngram

# to refresh our module if changed
%load_ext autoreload
%autoreload 1
%reload_ext autoreload

In [3]:
# GLOBAL PARAMETERS

# Feature switches read as module-level globals further down the notebook:
# ngramGender / ngramCharacter are checked by addNGramFeatures() and by the
# cells that decide whether to call it; sentenceLength / stopsRemoved are
# checked by toFeatureVector().
ngramGender = False # enable perplexity from N-gram models on gender to be added as features
ngramCharacter = False # enable character N-gram model features
sentenceLength = False # add length of sentence as feature
stopsRemoved = False # add number of stop words removed as a feature

In [4]:
# featureDict accumulates a corpus-wide count for every token passed through
# toFeatureVector(); it keeps growing across calls and is never reset here.
featureDict = {} # a global dictionary of features
# CSV files read by loadData(); paths are relative to the notebook's working directory
trainPath = 'training.csv'
testPath = 'test.csv'

def parseLine(line):
    """Pick the first three columns of a CSV row.

    Column 0 is the text/script line, column 1 the character name and
    column 2 the gender. Any further columns are ignored; a row with fewer
    than three columns raises IndexError.
    """
    text = line[0]
    name = line[1]
    gender = line[2]
    return (text, name, gender)

def loadData(path):
    """Read a CSV file into a list of (Text, Name, Gender) tuples.

    The file is decoded as latin-1 and parsed with csv.reader using ',' as
    the delimiter. It is opened with newline='' as the csv module requires,
    so line breaks inside quoted fields are handled by the reader itself.
    Completely empty rows (e.g. a trailing blank line) are skipped instead
    of failing in parseLine().
    """
    data = []
    with open(path, encoding="latin-1", newline="") as file:
        for line in csv.reader(file, delimiter=','):
            if not line:
                # csv.reader yields [] for a blank line; there is nothing to parse
                continue
            data.append(parseLine(line))
        print("All data loaded from " + path)
    return data

In [5]:
# list of (Text, Name, Gender) tuples read from the training CSV
trainData = loadData(trainPath)

All data loaded from training.csv


In [6]:
def preProcess(text, punc=1, stem=1, stop=2, lemma=0, bigrams=0, crftag=3):
    """Process one sentence and return its tokens plus some side information.

    Parameters:
    @text    - the raw sentence
    @punc    - 1 to append punctuation marks, full stops and '...' pauses as features
    @stem    - 1 to reduce tokens to stems with stemTokens()
    @stop    - remove stop words with stopTokens(): 1 = SKLearn list, 2 = NLTK list, 0 = off
    @lemma   - 1 to lemmatize with lemmaTokens() (only to be used in addition with crftag=3)
    @bigrams - 1 to replace the unigram tokens with space-joined bigrams
    @crftag  - tagging method passed to crftagging(); 0 disables POS tag features

    Returns a tuple (tokens, stopsRemoved, sentLength, ngramTokens):
    tokens       - the final feature tokens
    stopsRemoved - how many tokens the stop word removal dropped
    sentLength   - number of basic tokens, counted before stemming and stop word removal
    ngramTokens  - the basic lowercased tokens, for the n-gram language models
    """
    # save a list of the punctuation marks before removal, so they can be re-added as features
    regexPunc = re.compile('[^_\'&+?£:;!,-]')
    puncList = regexPunc.sub('', text)

    # count the number of pauses in the sentence
    pauses = text.count("...")

    # count the number of full stops that are not pauses ('...'):
    # matches only a '.' that is neither preceded nor followed by another '.'
    # (raw string, so the backslashes are not treated as invalid string escapes)
    fullStops = re.findall(r'(?<!\.)(?!\.\.)([.])', text)

    # replace unwanted characters with whitespace (apostrophes are kept for now)
    text = re.sub('[^a-zA-Z0-9\' ]', ' ', text)

    # now remove apostrophes without inserting whitespace, so "don't" becomes "dont"
    text = re.sub('[^a-zA-Z0-9 ]', '', text)

    # lowercase for consistency and split by whitespace
    tokens = text.lower().split()

    # keep an independent copy of the basic tokens for the n-gram language models;
    # without the copy, the punctuation / POS tag features appended below would also
    # end up in this list whenever no stemming, bigram or stop word step replaces `tokens`
    ngramTokens = list(tokens)

    # get a list of our POS tags to be added later (tagged on the unstemmed tokens)
    if (crftag != 0):
        crftags = crftagging(tokens, crftag)

    sentLength = len(tokens)

    # stem words using NLTK
    if (stem == 1):
        tokens = stemTokens(tokens)

    # bigrams
    if (bigrams == 1):
        tokens = [tokens[i] + " " + tokens[i+1] for i in range(len(tokens)-1)]

    # keep a count of tokens before stop removal
    tokenCount = len(tokens)

    # remove stop words using SKLearn (1) or NLTK (2)
    if (stop == 1):
        tokens = stopTokens(tokens, 1)
    if (stop == 2):
        tokens = stopTokens(tokens, 2)

    # calculate how many stop words we removed
    # (named differently from the global `stopsRemoved` flag to avoid shadowing it)
    removedStopCount = tokenCount - len(tokens)

    # implement lemma with NLTK
    if (lemma == 1):
        tokens = lemmaTokens(tokens)

    if (punc == 1):
        # append our punctuation characters as features
        for x in puncList:
            tokens.append(x)

        # append our full stops as features (i.e. rough encoding of number of sentences per line)
        for y in fullStops:
            tokens.append(y)

        # append the correct number of pauses as features
        for i in range(pauses):
            tokens.append("...")

    # merge our POS tag list with our token list if required
    if (crftag != 0):
        tokens += crftags

    return tokens, removedStopCount, sentLength, ngramTokens
    
# simplify words into their stems using the SnowballStemmer from NLTK
def stemTokens(tokens):
    """Return a new list with each token reduced to its stem by NLTK's English SnowballStemmer."""
    snowball = SnowballStemmer("english")
    return [snowball.stem(word) for word in tokens]

# remove stop words
# lib = 1, use SKLearn
# lib = 2, use NLTK
def stopTokens(tokens, lib):
    """Return the tokens that are not English stop words, in their original order.

    @tokens - list of (lowercased) tokens
    @lib    - which stop word list to use: 1 = SKLearn, 2 = NLTK

    Raises ValueError for any other value of lib.
    """
    if (lib == 1):
        # Filter against SKLearn's built-in English stop word list directly.
        # Fitting a TfidfVectorizer on the tokens (the previous approach) returned the
        # sorted, de-duplicated vocabulary instead of the token sequence, and failed
        # outright when every token was a stop word.
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
        stop_words = ENGLISH_STOP_WORDS
    elif (lib == 2):
        # type set(stopwords.words('english')) in interpreter to see list of words
        stop_words = set(stopwords.words('english'))
    else:
        raise ValueError("stopTokens: lib must be 1 (SKLearn) or 2 (NLTK), got %r" % (lib,))

    return [w for w in tokens if w not in stop_words]

# Lemmatization of tokens with NLTK
# input is a plain list of tokens; they are tagged here via crftagging() with tagger=3 and combined=1
def lemmaTokens(tokens):
    """Lemmatize tokens with NLTK's WordNetLemmatizer, using their POS tags.

    The tokens are tagged with the universal tagset; NOUN, VERB, ADJ and ADV
    are mapped to the matching WordNet part of speech and every other tag
    falls back to NOUN. Returns a new list of lemmas in the same order.
    """
    taggedTokens = crftagging(tokens, tagger=3, combined=1)

    # A lookup table replaces the earlier chain of independent `if` statements, whose
    # final `else` overwrote every tag except ADV with NOUN.
    wordnetPOS = {
        "NOUN": wordnet.NOUN,
        "VERB": wordnet.VERB,
        "ADJ": wordnet.ADJ,
        "ADV": wordnet.ADV,
    }
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(pair[0], wordnetPOS.get(pair[1], wordnet.NOUN))
            for pair in taggedTokens]

# returns a list of the POS tags, using one of 3 tagging methods
# parameters:
# @tokens - list of words to be tagged
# @tagger - tagging method as below
# @combined - whether to return a list of [(token,tag)] tuples, to pass to lemmaTokens() function
def crftagging(tokens, tagger=3, combined=0):
    """Tag tokens with part-of-speech tags.

    @tagger selects the method:
      1 - NLTK CRFTagger loaded from the model file "trainedCRFtagger"
      2 - nltk.pos_tag with its default tagset
      3 - nltk.pos_tag with the universal tagset of Petrov, Das, & McDonald
    @combined: 1 returns the (token, tag) pairs, anything else only the tags.

    Raises ValueError if tagger is not 1, 2 or 3.
    """
    if tagger == 1:
        TAGGER_PATH = "trainedCRFtagger"   # pre-trained POS-tagger
        # use a separate name so the `tagger` argument is not overwritten
        crfTagger = CRFTagger()
        crfTagger.set_model_file(TAGGER_PATH)
        taggedTokens = crfTagger.tag(tokens)
    elif tagger == 2:
        taggedTokens = pos_tag(tokens)
    elif tagger == 3:
        taggedTokens = pos_tag(tokens, tagset='universal')
    else:
        raise ValueError("crftagging: tagger must be 1, 2 or 3, got %r" % (tagger,))

    if (combined == 1):
        return taggedTokens
    return [pair[1] for pair in taggedTokens]

  fullStops = re.findall('(?<!\.)(?!\.\.)([.])', text)


In [7]:
# toFeatureVector - return a weighted dictionary containing text features as keys and weights as values
# takes the tuple returned by preProcess() as input
#
# preProcessed[0] - tokens/features to be counted
# preProcessed[1] - the amount of stop words removed
# preProcessed[2] - total sentence length
# preProcessed[3] - list of basic preProcessed words for our language models

def toFeatureVector(preProcessed):
    """Build a {feature: count} dictionary from the tuple returned by preProcess().

    Every token in preProcessed[0] is counted both in the returned dictionary
    and in the global featureDict. If the global flags stopsRemoved /
    sentenceLength are enabled, preProcessed[1] / preProcessed[2] are added
    under the keys "stopsRemoved" / "sentenceLength".
    """
    counts = {}
    for feature in preProcessed[0]:
        # per-sentence count
        counts[feature] = counts.get(feature, 0) + 1
        # corpus-wide count
        featureDict[feature] = featureDict.get(feature, 0) + 1

    if stopsRemoved == True:
        counts["stopsRemoved"] = preProcessed[1]
    if sentenceLength == True:
        counts["sentenceLength"] = preProcessed[2]

    return counts

In [8]:
# preProcess and add featureVectors for all data
def processVector(rawData):
    """Turn (Text, Name, Gender) rows into (featureVector, Name, Gender, ngramTokens) rows.

    Each text is run through preProcess() with its default options and the
    result through toFeatureVector(); the basic n-gram tokens
    (preProcess output index 3) are carried along as the fourth element.
    """
    rows = []
    for (sentence, speaker, gender) in rawData:
        processed = preProcess(sentence)
        features = toFeatureVector(processed)
        rows.append((features, speaker, gender, processed[3]))
    return rows

# Training N-gram language models with Kneser-Ney smoothing

The code below trains n-gram language models using Kneser-Ney (KN) smoothing. The majority of the code for this is in ./ngrammodel.py and is imported into this notebook.

The idea is to split the data by label, e.g. male/female, and train an n-gram model on each. For each sentence, the perplexity of that sentence under each language model is added as a feature. In theory this allows us to encode the typical speech patterns, phrases and words of male and female speakers, or of each character. If a particular sentence has a lower perplexity under one language model, it matches that model more closely than the others.

The KN smoothing takes two parameters, order and discount. In order to tune for these parameters below, a simple training/heldout split is used to test different values.

Since perplexity has to be added as a feature to every sentence, a cross-validation method is used in the function that does this (addNGramFeatures). If the sentence whose perplexity we want is included in the training data of the language model, we will get a very low perplexity, as this sentence has been "seen" before. Therefore, to tag the entire training data with perplexities while still ensuring our language models are sufficiently trained, a 5-fold cross-validation method is implemented: we train on 4/5 of the data and tag the remaining 1/5, over 5 iterations, to tag the entire set. This is preferable to, say, training on 50% of the data and tagging the remaining 50% and vice versa, as the language model is trained on more data each time.

This allows us to have accurate perplexity features when we're running cross-validation on the classifier. For the final classification on the test data, the language models are trained on the entire training data.

In [9]:
# get basic pre-processed text (i.e. lowercase, removed punctuation and encoding errors) 
# from the training data to train our ngram language models 
def preProcessNGram(rawData):
    """Return (preProcess result, Name, Gender) rows using only basic normalisation.

    preProcess() is called with punctuation features, stemming, stop word
    removal, lemmatization, bigrams and POS tagging all switched off. The
    first element of each row is the full tuple returned by preProcess().
    """
    return [
        (preProcess(sentence, punc=0, stem=0, stop=0, lemma=0, bigrams=0, crftag=0), speaker, gender)
        for (sentence, speaker, gender) in rawData
    ]

In [10]:
def processNGram(ngramData):
    """Train a Kneser-Ney model on ngramData and return (trainedKN, vocab).

    Uses the ngrammodel helpers: countUnigrams() builds the unigram counts,
    minDocFrequency(..., 2) derives the vocabulary from them (per the original
    notes, words seen only once are replaced with </unk>), and trainKN()
    trains on the data with that vocabulary. The n-gram order is read from
    the global `order` at call time.
    """
    unigramCounts = ngram.countUnigrams(ngramData)
    vocab = ngram.minDocFrequency(unigramCounts, 2)
    return (ngram.trainKN(ngramData, vocab, order), vocab)

In [11]:
def getSentencePerplexity(sentences, tupleKN):
    """Return the perplexity of sentences under a trained model, truncated to an int.

    tupleKN is the (trainedKN, vocab) pair returned by processNGram(). The
    globals `order` and `discount` are read at call time and passed on to
    ngram.perplexityKN().
    """
    vocab = tupleKN[1]
    trainedKN = tupleKN[0]
    perplexity = ngram.perplexityKN(sentences, vocab, order, discount, False, trainedKN)
    return math.trunc(perplexity)

In [12]:
# rows of (preProcess result, Name, Gender) with only basic normalisation applied
ngramData = preProcessNGram(trainData)

In [13]:
# specify the order and discount parameters to use for the Kneser-Ney smoothing
# (read as globals by processNGram() and getSentencePerplexity())
order = 5
discount = 0.9

In [14]:
# basic 80/20 training/holdout split for tuning KN parameters
maleData = []
femaleData = []
# build a list of male/female only sentences to train our ngram language model
# x is a (preProcess result, Name, Gender) row, so x[0][0] is the token list
for x in ngramData:
    if x[2] == 'male':
        maleData.append(x[0][0])
    if x[2] == 'female':
        femaleData.append(x[0][0])

# split into training and heldout (first 80% / last 20%, no shuffling)
maleTrain = maleData[0:int(len(maleData)*0.8)]
maleHeldout = maleData[int(len(maleData)*0.8):]

femaleTrain = femaleData[0:int(len(femaleData)*0.8)]
femaleHeldout = femaleData[int(len(femaleData)*0.8):]

# each result is a (trainedKN, vocab) tuple, trained with the current global `order`
maleKN = processNGram(maleTrain)
femaleKN = processNGram(femaleTrain)

In [15]:
# testing different order and discount values for the male/female models
# discountList = [0.2, 0.4, 0.6, 0.8, 0.9, 0.925, 0.95] 
discountList = [0.7, 0.8, 0.9] # discounts to attempt for each order

# NOTE(review): maleKN/femaleKN are not retrained inside this loop - they keep the
# `order` that was set when processNGram() ran; only the values passed to
# perplexityKN change here. Confirm this is the intended evaluation.
# The globals `order` and `discount` are left at the last values tried.
for i in range(3,5): # orders 3,4
    order = i
    for j in discountList:
        discount = j
        print("male: ", getSentencePerplexity(maleHeldout, maleKN))
        print("female: ", getSentencePerplexity(femaleHeldout, femaleKN))
        print("order: ", i, " discount: ", j, "\n")

male:  115
female:  115
order:  3  discount:  0.7 

male:  110
female:  110
order:  3  discount:  0.8 

male:  109
female:  109
order:  3  discount:  0.9 

male:  121
female:  120
order:  4  discount:  0.7 

male:  112
female:  112
order:  4  discount:  0.8 

male:  109
female:  109
order:  4  discount:  0.9 



# KN tuning results

As can be seen below, the optimal values for order and discount were found to be 5 and 0.9 respectively.

| discount | 1       | 2        | 3        | 4        | 5       | 6        |
|----------|---------|----------|----------|----------|---------|----------|
| 0\.2   | 228\.65 | 134\.19  | 206\.77  | 273\.66  | 321\.98 | 368\.54  |
| 0\.4   | 228\.65 | 115\.31  | 137\.18  | 158\.56  | 173\.44 | 187\.14  |
| 0\.6   | 228\.65 | 107\.72  | 111\.43  | 119\.20  | 124\.89 | 130\.11  |
| 0\.8   | 228\.65 | 105\.46  | 100\.46  | 102\.11  | 103\.81 | 105\.45  |
| 0\.9   | 228\.65 | 106\.61  | 99\.44   | **99\.33**   | 99\.84  | 100\.41  |
| 0\.925 | 228\.65 | 107\.335 | 99\.925  | 99\.505  | 99\.785 | 100\.125 |
| 0\.95  | 228\.65 | 108\.39  | 100\.955 | 100\.295 | 100\.37 | 100\.515 |
| 0\.975 | 228\.65 | 110\.015 | 102\.95  | 102\.195 | 102\.12 | 102\.095 |


In [16]:
# process all the training data in accordance with the preProcess and feature vector parameters
processedTrainData = processVector(trainData)
# show one processed row: (featureVector, Name, Gender, ngramTokens)
processedTrainData[4]

({'oscar': 1, 'asleep': 1, "'": 1, '.': 1, 'NOUN': 1, 'VERB': 1},
 'MAX',
 'male',
 ['oscars', 'asleep'])

In [17]:
# reset the order and discount parameters to use for the Kneser-Ney smoothing
# (the tuning loop above overwrites both globals)
order = 5
discount = 0.9

In [18]:
def _trainPerplexityModels(rows):
    """Train the language models enabled by the global flags on rows.

    Returns (genderModels, characterModels). genderModels is a
    (maleKN, femaleKN) pair, or None when ngramGender is off;
    characterModels maps each character name to its trained model, or is
    None when ngramCharacter is off. Each model is the tuple returned by
    processNGram().
    """
    genderModels = None
    if ngramGender == True:
        # the ngram training sentence is at index 3 of each row, the gender at index 2
        maleLines = [row[3] for row in rows if row[2] == 'male']
        femaleLines = [row[3] for row in rows if row[2] == 'female']
        genderModels = (processNGram(maleLines), processNGram(femaleLines))

    characterModels = None
    if ngramCharacter == True:
        charList = ["SEAN", "SHIRLEY", "MAX", "IAN", "MINTY", "RONNIE", "STACEY", "JANE", "PHIL", "CLARE", "TANYA", "HEATHER", "GARRY", "BRADLEY", "CHRISTIAN", "STEVEN", "ROXY", "JACK"]
        # collect all lines attributed to each character
        # (a name that is not in charList raises KeyError)
        linesByCharacter = {name: [] for name in charList}
        for _, name, _, ngramLine in rows:
            linesByCharacter[name].append(ngramLine)
        characterModels = {name: processNGram(lines) for name, lines in linesByCharacter.items()}

    return genderModels, characterModels


def _appendPerplexityFeatures(rows, genderModels, characterModels):
    """Return copies of rows whose feature vectors carry the perplexity features.

    The original feature vectors are copied, not amended. "male"/"female"
    keys are added when genderModels is given, and one key per character
    name when characterModels is given.
    """
    taggedRows = []
    for row in rows:
        featureVector = row[0].copy()
        ngramLine = row[3]

        if genderModels is not None:
            maleKN, femaleKN = genderModels
            featureVector["male"] = getSentencePerplexity([ngramLine], maleKN)
            featureVector["female"] = getSentencePerplexity([ngramLine], femaleKN)

        if characterModels is not None:
            for name, model in characterModels.items():
                featureVector[name] = getSentencePerplexity([ngramLine], model)

        taggedRows.append((featureVector, row[1], row[2], row[3]))
    return taggedRows


def addNGramFeatures(trainData, folds, testData=None, testing=False):
    """Add n-gram perplexity features to processed rows.

    Rows are (featureVector, Name, Gender, ngramTokens) tuples as produced by
    processVector(). Which features are added is controlled by the global
    flags ngramGender and ngramCharacter.

    If testing=False (default), the training data is tagged with a
    cross-validation scheme: the data is cut into `folds` consecutive
    slices, and each slice is tagged by models trained on all the other
    rows, so no sentence is scored by a model that has seen it. When the
    data does not divide exactly, the last slice also takes the leftover
    rows, so every row of trainData appears in the result (previously the
    leftover rows were silently dropped).

    If testing=True, the models are trained on all of trainData and the
    perplexity features are added to testData.

    Raises ValueError when testing=True without testData, or when there are
    fewer rows than folds.
    """
    if testing:
        if testData is None:
            raise ValueError("addNGramFeatures: testData is required when testing=True")
        genderModels, characterModels = _trainPerplexityModels(trainData)
        return _appendPerplexityFeatures(testData, genderModels, characterModels)

    total = len(trainData)
    if folds < 1 or total < folds:
        raise ValueError("addNGramFeatures: need at least one row per fold "
                         "(rows=%d, folds=%r)" % (total, folds))

    foldSize = total // folds
    newData = []
    for fold in range(folds):
        start = fold * foldSize
        # the final fold absorbs the rows left over by the integer division
        end = start + foldSize if fold < folds - 1 else total
        crossTest = trainData[start:end]
        crossTrain = trainData[:start] + trainData[end:]

        # train our language models only on the crossTrain data, then tag crossTest
        genderModels, characterModels = _trainPerplexityModels(crossTrain)
        newData.extend(_appendPerplexityFeatures(crossTest, genderModels, characterModels))

    return newData

In [19]:
# add gender/character ngram perplexities if required by the global vars,
# using 5 folds; otherwise the processed rows are used unchanged
if(ngramGender == True or ngramCharacter == True):
    finalTrainData = addNGramFeatures(processedTrainData, 5)
else:
    finalTrainData = processedTrainData

# Classifier selection

In [20]:
# selection of classifier
def trainClassifier(trainData):
    """Train and return an NLTK SklearnClassifier wrapping ComplementNB on trainData.

    The commented-out lines keep the alternative classifiers that were tried.
    """
    print("Training Classifier...")
    #return SklearnClassifier(LinearSVC(C=0.05, max_iter=10000, loss='squared_hinge', class_weight='balanced')).train(trainData)
    #return SklearnClassifier(NuSVC(cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf', max_iter=-1, nu=0.5, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)).train(trainData)
    #return SklearnClassifier(BernoulliNB()).train(trainData)
    #return SklearnClassifier(MultinomialNB(alpha=1.0, fit_prior=False, class_prior=None)).train(trainData)
    estimator = ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)
    wrapped = SklearnClassifier(estimator)
    return wrapped.train(trainData)

In [21]:
def crossValidate(dataset, folds):
    """Run k-fold cross-validation and return one weighted score tuple per fold.

    @dataset - list of (featureVector, label) pairs
    @folds   - number of folds

    The data is shuffled once, on a copy, so the caller's list keeps its
    order. Each fold of int(len(dataset)/folds) rows is held out in turn and
    a fresh classifier is trained on everything else; rows left over by the
    integer division are never held out and therefore always belong to the
    training part. Every entry of the returned list is the result of
    precision_recall_fscore_support(..., average='weighted').
    """
    shuffled = list(dataset)
    shuffle(shuffled) # shuffle only once before implementing k-folds

    cv_results = []
    foldSize = int(len(shuffled)/folds)
    remainder = len(shuffled) % folds

    for i in range(0, len(shuffled)-remainder, foldSize):
        crossTest = shuffled[i:i+foldSize]                     # the held-out fold
        crossTrain = shuffled[:i] + shuffled[i+foldSize:]      # everything before and after it

        # train classifier on the training part and predict the held-out fold
        classifier = trainClassifier(crossTrain)
        crossPredictions = predictLabels(crossTest, classifier)

        # the actual gender/character labels of the held-out fold
        crossActual = [x[1] for x in crossTest]

        # weighted scores account for the imbalance between labels
        result = precision_recall_fscore_support(crossActual, crossPredictions, average='weighted')
        cv_results.append(result)

    return cv_results

In [22]:
# our current data includes both gender and character labels - remove the character/gender labels
def splitGender(processedData):
    """Reduce (features, Name, Gender, ngramTokens) rows to (features, Gender) pairs."""
    return [(features, gender) for (features, _name, gender, _ngramTokens) in processedData]

def splitCharacter(processedData):
    """Reduce (features, Name, Gender, ngramTokens) rows to (features, Name) pairs."""
    return [(features, name) for (features, name, _gender, _ngramTokens) in processedData]

# (featureVector, label) pairs for the two classification tasks
genderTrain = splitGender(finalTrainData)
characterTrain = splitCharacter(finalTrainData)

In [23]:
def predictLabels(reviewSamples, classifier):
    """Classify the feature vector (element 0) of every sample with classifier.classify_many()."""
    featureVectors = (sample[0] for sample in reviewSamples)
    return classifier.classify_many(featureVectors)

# Cross validation results

In [24]:
# An example of the features before running the classifier
# (featureVector, Name, Gender) of one training row
print(finalTrainData[115][0:3])
# featureDict holds one entry per distinct token seen by toFeatureVector() so far
print("Number of features: ", len(featureDict))

({'wed': 1, 'well': 1, 'caus': 1, 'shirley': 3, 'go': 1, 'bridesmaid': 1, "'": 1, ',': 1, '!': 1, '...': 2, 'PRON': 4, 'ADV': 1, 'DET': 1, 'NOUN': 6, 'VERB': 4, 'ADJ': 1, 'PRT': 1, 'CONJ': 1}, 'HEATHER', 'female')
Number of features:  4172


In [25]:
# 10-fold cross-validation on the gender labels
cv_results = crossValidate(genderTrain, 10)

# sum the weighted precision / recall / f-score of every fold
total = [0,0,0]
for x in cv_results:
    total[0] += x[0] # precision
    total[1] += x[1] # recall
    total[2] += x[2] # f score

# averages
total[0] = total[0]/len(cv_results)
total[1] = total[1]/len(cv_results)
total[2] = total[2]/len(cv_results)

print("Average: ", total)


Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Average:  [0.5858970714303553, 0.585459940652819, 0.5854145001898793]


In [26]:
# 10-fold cross-validation on the character labels
cv_results = crossValidate(characterTrain, 10)

# sum the weighted precision / recall / f-score of every fold
total = [0,0,0]
for x in cv_results:
    total[0] += x[0] # precision
    total[1] += x[1] # recall
    total[2] += x[2] # f score

# averages
total[0] = total[0]/len(cv_results)
total[1] = total[1]/len(cv_results)
total[2] = total[2]/len(cv_results)

print("Average: ", total)

Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Average:  [0.2150325481175268, 0.21899109792284865, 0.20906719146529662]


# Final classification on the test data

In [27]:
# train the classifier on trainData and predict on testData
def testPrediction(testData, trainData):
    """Train on trainData, predict testData and score the predictions.

    Both arguments are lists of (featureVector, label) pairs. Returns
    (result, report): result is the weighted
    precision_recall_fscore_support tuple, report the text produced by
    classification_report.
    """
    model = trainClassifier(trainData)
    predicted = predictLabels(testData, model)

    # the gold gender/character labels of the test rows
    gold = [sample[1] for sample in testData]

    # the per-label report first, then the weighted summary
    # (weighted to account for the imbalance between labels)
    report = classification_report(gold, predicted, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False)
    result = precision_recall_fscore_support(gold, predicted, average='weighted')
    return result, report

In [28]:
# load the testData
testData = loadData(testPath)

# process the testData (this also adds the test tokens to the global featureDict)
processedTest = processVector(testData)

# add ngrams to testData based on language model trained on entire training data if required
# (with testing=True the folds argument is not used for splitting)
if(ngramGender == True or ngramCharacter == True):
    finalTest = addNGramFeatures(finalTrainData, 5, testData=processedTest, testing=True)
else:
    finalTest = processedTest

# split into genderData and characterData
genderTest = splitGender(finalTest)
characterTest = splitCharacter(finalTest)

All data loaded from test.csv


In [29]:
# train on the full training data and evaluate on the test data, once per task
genderResult, genderReport = testPrediction(genderTest, genderTrain)

charResult, charReport = testPrediction(characterTest, characterTrain)

# per-label report followed by the weighted (precision, recall, f-score, support) tuple
print(genderReport)
print(genderResult)
print(charReport)
print(charResult)


Training Classifier...
Training Classifier...
              precision    recall  f1-score   support

      female       0.55      0.59      0.57       526
        male       0.61      0.58      0.60       598

    accuracy                           0.58      1124
   macro avg       0.58      0.58      0.58      1124
weighted avg       0.58      0.58      0.58      1124

(0.5837706014487268, 0.5818505338078291, 0.5822786465035372, None)
              precision    recall  f1-score   support

     BRADLEY       0.20      0.15      0.17        41
   CHRISTIAN       0.27      0.17      0.21        46
       CLARE       0.13      0.23      0.17        31
       GARRY       0.13      0.04      0.06        48
     HEATHER       0.23      0.33      0.27        42
         IAN       0.23      0.29      0.25       101
        JACK       0.25      0.18      0.21        85
        JANE       0.38      0.26      0.31        76
         MAX       0.19      0.29      0.23        73
       MINTY       

# Baseline performance and general data evaluation functions

In [30]:
# calculate our baseline performance with random data
def calculateBaseline(testData, trainData, predList):
    """Score a random-guess baseline on testData.

    One label is drawn uniformly from predList with random.choice for every
    test row; trainData is accepted but not used. Returns (result, report)
    in the same form as testPrediction().
    """
    # one random guess per test row
    guesses = [random.choice(predList) for _ in range(len(testData))]

    # the gold gender/character labels of the test rows
    gold = [sample[1] for sample in testData]

    # the per-label report first, then the weighted summary
    # (weighted to account for the imbalance between labels)
    report = classification_report(gold, guesses, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False)
    result = precision_recall_fscore_support(gold, guesses, average='weighted')
    return result, report

In [31]:
# label sets the random baseline draws from
charList = ["SEAN", "SHIRLEY", "MAX", "IAN", "MINTY", "RONNIE", "STACEY", "JANE", "PHIL", "CLARE", "TANYA", "HEATHER", "GARRY", "BRADLEY", "CHRISTIAN", "STEVEN", "ROXY", "JACK"]
genderList = ["male", "female"]

# no random seed is set, so these baseline numbers change from run to run
gblResult, gblReport = calculateBaseline(genderTest, genderTrain, genderList)

cblResult, cblReport = calculateBaseline(characterTest, characterTrain, charList)

print(gblReport)
print(gblResult)
print(cblReport)
print(cblResult)

              precision    recall  f1-score   support

      female       0.47      0.52      0.50       526
        male       0.54      0.49      0.51       598

    accuracy                           0.50      1124
   macro avg       0.51      0.51      0.50      1124
weighted avg       0.51      0.50      0.50      1124

(0.5077182619991688, 0.5044483985765125, 0.5047838268976055, None)
              precision    recall  f1-score   support

     BRADLEY       0.02      0.02      0.02        41
   CHRISTIAN       0.07      0.09      0.08        46
       CLARE       0.02      0.03      0.02        31
       GARRY       0.07      0.08      0.07        48
     HEATHER       0.03      0.05      0.04        42
         IAN       0.05      0.03      0.04       101
        JACK       0.08      0.06      0.07        85
        JANE       0.05      0.04      0.04        76
         MAX       0.05      0.04      0.04        73
       MINTY       0.00      0.00      0.00        51
        PHI

In [32]:
# count the training lines per character so we can assess our performance
charDict = {}
for _, char in characterTrain:
    charDict[char] = charDict.get(char, 0) + 1

print("number of unique characters: ", len(charDict))
print(charDict)

number of unique characters:  18
{'STACEY': 619, 'PHIL': 500, 'CHRISTIAN': 397, 'IAN': 935, 'TANYA': 1276, 'ROXY': 411, 'JANE': 675, 'BRADLEY': 332, 'MINTY': 454, 'MAX': 714, 'HEATHER': 469, 'STEVEN': 298, 'SHIRLEY': 620, 'GARRY': 340, 'JACK': 591, 'CLARE': 416, 'SEAN': 520, 'RONNIE': 546}
