In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [4]:
data_train = pd.read_json('train.json')
data_val = pd.read_json('val.json')

In [5]:
def createBigramModel(data):
    bigrams = {}
    num_reviews, review_length = data.shape

    for r in range(num_reviews):
        prevWord = "<start>"

        for w in range(review_length):

            currWordStruct = data[w][r]

            if currWordStruct == None:
                break
                
            currWord = currWordStruct[0]
            currLang = currWordStruct[1]

            if w < (review_length - 1):
                nextWordStruct = data[w + 1][r]
                if nextWordStruct:

                    nextWord = nextWordStruct[0]
                    nextLang = nextWordStruct[1]
                    
                    toAppend = 0
                    if currLang != nextLang:
                        toAppend = 1
                        
                    if (prevWord, currWord) in bigrams.keys():
                        bigrams[(prevWord, currWord)].append(toAppend)
                    else:
                        bigrams[(prevWord, currWord)] = [toAppend]
            
            prevWord = currWord
    
    for key in bigrams.keys():
        bigrams[key] = np.mean(bigrams[key])
    
    return bigrams

In [6]:
model = createBigramModel(data_train)

In [27]:
def createLabelsBinary(data):
    labels = []
    num_reviews, review_length = data.shape

    for r in range(num_reviews):
        label_vec = []

        for w in range(review_length):

            currWordStruct = data[w][r]

            if currWordStruct == None:
                break
                
            currWord = currWordStruct[0]
            currLang = currWordStruct[1]

            if w < (review_length - 1):
                nextWordStruct = data[w + 1][r]
                if nextWordStruct:

                    nextWord = nextWordStruct[0]
                    nextLang = nextWordStruct[1]

                    if currLang != nextLang:
                        label_vec.append(1)
                        
                    else:
                        label_vec.append(0)

                else:
                    label_vec.append(0)
            else:
                label_vec.append(0)

        labels.append(label_vec)
    
    return labels
    

In [28]:
val_labels = createLabelsBinary(data_val)

In [33]:
def testBayes(model, testData, testLabels):
    num_reviews, review_length = testData.shape
    trueLabels = []
    predictLabels = []
    
    for r in range(num_reviews):
        
        reviewLabels = testLabels[r]
        prevWord = "<start>"
        
        for w in range(review_length):

            currWordStruct = testData[w][r]

            if currWordStruct == None:
                break
                
            currWord = currWordStruct[0]
            
            label = 0
            if (prevWord, currWord) in model.keys():
                prob = model[(prevWord, currWord)]
                label = np.random.choice([0, 1], p=[1 - prob, prob])
            
            trueLabels.append(reviewLabels[w])
            predictLabels.append(label)

    val_f1 = f1_score(trueLabels, predictLabels, average='binary')
    val_recall = recall_score(trueLabels, predictLabels, average='binary')
    val_precision = precision_score(trueLabels, predictLabels, average='binary')
    
    print("— val_f1: %f — val_precision: %f — val_recall %f" % (val_f1, val_precision, val_recall))

In [34]:
testBayes(model, data_val, val_labels)

— val_f1: 0.314140 — val_precision: 0.315106 — val_recall 0.313179
