In [67]:
import pandas as pd
import numpy as np
import nltk 
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from spacy.lang.en.stop_words import STOP_WORDS
import spacy
from tqdm import tqdm
import string
import collections
import math
from sklearn.cluster import KMeans
from scipy.spatial.distance import cosine
import warnings
warnings.filterwarnings('ignore')

In [68]:
# Download the NLTK 'wordnet' and 'punkt' resources (a no-op when they are already present).
nltk.download('wordnet')
nltk.download('punkt')

# Load the small English spaCy pipeline; this cell only reads its default stop-word list.
en = spacy.load('en_core_web_sm')

# Work on a private copy: `en.Defaults.stop_words` is a set owned by spaCy, and
# updating it in place would change the stop-word list for everything else that
# reads it in this kernel.
stopwords = set(en.Defaults.stop_words)

# Corpus-specific additions. 'nt', 've' and '' are presumably the residue left
# once tokenize() strips punctuation from contractions and punctuation-only
# tokens — confirm against the tokenizer output.
stopwords.update(['film', 'movie', 'nt', 'like', '', 've', 'films'])

[nltk_data] Downloading package wordnet to /home/joodotey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/joodotey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [69]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of once per word inside tokenize().
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def tokenize(document):
    """Turn a document into a flat list of cleaned word tokens.

    The document is split into sentences and then words with NLTK. Each word is
    kept only if its raw form is longer than two characters; it is then
    lower-cased, stripped of punctuation characters, and dropped if the result
    is empty or appears in the module-level `stopwords` set.

    Args:
        document: Text to tokenize. `None` or an empty string yields `[]`.

    Returns:
        list[str]: Cleaned tokens in document order (duplicates preserved).
    """
    # Nothing to tokenize; avoids handing None to the NLTK tokenizer.
    if not document:
        return []

    tokens = []
    for sentence in sent_tokenize(document):
        for raw_word in word_tokenize(sentence):
            # The length check is applied to the raw token, i.e. before
            # punctuation is removed, so a cleaned token may be shorter.
            if len(raw_word) <= 2:
                continue

            word = raw_word.lower().translate(_PUNCTUATION_TABLE)

            # A punctuation-only token becomes ''; drop it explicitly rather
            # than depending on '' having been added to `stopwords`.
            if word and word not in stopwords:
                tokens.append(word)
    return tokens

In [70]:
def get_top_50(dataset, top_n=50):
    """Return the texts whose embeddings lie closest to the K-Means centroid.

    A single-cluster K-Means (fixed `random_state=42`) is fitted on the
    `embedding` column, every row's cosine distance to its cluster centre is
    computed, and the texts are returned in order of increasing distance.

    Side effects on `dataset` (kept so callers see the same frame as before):
    adds the columns `cluster`, `centroid` and `distance`, and converts each
    `embedding` entry to a NumPy array. Row order of `dataset` is not changed.

    Args:
        dataset: DataFrame with an `embedding` column (one vector per row) and
            a `text` column.
        top_n: Maximum number of texts to return. Defaults to 50.

    Returns:
        list: Up to `top_n` values of `text`, closest to the centroid first.
        An empty `dataset` yields `[]` without fitting anything.
    """
    # K-Means cannot be fitted on zero samples; there is nothing to rank anyway.
    if len(dataset) == 0:
        return []

    embeddings = np.array(dataset.embedding.tolist())
    clusterer = KMeans(n_clusters=1, random_state=42)
    cluster_labels = clusterer.fit_predict(embeddings)

    dataset['cluster'] = pd.Series(cluster_labels, index=dataset.index).astype('int')
    # Store each row's own cluster centre next to it so the distance can be
    # computed row by row.
    dataset['centroid'] = dataset.cluster.apply(lambda label: clusterer.cluster_centers_[label])
    dataset['embedding'] = dataset['embedding'].apply(np.array)
    dataset['distance'] = dataset.apply(
        lambda row: cosine(row['centroid'], row['embedding']), axis=1
    )

    # Order by cluster, then by distance within the cluster. This replaces the
    # groupby().apply(sort_values) round-trip with a single two-key sort; rows
    # with exactly equal distances may come out in a different relative order.
    ranked = dataset.sort_values(by=['cluster', 'distance'])

    return ranked.text.tolist()[:top_n]

In [71]:
# Read the JSON file into a DataFrame. Other cells read its `predicted`,
# `embedding` and `text` columns.
baseline_path = 'data/updated_baseline.json'
dataset = pd.read_json(baseline_path)


In [72]:
# Split the rows by predicted label and keep, for each side, the texts closest
# to the embedding centroid.
# NOTE(review): the filter looks only at `predicted`, so these are genuine
# false positives/negatives only if the file contains misclassified rows
# exclusively — confirm against how the JSON was produced.
#
# .copy() hands get_top_50 an independent frame: it adds working columns to its
# argument, and doing that on a filtered slice of `dataset` is ambiguous
# (pandas flags it with SettingWithCopyWarning).
false_positives = dataset[dataset.predicted == 1.0].copy()
top_50_false_positives = get_top_50(false_positives)

false_negatives = dataset[dataset.predicted == 0.0].copy()
top_50_false_negatives = get_top_50(false_negatives)

In [73]:
# Flatten the selected texts of each group into one list of tokens.
# Despite the `_documents` suffix, each list holds individual tokens.
top_50_false_positives_documents = [
    token
    for document in top_50_false_positives
    for token in tokenize(document)
]

top_50_false_negatives_documents = [
    token
    for document in top_50_false_negatives
    for token in tokenize(document)
]


In [75]:
# Per-class token frequencies, keyed by class name.
unique_tokens = {
    'negatives': collections.defaultdict(int),
    'positives': collections.defaultdict(int),
}
# Token frequencies over both classes combined.
tokens_in_corpus = collections.defaultdict(int)

# Walk the positives first, then the negatives, updating the class counter and
# the combined counter together.
token_streams = (
    ('positives', top_50_false_positives_documents),
    ('negatives', top_50_false_negatives_documents),
)
for class_name, token_stream in token_streams:
    class_counts = unique_tokens[class_name]
    for token in token_stream:
        class_counts[token] += 1
        tokens_in_corpus[token] += 1

In [76]:
# Score every token by how strongly it is associated with the negatives versus
# the positives: a log-odds ratio in which the combined-corpus counts act as a
# prior, divided by its approximate standard deviation to give a z-score.
# delta > 0 / z > 0 means the token is relatively more frequent in the negatives.

# Vocabulary sizes (number of distinct tokens). They are not used in the
# computation below; the names are kept in case other cells read them.
len_negatives = len(unique_tokens['negatives'])
len_positives = len(unique_tokens['positives'])
len_total = len(tokens_in_corpus)

# Total token counts. The odds denominators need "all other tokens", i.e. the
# total count minus this word's count. Using vocabulary sizes there can make
# the denominator zero or negative for frequent words, which makes the
# logarithm fail.
n_negatives = sum(unique_tokens['negatives'].values())
n_positives = sum(unique_tokens['positives'].values())
n_total = sum(tokens_in_corpus.values())

delta = collections.defaultdict(float)
variance = collections.defaultdict(float)
z_score = collections.defaultdict(float)

# tokens_in_corpus holds every token from either class, so iterating it visits
# the same words as the union of both class vocabularies, in a stable order.
for word in tokens_in_corpus:

    # .get() avoids inserting zero-count entries into the per-class counters.
    positive_count = unique_tokens['positives'].get(word, 0)
    negative_count = unique_tokens['negatives'].get(word, 0)
    total_count = tokens_in_corpus[word]

    negative_rest = n_negatives + n_total - negative_count - total_count
    positive_rest = n_positives + n_total - positive_count - total_count

    # Degenerate case: the corpus consists of this one word only, so the odds
    # are undefined. Leave the word without a score.
    if negative_rest <= 0 or positive_rest <= 0:
        continue

    # Natural log: the 1/count variance approximation below is for natural
    # log-odds, so the base must match for the z-score to be on a normal scale.
    first_log = math.log((negative_count + total_count) / negative_rest)
    second_log = math.log((positive_count + total_count) / positive_rest)

    delta[word] = first_log - second_log

    first_fraction = 1 / (negative_count + total_count)
    second_fraction = 1 / (positive_count + total_count)

    variance[word] = first_fraction + second_fraction

    # A z-score divides by the standard deviation, not by the variance.
    z_score[word] = delta[word] / math.sqrt(variance[word])