In [26]:
import numpy as np
import pandas as pd
import spacy

In [3]:
data = pd.read_csv('../../events/group_all_labelled.csv')
df = data.loc[data.reviewed]

In [4]:
import gensim 
from gensim.models import Word2Vec 
from gensim.models.phrases import Phrases, Phraser
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def get_tokens(doc):
    return [w.lemma_ for w in nlp(doc) if (w.is_alpha and not w.is_stop)]

def get_vectors(df, max_epochs = 100, vec_size = 20, alpha = 0.025, model_path="d2v.model"):
    tagged_data = [TaggedDocument(words=row.tokens, tags=[row.event_id]) for row in df.itertuples()]
    model = Doc2Vec(size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm =1)
    model.build_vocab(tagged_data)
    for epoch in range(max_epochs):
        model.train(tagged_data, total_examples=model.corpus_count, epochs=model.iter)
        model.alpha -= 0.0002
        model.min_alpha = model.alpha
        
    model.save(model_path)
    vecs = []

    for event_id in df['event_id']:
        try:
            vec = model.docvecs[event_id]
        except:
            vec = np.nan
        vecs.append(vec)
        
    return vecs

def to_list(x):
    if isinstance(x, str):
        return x.split(',')
    else:
        return ['unknown']

In [5]:
nlp = spacy.load('en_core_web_lg')

In [6]:
#Split training/test set
ylabels = df['Near Miss Event'].astype(int)

X = df.drop(columns = ['group','filename','sentence_text' ,'Near Miss Event'])

# Event Text Vector
X['tokens'] = X['event_text'].apply(get_tokens)
X['event_text_vector'] = get_vectors(X)

  model.train(tagged_data, total_examples=model.corpus_count, epochs=model.iter)


In [59]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

import random
random.seed(123) # Why does this keep changing?
X_train, X_test, y_train, y_test = train_test_split(X['event_text_vector'], ylabels, test_size=0.3)

X_train_vector = pd.DataFrame(X_train.tolist(), index=X_train.index)
X_test_vector = pd.DataFrame(X_test.tolist(), index=X_test.index)

print(X_train.shape )
print(X_test.shape)
print(f'Class balance for train set:\n{y_train.value_counts()}\n')
print(f'Class balance for test set:\n{y_test.value_counts()}')

(1169,)
(501,)
Class balance for train set:
0    887
1    282
Name: Near Miss Event, dtype: int64

Class balance for test set:
0    358
1    143
Name: Near Miss Event, dtype: int64


In [147]:
def cosine_similarity(a, b):
    magnitude = lambda x : np.sqrt(np.sum(np.power(x,2)))
    return (np.dot(a, b)) / (magnitude(a)*magnitude(b))

def cosine_distance(a, b):
    return 1 - cosine_similarity(a, b)

def cosine_classify(a, b, threshold = 0.5):
    return int(cosine_distance(a,b) > threshold)

#decimals for rounding
decimals = 4

for threshold in (0.45, 0.475, 0.5, 0.525, 0.55):
    predictions = [cosine_classify(vectors, mean_vector.values, threshold=threshold) for vectors in X_test_vector.values]

    # Model Accuracy
    print(f'Threshold set to {threshold}: classify as 1 if cosine distance > {threshold}, else 0.')
    print(f"Accuracy: {round(metrics.accuracy_score(y_test, predictions),decimals)}, F1 Score: {round(metrics.f1_score(y_test, predictions),decimals)}")
    print(f"Precision: {round(metrics.precision_score(y_test, predictions),decimals)}, Recall: {round(metrics.recall_score(y_test, predictions),decimals)}")
    display(confusion_matrix(y_test, predictions))

Threshold set to 0.45: classify as 1 if cosine distance > 0.45, else 0.
Accuracy: 0.4192, F1 Score: 0.4305
Precision: 0.2989, Recall: 0.7692


array([[100, 258],
       [ 33, 110]])

Threshold set to 0.475: classify as 1 if cosine distance > 0.475, else 0.
Accuracy: 0.481, F1 Score: 0.3981
Precision: 0.2976, Recall: 0.6014


array([[155, 203],
       [ 57,  86]])

Threshold set to 0.5: classify as 1 if cosine distance > 0.5, else 0.
Accuracy: 0.5309, F1 Score: 0.3188
Precision: 0.2723, Recall: 0.3846


array([[211, 147],
       [ 88,  55]])

Threshold set to 0.525: classify as 1 if cosine distance > 0.525, else 0.
Accuracy: 0.6148, F1 Score: 0.2372
Precision: 0.2727, Recall: 0.2098


array([[278,  80],
       [113,  30]])

Threshold set to 0.55: classify as 1 if cosine distance > 0.55, else 0.
Accuracy: 0.6747, F1 Score: 0.1466
Precision: 0.2917, Recall: 0.0979


array([[324,  34],
       [129,  14]])