In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import spacy

# specify number of workers to run algorithm code
num_workers = 4 #8

In [39]:
# import data
df = pd.read_csv(
    '../../events/group_all_labelled.csv',
    usecols=['event_id','filename', 'group', 'sentence_text','event_text', 'Near Miss Event'])
df['label'] = df['Near Miss Event'].astype(int)

In [40]:
# spacy
nlp = spacy.load("en_core_web_lg")
        
# Creating our tokenizer function
# https://towardsdatascience.com/building-a-topic-modeling-pipeline-with-spacy-and-gensim-c5dc03ffc619
def lemmatizer(doc):
    # This takes in a doc of tokens from the NER and lemmatizes them. 
    # Pronouns (like "I" and "you" get lemmatized to '-PRON-', so I'm removing those.
    doc = [token.lemma_.lower().strip() for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)
    
def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

# create our language pipeline
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe(lemmatizer,name='lemmatizer')  # lemmatizer
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)  # remove stopwords + punctuation, return textt

In [41]:
df['tokens'] = list(nlp.pipe(df.event_text.values, batch_size=100))  # adjust

In [42]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

# bag of words vectorizer
bow_vec = CountVectorizer(
    min_df=3, # min document frequency count
    analyzer="word",
    ngram_range=(1,3))

# term frequency - inverse document frequency
tf_idf_trans = TfidfTransformer()

text_vectorizer = Pipeline([
    ('bow', bow_vec), # learns the vocabulary dictionary & returns a Document-Term matrix. [n_samples, n_features]
    ('tfidf', tf_idf_trans)
])

In [43]:
df['clean_text'] = df['tokens'].map(lambda tokens: " ".join(tokens))

In [44]:
tfidf_vector = text_vectorizer.fit_transform(df['clean_text'])

In [45]:
# Applying the Singular value decomposition
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, random_state=2018)
svd_tfidf = svd.fit_transform(tfidf_vector)

print("Dimensionality of LSA space: {}".format(svd_tfidf.shape))

Dimensionality of LSA space: (1671, 50)


In [77]:
import plotly.express as px
from sklearn.manifold import TSNE

for perplexity in (2, 5, 10, 20, 30, 50, 100):
    tsne_model = TSNE(n_jobs=num_workers,
                      perplexity = perplexity,
                      early_exaggeration=4, # Trying out exaggeration trick
                      n_components=2,
                      verbose=1,
                      random_state=2018,
                      n_iter=5000)

    tsne_tfidf = tsne_model.fit_transform(svd_tfidf)

    # Putting the tsne information into a dataframe
    projections = pd.DataFrame(data=tsne_tfidf, columns=["x", "y"])

    fig = px.scatter(projections, x='x', y='y', title = f't-SNE Projection in 2D - perplexity = {perplexity}',
                     height=600, width=600,
                     color = df.label.astype(str))
    fig.show()

[t-SNE] Computing 7 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.016s...
[t-SNE] Computed neighbors for 1671 samples in 0.066s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 17.831482
[t-SNE] KL divergence after 5000 iterations: 0.788101


[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.003s...
[t-SNE] Computed neighbors for 1671 samples in 0.158s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.088842
[t-SNE] KL divergence after 250 iterations with early exaggeration: 19.350224
[t-SNE] KL divergence after 4150 iterations: 1.181165


[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.004s...
[t-SNE] Computed neighbors for 1671 samples in 0.141s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.106869
[t-SNE] KL divergence after 250 iterations with early exaggeration: 20.579723
[t-SNE] KL divergence after 3900 iterations: 1.358330


[t-SNE] Computing 61 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.004s...
[t-SNE] Computed neighbors for 1671 samples in 0.148s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.121181
[t-SNE] KL divergence after 250 iterations with early exaggeration: 19.890015
[t-SNE] KL divergence after 3950 iterations: 1.434245


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.003s...
[t-SNE] Computed neighbors for 1671 samples in 0.129s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.128583
[t-SNE] KL divergence after 250 iterations with early exaggeration: 19.029316
[t-SNE] KL divergence after 4750 iterations: 1.425684


[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.004s...
[t-SNE] Computed neighbors for 1671 samples in 0.166s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.137430
[t-SNE] KL divergence after 250 iterations with early exaggeration: 17.678288
[t-SNE] KL divergence after 2650 iterations: 1.381206


[t-SNE] Computing 301 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.003s...
[t-SNE] Computed neighbors for 1671 samples in 0.167s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.149395
[t-SNE] KL divergence after 250 iterations with early exaggeration: 15.546962
[t-SNE] KL divergence after 2650 iterations: 1.226373


In [83]:
import plotly.express as px
from sklearn.manifold import TSNE

for perplexity in (2, 5, 10, 20, 30, 50, 100):
    tsne_model = TSNE(n_jobs=num_workers,
                      perplexity = perplexity,
                      early_exaggeration=4, # Trying out exaggeration trick
                      n_components=2,
                      verbose=1,
                      random_state=2018,
                      n_iter=5000,
                      metric='cosine')

    tsne_tfidf = tsne_model.fit_transform(svd_tfidf)

    # Putting the tsne information into a dataframe
    projections = pd.DataFrame(data=tsne_tfidf, columns=["x", "y"])

    fig = px.scatter(projections, x='x', y='y', title = f't-SNE Projection in 2D - perplexity = {perplexity}',
                     height=600, width=600,
                     color = df.label.astype(str))
    fig.show()

[t-SNE] Computing 7 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.001s...
[t-SNE] Computed neighbors for 1671 samples in 0.153s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 17.274017
[t-SNE] KL divergence after 5000 iterations: 0.674115


[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.006s...
[t-SNE] Computed neighbors for 1671 samples in 0.215s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.172850
[t-SNE] KL divergence after 250 iterations with early exaggeration: 17.531406
[t-SNE] KL divergence after 5000 iterations: 1.015456


[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.000s...
[t-SNE] Computed neighbors for 1671 samples in 0.196s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.220287
[t-SNE] KL divergence after 250 iterations with early exaggeration: 19.030724
[t-SNE] KL divergence after 5000 iterations: 1.210949


[t-SNE] Computing 61 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.002s...
[t-SNE] Computed neighbors for 1671 samples in 0.177s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.253616
[t-SNE] KL divergence after 250 iterations with early exaggeration: 19.469629
[t-SNE] KL divergence after 4100 iterations: 1.321000


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.000s...
[t-SNE] Computed neighbors for 1671 samples in 0.186s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.270238
[t-SNE] KL divergence after 250 iterations with early exaggeration: 19.041910
[t-SNE] KL divergence after 5000 iterations: 1.349729


[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.000s...
[t-SNE] Computed neighbors for 1671 samples in 0.232s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.289200
[t-SNE] KL divergence after 250 iterations with early exaggeration: 18.011284
[t-SNE] KL divergence after 4600 iterations: 1.352368


[t-SNE] Computing 301 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.000s...
[t-SNE] Computed neighbors for 1671 samples in 0.221s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.312517
[t-SNE] KL divergence after 250 iterations with early exaggeration: 16.173401
[t-SNE] KL divergence after 1800 iterations: 1.245304


In [84]:
# animation
# gapminder = px.data.gapminder()
# gapminder
# px.scatter(gapminder, x="gdpPercap", y="lifeExp",
#            animation_frame="year", animation_group="country",
#            size="pop", color="continent", hover_name="country",
#            log_x=True, size_max=55, range_x=[100,100000], range_y=[25,90])

Doc2Vec

In [86]:
df['clean_text']

0       follow completion hole logging core lack evapo...
1       mineral drillholes data 2 lithology summary ap...
2       suitable target area identify area apply explo...
3       gascoyne platform diamond shape area cover 86,...
4       bromine level halite high 330ppm suggest preci...
                              ...                        
1666    wadi prospect 100 metre south kingsway zone st...
1667    diamond drilling focusse test hole conductor i...
1668    significant assay receive hole falconbridge ag...
1669    data lodge doir airborne geophysical register ...
1670    project area extend approximately 50 kilometre...
Name: clean_text, Length: 1671, dtype: object

In [93]:
# train model
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from timer import Timer
from datetime import datetime

def get_doc2vec(model, df):
    vecs = []
    for idx in df.index:
        try:
            vec = model.docvecs[idx]
        except:
            vec = np.nan
        vecs.append(vec)
    return vecs

model_path = 'doc2vec.model'

# Storing the question texts in a list
event_texts = df.clean_text.tolist()

# Creating a list of terms and a list of labels to go with it
documents = [TaggedDocument(doc, tags=[str(i)]) for i, doc in enumerate(event_texts)]
print(f'Doc2Vec model training on {len(df)} reports starting at {datetime.now().time()}')

max_epochs = 100

with Timer():
    #Note: dm defines the training algorithm.
    # If dm=1 means ‘distributed memory’ (PV-DM) and dm =0 means ‘distributed bag of words’ (PV-DBOW).
    model = Doc2Vec(vector_size=50, alpha=0.025, min_alpha=0.00025, min_count=2, dm=1, epochs=max_epochs, workers=num_workers)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

Doc2Vec model training on 1671 reports starting at 16:21:59.394828
Elapsed time: 16.6347 seconds


In [94]:
model.save(model_path)

In [99]:
for perplexity in (2, 5, 10, 30, 50, 100):
    tsne_model = TSNE(n_jobs=num_workers,
                      perplexity=perplexity,
                      early_exaggeration=4, # Trying out exaggeration trick
                      n_components=2,
                      verbose=1,
                      random_state=2018,
                      n_iter=5000,
                      metric='cosine')

    tsne_d2v = tsne_model.fit_transform(model.docvecs.vectors_docs)

    # Putting the tsne information into a dataframe
    projectionsd2v = pd.DataFrame(data=tsne_d2v, columns=["x", "y"])

    fig = px.scatter(projectionsd2v, x='x', y='y',
                     title = f't-SNE Projection (D2V): perplexity = {perplexity}',
                     height=600, width=600,
                     color = df.label.astype(str))
    fig.show()

[t-SNE] Computing 7 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.000s...
[t-SNE] Computed neighbors for 1671 samples in 0.102s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.110087
[t-SNE] KL divergence after 250 iterations with early exaggeration: 18.829094
[t-SNE] KL divergence after 5000 iterations: 0.987215


[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.000s...
[t-SNE] Computed neighbors for 1671 samples in 0.161s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.169997
[t-SNE] KL divergence after 250 iterations with early exaggeration: 24.064127
[t-SNE] KL divergence after 5000 iterations: 1.688203


[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.000s...
[t-SNE] Computed neighbors for 1671 samples in 0.186s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.201703
[t-SNE] KL divergence after 250 iterations with early exaggeration: 23.375736
[t-SNE] KL divergence after 5000 iterations: 2.005023


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.000s...
[t-SNE] Computed neighbors for 1671 samples in 0.133s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.239946
[t-SNE] KL divergence after 250 iterations with early exaggeration: 20.408779
[t-SNE] KL divergence after 4600 iterations: 2.070801


[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.000s...
[t-SNE] Computed neighbors for 1671 samples in 0.250s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.255497
[t-SNE] KL divergence after 250 iterations with early exaggeration: 18.832308
[t-SNE] KL divergence after 4750 iterations: 1.967608


[t-SNE] Computing 301 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.000s...
[t-SNE] Computed neighbors for 1671 samples in 0.271s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.276600
[t-SNE] KL divergence after 50 iterations with early exaggeration: 16.355175
[t-SNE] KL divergence after 4300 iterations: 1.814262
