In [56]:
import warnings; warnings.filterwarnings('ignore')

import pandas as pd 
import json
from pathlib import Path

from collections import OrderedDict
import numpy as np

import re
from collections import defaultdict

import spacy
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

from yellowbrick.text import TSNEVisualizer, FreqDistVisualizer, dispersion

import matplotlib.pyplot as plt
%matplotlib inline 

import gensim 
from gensim.models import Word2Vec 
from gensim.models.phrases import Phrases, Phraser
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

BASE_PATH = Path('..')
events_path = BASE_PATH / 'events'
dictionary_path = BASE_PATH / 'dictionary'
data_path = BASE_PATH / 'data'
subset_reports_path = data_path / 'subset'
subset_reports_path_txt = data_path / 'subset_txt'
df_path = data_path / 'dataframes'
patterns_path = dictionary_path / 'patterns'
triggers_path = dictionary_path / 'trigger phrases'

In [2]:
nlp = spacy.load("en_core_web_lg")

In [142]:
GROUP = 0

In [134]:
filename = events_path / f'group_{GROUP}_labelled.csv'
df = pd.read_csv(filename)
df = df[df.columns[2:]]

In [135]:
df.head()

Unnamed: 0,event_id,filename,sentence_idx,sentence_text,n_trigger_words_in_sentence,trigger_words_in_sentence,n_trigger_words_in_event,trigger_words_in_event,event_text,STRAT,event_label,ROCK,LOCATION,MINERAL,ORE_DEPOSIT,TIMESCALE,reviewed,Near Miss Event,Key trigger phrase
0,a080918_e9_1443_annual_09_13904956_0,a080918_e9_1443_annual_09_13904956.json,0,following the completion of the hole and loggi...,1,potential,1,potential,following the completion of the hole and loggi...,dirk hartog formation,0,,,,,,True,False,
1,a080918_e9_1443_annual_09_13904956_15,a080918_e9_1443_annual_09_13904956.json,15,photos of core c: yaringa e9_1443_annual_09.do...,1,mineralisation,1,mineralisation,mineral drillholes data 2. lithology summary a...,,0,evaporite,"carnarvon basin, australia",,,,True,False,
2,a080918_e9_1443_annual_09_13904956_18,a080918_e9_1443_annual_09_13904956.json,18,introduction the company has identified the on...,1,mineralisation,1,mineralisation,several suitable target areas were identified ...,,0,sediments,"gascoyne, carnarvon basin",diamond,,,True,False,
3,a080918_e9_1443_annual_09_13904956_21,a080918_e9_1443_annual_09_13904956.json,21,parts of the adjacent coolcalalaya rift are al...,1,mineralisation,2,"possible, mineralisation",the gascoyne platform is a diamond shaped area...,,0,"evaporite, sediments","gascoyne, perth",diamond,,,True,False,
4,a080918_e9_1443_annual_09_13904956_34,a080918_e9_1443_annual_09_13904956.json,34,"a recent detailed analysis of drilling, seismi...",1,mineralisation,1,mineralisation,bromine levels in the halite are high (up to 3...,dirk hartog formation,0,evaporite,,"salts, sylvite, halite",,,True,True,precipitated


In [148]:
def get_tokens(doc):
    return [w.lemma_ for w in nlp(doc) if (w.is_alpha and not w.is_stop)]

def get_vectors(df, max_epochs = 100, vec_size = 20, alpha = 0.025, model_path="d2v.model"):
    tagged_data = [TaggedDocument(words=row.tokens, tags=[row.event_id]) for row in df.itertuples()]
    model = Doc2Vec(size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm =1)
    model.build_vocab(tagged_data)
    for epoch in range(max_epochs):
        model.train(tagged_data, total_examples=model.corpus_count, epochs=model.iter)
        model.alpha -= 0.0002
        model.min_alpha = model.alpha
        
    model.save(model_path)
    vecs = []

    for event_id in df['event_id']:
        try:
            vec = model.docvecs[event_id]
        except:
            vec = np.nan
        vecs.append(vec)
        
    return vecs


def to_list(x):
    if isinstance(x, str):
        return x.split(',')
    else:
        return ['unknown']

def add_features(full_df, labelled_df):
    for feature in ['STRAT', 'ROCK', 'LOCATION', 'MINERAL', 'ORE_DEPOSIT', 'TIMESCALE']:
        s = full_df[feature].apply(to_list)
        wide_df = pd.DataFrame(mlb.fit_transform(s), columns=mlb.classes_, index=df.index)
        wide_df = wide_df.add_prefix(f'{feature}_')
        labelled_df = labelled_df.merge(wide_df, how='left', left_index=True, right_index=True)
    return labelled_df

In [150]:
labelled_df = df.loc[df.reviewed][['event_id', 'event_text', 'Near Miss Event']]

# Event Text Vector
labelled_df['tokens'] = labelled_df['event_text'].apply(get_tokens)
labelled_df['event_text_vector'] = get_vectors(labelled_df)

# One hot encoded Feature
labelled_df = add_features(df, labelled_df)

# Label
labelled_df['label'] = labelled_df['Near Miss Event'].astype(int)

# Drop unused labels
labelled_df = labelled_df.drop(columns=['tokens', 'event_text', 'Near Miss Event'])

labelled_df.head()

Unnamed: 0,event_id,event_text_vector,STRAT_ ashburton formation,STRAT_ brockman iron formation,STRAT_ capricorn formation,STRAT_ cawse monzogranite,STRAT_ corboy formation,STRAT_ dales gorge member,STRAT_ duck creek dolomite,STRAT_ emull gabbro,...,TIMESCALE_archaean,TIMESCALE_cainozoic,TIMESCALE_devonian,TIMESCALE_mesoproterozoic,TIMESCALE_mesozoic,TIMESCALE_phanerozoic,TIMESCALE_proterozoic,TIMESCALE_tertiary,TIMESCALE_unknown,label
0,a080918_e9_1443_annual_09_13904956_0,"[-0.65880835, 2.0076575, -2.3191159, 3.762255,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,a080918_e9_1443_annual_09_13904956_15,"[-1.0317131, 2.780362, 2.1162007, 1.6282803, 0...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,a080918_e9_1443_annual_09_13904956_18,"[-3.6325414, 6.233181, -0.20545405, 4.883079, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,a080918_e9_1443_annual_09_13904956_21,"[-1.6911434, 4.5786643, -0.6014996, 5.4774303,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,a080918_e9_1443_annual_09_13904956_34,"[-0.5821011, 2.6584122, -0.9980737, -1.4596263...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [151]:
filename = events_path / f'group_{GROUP}_processed.csv'
labelled_df.to_csv(filename, index=False)

In [82]:
model = Doc2Vec.load("d2v.model")

In [84]:
sample_vec = model.docvecs['a080918_e9_1443_annual_09_13904956_0']
sample_vec

array([ 0.12339132, -0.6571223 , -1.3423402 ,  1.3109198 ,  0.14577109,
        5.620479  ,  1.7602464 ,  1.8104154 ,  4.597116  ,  0.56684893,
        1.4344531 ,  1.17409   , -1.538548  ,  0.9024879 , -1.8723099 ,
        4.1246476 ,  0.16465104,  2.1530197 , -3.43479   ,  4.1463523 ],
      dtype=float32)

In [85]:
len(sample_vec)

20

In [98]:
test_df.shape

(530, 2)