# Target Visualization - T-SNE and Doc2Vec
Source: https://www.kaggle.com/arthurtok/target-visualization-t-sne-and-doc2vec/notebook

This kernel will be an exploration into the target variable and how it is distributed accorss the structure of the training data to see if any potential information or patterns can be gleaned going forwards. Since classical treatment of text data normally comes with the challenges of high dimensionality (using terms frequencies or term frequency inverse document frequencies), the plan therefore in this kernel is to visually explore the target variable in some lower dimensional space using SVD and LSA(Latent Semantic Analysis) and Doc2Vec method. In these lower dimensional spaces, we can finally utilize the manifold learning method of the t-distributed stochastic neighbour embedding (tNSE) technique to further reduce the dimensionality for target variable visualisation. 

In [1]:

# Importing the relevant libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer

from string import punctuation

import re
from functools import reduce

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook, reset_output
from bokeh.palettes import d3
import bokeh.models as bmo
from bokeh.io import save, output_file

# init_notebook_mode(connected = True)
# color = sns.color_palette("Set2")
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

In [2]:
from pathlib import Path

BASE_PATH = Path('..')
events_path = BASE_PATH / 'events'
dictionary_path = BASE_PATH / 'dictionary'
data_path = BASE_PATH / 'data'
subset_reports_path = data_path / 'subset'
subset_reports_path_txt = data_path / 'subset_txt'
df_path = data_path / 'dataframes'
patterns_path = dictionary_path / 'patterns'
triggers_path = dictionary_path / 'trigger phrases'


#group_events_path = events_path / f'group_{GROUP}_events.csv'
#labelled_path = events_path / f'group_{GROUP}_labelled.csv'
#processed_path = events_path / f'group_{GROUP}_processed.csv'

In [3]:
#NLP packages 
import string
import spacy
from spacy import displacy

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# 1. Data preprocessing

In [4]:
pd.Series(['False']).astype(bool)

0    True
dtype: bool

In [6]:
groups = [0, 1, 2, 3, 4, 6] # lol

filenames = {group: events_path / f'group_{group}_labelled.csv' for group in groups}
print(filenames)

{0: WindowsPath('../events/group_0_labelled.csv'), 1: WindowsPath('../events/group_1_labelled.csv'), 2: WindowsPath('../events/group_2_labelled.csv'), 3: WindowsPath('../events/group_3_labelled.csv'), 4: WindowsPath('../events/group_4_labelled.csv'), 6: WindowsPath('../events/group_6_labelled.csv')}


In [7]:
df = pd.read_csv(group_6_labelled.csv)

NameError: name 'group_6_labelled' is not defined

In [8]:
#Read in labelled event data file from 6 groups

groups = [0, 1, 2, 3, 4, 6] # lol

filenames = {group: events_path / f'group_{group}_labelled.csv' for group in groups}

# instantiate empty list to store dfs on read
dfall = []
for group in groups:
    df = pd.read_csv(filenames[group])
    
    # data processing and cleaning on near miss event column
    df = df.loc[df['Near Miss Event'].notna(), ]
    
    # pd.Series(['False']) returns True as string are converted to bool on whether they are empty or not!
    df['Near Miss Event'] = df['Near Miss Event'].apply(lambda x : (x == 'True') | (x == True)).astype(bool)
    
    # need to read in dataframe to work out length of group column
    df.insert(2, 'group', np.repeat(group, len(df)))
    dfall.append(df)
    
# concat list of dfs as a single data frame containing all labelled events from 6 groups
dfall = pd.concat(dfall)

FileNotFoundError: [Errno 2] File ..\events\group_0_labelled.csv does not exist: '..\\events\\group_0_labelled.csv'

In [None]:
dfall.to_csv(events_path / f'group_all_labelled.csv', index=False)

In [None]:
df = dfall.loc[dfall.reviewed][['event_id','filename', 'group', 'sentence_text','event_text', 'Near Miss Event']]
# Target Label
df['Label'] = df['Near Miss Event'].astype(int)
print(df.shape)
df.head()


# NLP 

Apply standard NLP steps to process the event text from the input file, including:

* Removing stop words 

* Tokenization

* Lemmatization

In [None]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_lg')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [None]:
# Apply Spacy functions
df["tokenized_text"] = df["event_text"].apply(lambda x: spacy_tokenizer(x))

In [None]:
df.head()

# 2. T-SNE applied to Latent Semantic (LSA) space


To start off we look at the sparse representation of text documents via the Term frequency Inverse document frequency method. What this does is create a matrix representation that upweights locally prevalent but globally rare terms - therefore accounting for the occurence bias when using just term frequencies

In [None]:
# how much data comes from each subgroup?
df.group.value_counts()

In [None]:
tf_idf_vec = TfidfVectorizer(min_df=3,
                             max_features = 60_000, #100_000,
                             analyzer="word",
                             ngram_range=(1,3), # (1,6)
                             stop_words="english")

# fit and transform on all events
tf_idf = tf_idf_vec.fit_transform(list(df["tokenized_text"].map(lambda tokens: " ".join(tokens))))

# fit on all events, transform subset
# tf_idf_vec.fit(list(X["event_text"].map(lambda tokens: " ".join(tokens))))
# tf_idf = tf_idf_vec.transform(list(X.loc[X.group == GROUP,"event_text"].map(lambda tokens: " ".join(tokens))))

In [None]:
# Applying the Singular value decomposition
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, random_state=2018)
svd_tfidf = svd.fit_transform(tf_idf)
print("Dimensionality of LSA space: {}".format(svd_tfidf.shape))

In [None]:
# # Showing scatter plots 
# from mpl_toolkits.mplot3d import Axes3D
# fig = plt.figure(figsize=(16,12))

# # Plot models:
# ax = Axes3D(fig) 
# ax.scatter(svd_tfidf[:,0],
#            svd_tfidf[:,1],
#            svd_tfidf[:,2],
#            c=X.Label.values,
#            cmap=plt.cm.winter_r,
#            s=2,
#            edgecolor='none',
#            marker='o')
# plt.title("Semantic Tf-Idf-SVD reduced plot of Sincere-Insincere data distribution")
# plt.xlabel("First dimension")
# plt.ylabel("Second dimension")
# plt.legend()
# plt.xlim(0.0, 0.20)
# plt.ylim(-0.2,0.4)
# plt.show()

In [None]:
from sklearn.manifold import TSNE

# Importing multicore version of TSNE
#from MulticoreTSNE import MulticoreTSNE as TSNE

In [None]:
tsne_model = TSNE(n_jobs=4,
                  early_exaggeration=4, # Trying out exaggeration trick
                  n_components=2,
                  verbose=1,
                  random_state=2018,
                  n_iter=500)

tsne_tfidf = tsne_model.fit_transform(svd_tfidf)

# Putting the tsne information into a dataframe
tsne_tfidf_df = pd.DataFrame(data=tsne_tfidf, columns=["x", "y"])

# add X values to full df
for col in ['event_id','filename', 'sentence_text', 'event_text', 'tokenized_text', 'Label', 'group']:
    tsne_tfidf_df[col] = df[col].values

# add X values to subsetted df (i.e. for only one GROUP)
# for col in ['filename', 'sentence_text', 'event_text', 'Label', 'group']:
#     tsne_tfidf_df[col] = X[X.loc[X.group == GROUP, col].values

In [None]:
tsne_tfidf_df

In [None]:
output_notebook()

# colormap = np.array(["#6d8dca", "#d07d3c"])

# we need a list of length 7 becasue charlie labelled group 6 instead of 5 lol
colormap = np.array(["darkblue", "red", "purple", "green", "orange", "yellow", "yellow"])

# palette = d3["Category10"][len(tsne_tfidf_df["asset_name"].unique())]
source = ColumnDataSource(data = dict(x = tsne_tfidf_df["x"], 
                                      y = tsne_tfidf_df["y"],
                                      color = colormap[tsne_tfidf_df["Label"]],
                                      group = tsne_tfidf_df["group"],
                                      sentence_text = tsne_tfidf_df["sentence_text"],
                                      event_text = tsne_tfidf_df["event_text"],
                                      filename = tsne_tfidf_df["filename"],
                                      event_id = tsne_tfidf_df["event_id"],
                                      Label = tsne_tfidf_df["Label"]))
TOOLTIPS = [
    ("event_id","@event_id"),
    ("filename", "@filename"),
    ("event_text", "@event_text"),
    ("Label","@Label"), 
    ("group", "@group")
]


plot_tfidf = bp.figure(plot_width = 800, plot_height = 700, tooltips=TOOLTIPS,
                       title = "T-SNE applied to Tfidf_SVD space")

plot_tfidf.scatter(x = "x", 
                   y = "y", 
                   color="color",
                   legend = "Label",
                   source = source,
                   alpha = 0.7,
                   radius = 0.4)

show(plot_tfidf)

In [None]:
output_notebook()

# colormap = np.array(["#6d8dca", "#d07d3c"])

# we need a list of length 7 becasue charlie labelled group 6 instead of 5 lol
colormap = np.array(["darkblue", "red", "purple", "green", "orange", "yellow", "yellow"])

# palette = d3["Category10"][len(tsne_tfidf_df["asset_name"].unique())]
source = ColumnDataSource(data = dict(x = tsne_tfidf_df["x"], 
                                      y = tsne_tfidf_df["y"],
                                      color = colormap[tsne_tfidf_df["group"]],
                                      group = tsne_tfidf_df["group"],
                                      sentence_text = tsne_tfidf_df["sentence_text"],
                                      event_text = tsne_tfidf_df["event_text"],
                                      event_id = tsne_tfidf_df["event_id"],
                                      filename = tsne_tfidf_df["filename"],
                                      Label = tsne_tfidf_df["Label"]))
TOOLTIPS = [
     ("event_id","@event_id"),
    ("filename", "@filename"),
#    ("sentence_text", "@sentence_text"),  # show centre sentence of text chunk
    ("event_text", "@event_text"), # show full text chunk
    ("Label","@Label"),
    ("group", "@group")
]


plot_tfidf = bp.figure(plot_width = 800, plot_height = 700, tooltips=TOOLTIPS,
                       title = "T-SNE applied to Tfidf_SVD space")

plot_tfidf.scatter(x = "x", 
                   y = "y", 
                   color="color",
                   legend = "group",
                   source = source,
                   alpha = 0.7,
                   radius = 0.35)  # adjust scatter point size

show(plot_tfidf)

# 3. T-SNE applied on Doc2Vec embedding
Pushing forward with our T-SNE visual explorations, we next move away from semantic matrices into the realm of embeddings. Here we will use the Doc2Vec algorithm and much like its very well known counterpart Word2vec involves unsupervised learning of continuous representations for text. Unlike Word2vec which involves finding the representations for words (i.e. word embeddings), Doc2vec modifies the former method and extends it to sentences and even documents.

For this notebook, we will be using gensim's Doc2Vec class which inherits from the base Word2Vec class where style of usage and parameters are similar. The only differences lie in the naming terminology of the training method used which are the “distributed memory” or “distributed bag of words” methods.

According to the Gensim documentation, Doc2Vec requires the input to be an iterable object representing the sentences in the form of two lists, a list of the terms and a list of labels

In [None]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# Storing the question texts in a list
event_texts = list(df["event_text"])

# Creating a list of terms and a list of labels to go with it
documents = [TaggedDocument(doc, tags=[str(i)]) for i, doc in enumerate(event_texts)]

In [None]:
#Implement Doc2Vec
max_epochs = 100
alpha=0.025
model = Doc2Vec(documents,
                size=10, 
                min_alpha=0.00025,
                alpha=alpha,
                min_count=1,
#                 window=2, 
                workers=4)

In [None]:
# Creating and fitting the tsne model to the document embeddings
tsne_model = TSNE(n_jobs=4,
                  early_exaggeration=4,
                  n_components=2,
                  verbose=1,
                  random_state=2018,
                  n_iter=300)

# fit alll
#tsne_d2v = tsne_model.fit_transform(model.docvecs.vectors_docs)

tsne_d2v = tsne_model.fit_transform(model.docvecs.vectors_docs)

# Putting the tsne information into sq
tsne_d2v_df = pd.DataFrame(data=tsne_d2v, columns=["x", "y"])

# add X values to full df
for col in ['event_id','filename', 'sentence_text', 'event_text', 'tokenized_text', 'Label', 'group']:
    tsne_d2v_df[col] = df[col].values

In [None]:
output_notebook()

# colormap = np.array(["#6d8dca", "#d07d3c"])
colormap = np.array(["darkblue", "red", "purple", "green", "orange", "yellow", "yellow"])

# palette = d3["Category10"][len(tsne_tfidf_df["asset_name"].unique())]
source = ColumnDataSource(data = dict(x = tsne_d2v_df["x"], 
                                      y = tsne_d2v_df["y"],
                                      color = colormap[tsne_d2v_df["Label"]],
                                      group = tsne_d2v_df["group"],
                                      event_text = tsne_d2v_df["event_text"],
                                      sentence_text = tsne_d2v_df['sentence_text'],
                                      event_id = tsne_d2v_df["event_id"],
                                      filename = tsne_d2v_df["filename"],
                                      Label = tsne_d2v_df["Label"]))

TOOLTIPS = [
     ("event_id","@event_id"),
    ("filename", "@filename"),
    ("sentence_text", "@sentence_text"),
    ("event_text", "@event_text"),
    ("Label","@Label"),
    ("group", "@group")
]

plot_d2v = bp.figure(plot_width = 800, plot_height = 700, tooltips=TOOLTIPS,
                       title = "T-SNE applied to Doc2vec document embeddings")

plot_d2v.scatter(x = "x", 
                   y = "y", 
                   color="color",
                   legend = "Label",
                   source = source,
                   alpha = 0.7,
                   radius = 0.15)

show(plot_d2v)

Takeaways from the plot

The visual overlap between near miss and non near miss event are even greater in the Doc2Vec plots - so much so that there doesn't seem to be any obvious manner to segragate the labels via eye-balling if going down the route of document embeddings.