## Using vectorization to generate features

In [12]:
import pandas as pd
import spacy
import umap
import numpy as np

from io import BytesIO
from PIL import Image
import base64

from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import Spectral10, Category10
from pathlib import Path

import sys
# Add the parent directory to the module search path.
# Presumably this lets the local `ml_editor` package imported below resolve — confirm against the repo layout.
sys.path.append('..')
import warnings
# Suppress every warning for the rest of the session (this also hides library deprecation notices).
warnings.filterwarnings('ignore')

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from ml_editor.data_processing import format_raw_df, get_split_by_author, get_normalized_series
from ml_editor.data_processing import add_text_features_to_df
from ml_editor.data_visualization import plot_embeddings

In [5]:
# Read the raw CSV and run it through the project's formatting helper.
df = format_raw_df(pd.read_csv(Path('../data/writers.csv')).copy())

# Only question rows are handed to the train/test split helper.
train_author, test_author = get_split_by_author(df.loc[df['is_question']])

# From the training split, keep the question rows, their body text, and a
# boolean label that is True when an accepted answer id is present.
questions = train_author.loc[train_author['is_question']]
raw_text = questions['body_text']
sent_labels = ~questions['AcceptedAnswerId'].isna()

Load a spaCy model, disabling the pipeline components that are unnecessary for our task.

In [6]:
# Only the document vectors are used below, so the parser, NER and tagger
# pipeline components are switched off when loading the model.
nlp = spacy.load(
    'en_core_web_md',
    disable=['parser', 'ner', 'tagger'],
)

Get the vector for each of our questions.
By default, the vector returned for a text is the average of the vectors of all its tokens.

In [7]:
# `raw_text` already holds the body text of the training questions; compute one
# document vector per question and stack them row-wise into a single array.
spacy_emb = raw_text.apply(lambda question: nlp(question).vector)
embeddings = np.vstack(spacy_emb.tolist())

In [8]:
# UMAP is a stochastic algorithm: without a fixed seed the projection (and the
# plot built from it) changes on every run. Pin the seed so that a
# Restart & Run All reproduces the same layout.
umap_embedder = umap.UMAP(random_state=42)
umap_emb = umap_embedder.fit_transform(embeddings)

### Interactive plot
Now we use bokeh to explore embeddings interactively.

In [13]:
from bokeh.models import ColumnDataSource

In [16]:
# Configure bokeh to render its output inline in the notebook when show() is called.
output_notebook()

def get_interactive_umap_embeddings_plot(umap_vectors, labels, text, legends, tooltip_label=None):
    """
    Build an interactive bokeh scatter plot of 2D-projected embeddings.

    :param umap_vectors: array-like of 2D points, one row per example; loaded into
        the 'x' and 'y' columns of the plotted data
    :param labels: iterable with one label per example; each label is converted with
        str() and colored through a mapper whose factors are 'True' and 'False'
    :param text: iterable with one text per example, shown in the hover tooltip
    :param legends: iterable with one value per example; truthy values are listed as
        'Answered' in the legend, falsy ones as 'Unanswered'
    :param tooltip_label: optional iterable with one value per example, shown as
        'got_answer' in the hover tooltip; when None (or empty), `labels` is used
    :return: the bokeh figure; it is not displayed here, call show() on it
    """
    # Test for None/empty explicitly. `not tooltip_label` raises ValueError for a
    # multi-element pandas Series or numpy array (ambiguous truth value), which made
    # it impossible to pass custom tooltip labels in those common container types.
    no_tooltip_given = tooltip_label is None or (
        hasattr(tooltip_label, '__len__') and len(tooltip_label) == 0
    )
    if no_tooltip_given:
        print('Using standard label')
        tooltip_label = labels

    plot_df = pd.DataFrame(umap_vectors, columns=('x', 'y'))
    print(len(plot_df))
    # Labels are stringified so they can match the categorical color mapper's factors.
    plot_df['label'] = [str(label) for label in labels]
    plot_df['tooltip_label'] = [str(label) for label in tooltip_label]
    plot_df['text'] = list(text)
    plot_df['legends'] = ['Answered' if answered else 'Unanswered' for answered in legends]
    datasource = ColumnDataSource(plot_df)

    color_mapping = CategoricalColorMapper(factors=['True', 'False'], palette=['#1f77b4', '#ff7f0e'])

    TOOLTIPS = [
        ('text', '@text'),
        ('got_answer', '@tooltip_label')
    ]
    hover = HoverTool(tooltips=TOOLTIPS)
    hover.attachment = 'right'

    # NOTE(review): newer bokeh releases replace `plot_width`/`plot_height` with
    # `width`/`height` and `legend=` with `legend_field=` — confirm against the
    # installed bokeh version before upgrading.
    plot_figure = figure(title='UMAP projection of questions',
                         plot_width=600,
                         plot_height=400,
                         tools=('pan', 'wheel_zoom', 'reset', 'box_zoom', 'undo'))
    plot_figure.add_tools(hover)

    plot_figure.circle('x',
                       'y',
                       source=datasource,
                       color=dict(field='label', transform=color_mapping),
                       legend='legends',
                       line_alpha=0,
                       fill_alpha=0.4,
                       size=5
                       )
    return plot_figure

In [17]:
# The accepted-answer flag drives the point color, the legend entry and, since no
# separate tooltip labels are passed, the hover tooltip as well.
plot_figure = get_interactive_umap_embeddings_plot(
    umap_vectors=umap_emb,
    labels=sent_labels,
    text=raw_text,
    legends=sent_labels,
)
show(plot_figure)

Using standard label
5495
