In [45]:
import pathlib 
import numpy as np
from random import shuffle
import ipywidgets as widgets
import pandas as pd
import re
import altair as alt


from whatlies.language import CountVectorLanguage, BytePairLanguage, SentenceTFMLanguage, LaBSELanguage
from whatlies.transformers import Pca, Umap
from whatlies import EmbeddingSet

from hulearn.preprocessing import InteractivePreprocessor
from hulearn.experimental.interactive import InteractiveCharts

In [46]:
# lang_cv  = CountVectorLanguage(10)
# lang_bp  = BytePairLanguage("he", dim=300, vs=200_000)
# lang_LaBSEL = LaBSELanguage()

In [47]:
def description_preprocessing(description):
    """Normalise a free-text description.

    Steps, in order:
      1. lower-case the text;
      2. replace a fixed set of punctuation marks (including the Hebrew
         gershayim ``״`` and geresh ``׳``) with a space;
      3. delete straight double and single quotes;
      4. delete every character for which ``str.isdigit()`` is true;
      5. collapse every run of spaces into one space and trim the ends.

    Parameters
    ----------
    description : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text; it never contains two consecutive spaces and has
        no leading or trailing whitespace.
    """
    description = description.lower()
    description = re.sub(r"[״׳`!*~.()/,?+-]", " ", description)
    description = re.sub(r"[\"']", "", description)
    description = ''.join(ch for ch in description if not ch.isdigit())
    # Collapse once, after all removals: a fixed number of chained
    # str.replace('  ', ' ') passes leaves double spaces behind when the
    # run is long (e.g. 16 spaces used to end up as 2).
    description = re.sub(r" {2,}", " ", description)
    return description.strip()

In [48]:
# Read explicitly as UTF-8: the file holds Hebrew and Russian text, and
# read_text() would otherwise fall back to the platform's locale encoding.
txt = pathlib.Path("nlu-multi.md").read_text(encoding="utf-8")
# Keep non-empty lines that are not markdown headers. Only the first "- "
# (the list bullet) is removed, so a dash inside a sentence is preserved.
# The set drops duplicate texts.
texts = list({t.replace("- ", "", 1) for t in txt.split("\n") if len(t) > 0 and t[0] != "#"})
print(f"We're going to plot {len(texts)} texts.")

shuffle(texts)

We're going to plot 648 texts.


In [49]:
# Display the loaded texts (their order is random after the shuffle).
texts

['את חמודה חחחח',
 'you are so handsome',
 'לא מעוניין בשיחה הזאת',
 'а oткуда вы',
 'בבקשה תהיה חבר שלי',
 "you're looking good today",
 'שלום לך עוזר',
 'מה תאריך הלידה שלך',
 'your town',
 'Ну и спасибо тебе',
 'בוקר מקסים',
 'אני רוצה שאני ואתה נהיה חברים',
 'Привет',
 'thank you my friend',
 'you look great today',
 'אני לא מעוניין לדבר איתך',
 'Сколько тебе лет',
 'רק בדיקה',
 'מה שם משפחה שלך',
 'תספרי לי מה השם שלך',
 'שלום שלום',
 'ניסיון',
 "You know today it's my bday ",
 'אני מדוכאת',
 'לא מעוניין לדבר',
 'המון המון תודה',
 "let's not talk",
 'את נראית מעולה',
 'hey',
 'you are looking so beautiful',
 'אני חברה שלך',
 'שכחתי את השם שלך',
 'hey buddy',
 'באיזה רחוב אתה גר',
 'את ממש מקסימה',
 'your age',
 'אני מתעייף',
 'שלום וברכה',
 'I am really tired',
 'oka',
 'מאיזה מקופ אתה בארץ',
 "I'm not in a mood to talk",
 'אני מעוניין לסיים את השיחה',
 'מאיזה מקום אתה',
 'you are my best friend',
 'בסדר תודה',
 'You look aged',
 'hey there',
 'are we best friends',
 'you are look

In [50]:
%%time

def make_plot(lang):
    """Return an interactive 2-D chart of the notebook's `texts` for one backend.

    The texts are looked up in `lang`, reduced with ``Umap(2)`` and plotted
    without annotations. The chart is 500x500 and titled with the class
    name of `lang`.
    """
    embeddings = lang[texts]
    projected = embeddings.transform(Umap(2))
    chart = projected.plot_interactive(annot=False)
    return chart.properties(
        width=500,
        height=500,
        title=type(lang).__name__,
    )

# make_plot(lang_cv) | make_plot(lang_bp) | make_plot(lang_use)

CPU times: user 8 µs, sys: 7 µs, total: 15 µs
Wall time: 18.8 µs


In [51]:
# Sentence-transformer language backend for the
# 'xlm-r-100langs-bert-base-nli-mean-tokens' model.
lang_xlm  = SentenceTFMLanguage('xlm-r-100langs-bert-base-nli-mean-tokens')

In [52]:
# 2-D UMAP chart of `texts` as embedded by lang_xlm.
make_plot(lang_xlm)

In [53]:
# Second sentence-transformer backend, this time for the 'LaBSE' model.
# This is the backend the labelling cells below build their frame from.
lang_STF  = SentenceTFMLanguage('LaBSE')

In [54]:
# Left disabled; presumably meant to lift Altair's row limit when plotting
# larger datasets (TODO confirm it is still needed).
# alt.data_transformers.disable_max_rows()
# 2-D UMAP chart of `texts` as embedded by lang_STF.
make_plot(lang_STF)

In [None]:
# Build the labelling frame: one row per text with its 2-D UMAP coordinates
# and an initially empty label.
df = (
    lang_STF[texts]
    .transform(Umap(2))
    .to_dataframe()
    .reset_index()
)
df.columns = ['text', 'd1', 'd2']
df = df.assign(label='')
len(df)

In [None]:
# Global state shared by the widget callbacks:
#   'df'    - working copy of the frame, including assigned labels
#   'chart' - interactive chart over the rows that have no label yet
state = {
    'df': df.copy(),
    'chart': InteractiveCharts(df.loc[df['label'] == ''], labels=['group']),
}

In [None]:
# pd.set_option('display.max_colwidth', -1)

def show_draw_chart(b=None):
    """Clear the chart and label areas and draw a fresh chart of unlabelled rows.

    `b` is the argument ipywidgets passes to click handlers; it is not used.
    """
    chart = state['chart']
    with out_chart:
        for area in (out_chart, out_labels):
            area.clear_output()
        frame = state['df']
        # Only rows with an empty label are offered for drawing.
        chart.dataf = frame.loc[frame['label'] == '']
        chart.charts = []
        chart.add_chart(x='d1', y='d2', legend=False)

def show_examples(b=None):
    """Show the `text` of every row whose 'group' value from the drawing is non-zero.

    `b` is the argument ipywidgets passes to click handlers; it is not used.
    """
    with out_table:
        out_table.clear_output()
        drawn = state['chart'].data()
        preprocessor = InteractivePreprocessor(json_desc=drawn)
        grouped = preprocessor.pandas_pipe(state['df'])
        selected = grouped.loc[grouped['group'] != 0]
        # Alternative kept from the original: show a random sample instead.
        # display(selected.sample(min(15, selected.shape[0]))[['text']], )
        display(selected[['text']])

def assign_label(b=None):
    """Write the text-box value as the label of the selected rows and refresh the counter.

    Selected rows are those whose 'group' value from the current drawing is
    non-zero. `b` is the argument ipywidgets passes to click handlers; it is
    not used.
    """
    preprocessor = InteractivePreprocessor(json_desc=state['chart'].data())
    grouped = preprocessor.pandas_pipe(state['df'])
    selected_idx = grouped.loc[grouped['group'] != 0].index
    # Column position 3 is 'label' (columns are text, d1, d2, label).
    state['df'].iloc[selected_idx, 3] = label_name.value
    with out_counter:
        out_counter.clear_output()
        frame = state['df']
        labelled_count = len(frame.loc[frame['label'] != ''])
        print(f"{labelled_count}/{frame.shape[0]} labelled")

def retrain_state(b=None):
    """Re-project the still-unlabelled texts with a fresh UMAP and redraw the chart.

    Labelled rows keep their label and their old coordinates; unlabelled
    rows get new 2-D coordinates. The combined frame replaces
    ``state['df']``.

    `b` is the argument ipywidgets passes to click handlers; it is only
    forwarded to `show_draw_chart`.
    """
    frame = state['df']
    labelled = frame.loc[frame['label'] != '']
    keep = list(frame.loc[frame['label'] == '', 'text'])
    if not keep:
        # Everything is labelled: there is nothing to embed, just refresh.
        show_draw_chart(b)
        return
    # This function used to filter an `embset` variable that is not defined
    # anywhere in the visible notebook (NameError on click). Embed the
    # remaining texts with lang_STF instead, the same backend the initial
    # frame was built from. Note this recomputes their embeddings.
    new_df = lang_STF[keep].transform(Umap(2)).to_dataframe().reset_index()
    new_df.columns = ['text', 'd1', 'd2']
    new_df['label'] = ''
    # ignore_index keeps the row index unique and equal to row position;
    # a plain concat would carry over both frames' overlapping indexes.
    state['df'] = pd.concat([new_df, labelled], ignore_index=True)
    show_draw_chart(b)
    
def choose_label(b=None):
    """Display a one-column table of the distinct labels assigned so far.

    `b` is the argument ipywidgets passes to click handlers; it is not used.
    """
    with out_labels:
        out_labels.clear_output()
        frame = state['df']
        assigned = frame.loc[frame['label'] != '', 'label']
        distinct = list(set(assigned.to_list()))
        display(pd.DataFrame({'Labels': distinct}))
    
# Output areas the callbacks render into.
out_table, out_labels, out_chart, out_counter = (widgets.Output() for _ in range(4))

# Free-text box holding the label that "Add label" will assign.
label_name = widgets.Text("label name")

# Buttons.
btn_examples = widgets.Button(description='Show Examples', icon='eye')
btn_label = widgets.Button(description='Add label', icon='check')
btn_retrain = widgets.Button(description='Retrain', icon='coffee')
btn_redraw = widgets.Button(description='Redraw', icon='check')
btn_choose_label = widgets.Button(description='Existed Labels', icon='')

# Wire each button to its callback.
btn_examples.on_click(show_examples)
btn_choose_label.on_click(choose_label)
btn_label.on_click(assign_label)
btn_redraw.on_click(show_draw_chart)
btn_retrain.on_click(retrain_state)

# Draw the initial chart, then lay out the whole UI.
show_draw_chart()
display(
    widgets.VBox([
        widgets.HBox([btn_retrain, btn_examples, btn_redraw]),
        widgets.HBox([out_chart, out_table]),
    ]),
    label_name,
    widgets.HBox([btn_label, out_counter]),
    out_labels,
    widgets.HBox([btn_choose_label]),
)

In [23]:
# Inspect the working frame, including any labels assigned through the UI.
state['df']

Unnamed: 0,text,d1,d2,label
0,אתה מאוד מושך,3.150455,7.217732,
1,בוקר בוקר,1.736014,-1.990218,
2,עושה ניסוי,6.045632,7.717301,
3,היי,0.395877,0.035965,
4,אין לי כוח אני עייף,2.111082,9.824638,עייף
...,...,...,...,...
387,תעבירו אותי לנציג,8.574843,12.701341,
388,כי את נראית כמו פרח,4.135009,5.863996,
389,אני פשוט גמורה,2.578921,9.713188,עייף
390,באיזו ארץ אתה גר,-1.485256,7.666509,


In [68]:
# Export only the rows that carry a label.
state['df'].loc[lambda d: d['label'] != ''].to_csv('labeled_data.csv')

In [72]:
import pandas as pd


def create_rasa_files(path, create_files_path):
    """
    Convert a labelled CSV file into a markdown NLU file.

    The CSV must contain a ``text`` and a ``label`` column. For every
    distinct label one section is written::

        ## intent: <label>
        - <text>
        - <text>
        <blank line>

    Sections are written in sorted label order so repeated runs produce the
    same file. Rows whose label is missing (NaN) are skipped.

    path: path where the CSV file is present
    create_files_path: path where the nlu.md file needs to be created
    """
    df = pd.read_csv(path)
    # NaN never compares equal to itself, so a NaN label would only yield an
    # empty "nan" section; drop it up front. key=str keeps sorting safe if
    # the column holds mixed types.
    intents = sorted(df['label'].dropna().unique(), key=str)
    # Explicit UTF-8 so non-ASCII texts are written correctly regardless of
    # the platform default; `with` closes the file even if a write fails.
    with open(create_files_path, "w", encoding="utf-8") as file:
        for item in intents:
            file.write("## intent: {intent_name}\n".format(intent_name=item))
            for sent in df.loc[df['label'] == item, 'text']:
                file.write("- {}\n".format(sent))
            file.write("\n")

# Convert the exported labelled rows into the nlu.md file.
create_rasa_files('labeled_data.csv', 'nlu.md')


In [71]:
# Same conversion call as at the end of the previous cell.
create_rasa_files('labeled_data.csv', 'nlu.md')