In [51]:
import pandas as pd
import tensorflow as tf
import random
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import numpy as np
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
!unzip slack_assist_data.zip

Archive:  slack_assist_data.zip
   creating: slack_assist_data/
  inflating: slack_assist_data/dialogues.tsv  
  inflating: slack_assist_data/tagged_posts.tsv  


# Parameters

In [23]:
# --- Reproducibility ---------------------------------------------------------
# Seed Python's and NumPy's global RNGs so the row shuffle and the random
# train/test mask computed further down are repeatable after a kernel restart.
# TensorFlow's own RNG (weight initialisation, dropout) is not seeded here.
seed = 42
random.seed(seed)
np.random.seed(seed)

# --- Tokenizer ---------------------------------------------------------------
num_words = 250000   # forwarded to Tokenizer(num_words=...)
oov_token = '<oov>'  # forwarded to Tokenizer(oov_token=...)

# --- Train/test split --------------------------------------------------------
# A row goes to the training set when its uniform [0, 1) draw is <= train_size,
# so this is the expected (not exact) training fraction.
train_size = 0.8

# --- Sequence padding (forwarded to pad_sequences) ---------------------------
maxlen = 20
padding = 'pre'
truncating = 'pre'

In [24]:
# Compiled once when the cell runs instead of on every call. Raw strings keep
# the regex backslashes from being read as (invalid) string escape sequences.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return the NLTK English stopword set, loading it on first use only."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE


def text_prepare(text):
    """Perform simple text normalisation and whitespace tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. replace each of ``/ ( ) { } [ ] | @ , ;`` with a space;
      3. delete every character other than ``0-9``, ``a-z``, space, ``#``,
         ``+`` and ``_``;
      4. split on whitespace and drop NLTK English stopwords.

    Args:
        text: The raw string to clean. Any non-string value (for example
            ``None`` or the float NaN pandas uses for an empty cell) is
            treated as empty text.

    Returns:
        str: The surviving tokens joined by single spaces; ``''`` when
        nothing survives.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub('', text)

    stopwords_set = _english_stopwords()
    # str.split() with no argument never yields empty tokens, and joining with
    # single spaces leaves no leading/trailing whitespace to strip.
    return ' '.join(tok for tok in text.split() if tok not in stopwords_set)

In [25]:
# Build one labelled corpus:
#   label 0 -> the `text` column of dialogues.tsv
#   label 1 -> the `title` column of tagged_posts.tsv (renamed to `text`)
# Paths are relative to the working directory, which is where `!unzip`
# extracts the archive, so the notebook is not tied to Colab's /content.
dialouges_df = pd.read_csv('slack_assist_data/dialogues.tsv', sep='\t')
dialouges_df['label'] = 0
dialouges_df = dialouges_df[['text', 'label']]

tagged_post_df = pd.read_csv('slack_assist_data/tagged_posts.tsv', sep='\t')
tagged_post_df['label'] = 1
tagged_post_df = tagged_post_df[['title', 'label']]
tagged_post_df = tagged_post_df.rename(columns={'title': 'text'})

# Stack both sources, shuffle every row, and renumber the index from 0.
all_df = (
    pd.concat([dialouges_df, tagged_post_df])
    .sample(frac=1)
    .reset_index(drop=True)
)

# Normalise the text (lower-casing, symbol and stopword removal).
all_df['text'] = all_df['text'].apply(text_prepare)

In [26]:
# Learn the vocabulary from every cleaned text in all_df.
# NOTE(review): this runs before the train/test split, so held-out rows also
# contribute to the vocabulary.
corpus = all_df['text'].tolist()
tokenizer = Tokenizer(oov_token=oov_token, num_words=num_words)
tokenizer.fit_on_texts(corpus)

In [28]:
# Persist the fitted tokenizer so the same word index can be reloaded later.
with open('tokenizer.pickle', mode='wb') as handle:
    handle.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))

In [29]:
# Read the tokenizer back from disk. Unpickling executes code embedded in the
# file, so only load pickles this notebook (or another trusted source) wrote.
with open('tokenizer.pickle', mode='rb') as handle:
    tokenizer = pickle.loads(handle.read())

In [30]:
# Random row-wise train/test split: draw one uniform number per row and send
# the row to the training set when the draw is <= train_size.
msk = np.random.rand(len(all_df)) <= train_size
train_data, test_data = all_df[msk], all_df[~msk]

# Pull texts and labels out as NumPy arrays for tokenising / fitting.
train_sentence, train_label = train_data['text'].values, train_data['label'].values
test_sentences, test_label = test_data['text'].values, test_data['label'].values

In [31]:
# Turn the cleaned texts into fixed-length integer sequences.
def _encode(sentences):
    """Map texts to token-id sequences, then pad/truncate them to `maxlen`.

    Returns a `(sequences, padded)` pair: the raw id lists from the tokenizer
    and the array produced by `pad_sequences`.
    """
    sequences = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(
        sequences,
        maxlen=maxlen,
        padding=padding,
        truncating=truncating,
    )
    return sequences, padded


train_sequence, train_padded = _encode(train_sentence)
test_sequence, test_padded = _encode(test_sentences)

In [48]:
# Feed-forward binary classifier over the padded token-id vectors.
# The input width is tied to `maxlen` (the pad_sequences length) so changing
# that parameter cannot leave the model expecting a stale shape.
# NOTE(review): the Dense layers receive raw token ids as numeric features;
# an Embedding layer in front is the usual choice for this kind of input.
# NOTE(review): weights come from TensorFlow's RNG; call tf.random.set_seed(...)
# before this cell if repeatable training runs are needed.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(maxlen,)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    # Single sigmoid unit: output is read as the probability of label 1.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [49]:
# Train for 10 epochs in batches of 64; the held-out split is passed as
# validation data so its loss/accuracy is reported after every epoch.
model.fit(
    x=train_padded,
    y=train_label,
    batch_size=64,
    epochs=10,
    validation_data=(test_padded, test_label),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f72f6031ad0>

In [50]:
# F1 score on the held-out rows (the same rows used as validation data
# during fitting).
# Sequential.predict_classes is deprecated and absent from newer TensorFlow
# releases; for a single sigmoid output the documented equivalent is to
# threshold the predicted probability at 0.5.
prediction = (model.predict(test_padded) > 0.5).astype('int32')
f1_score(test_label, prediction)



0.9531518887391676

In [52]:
# Write the trained classifier to disk (the .h5 extension selects HDF5).
model.save(filepath='intent_classifier.h5')