In [None]:
import pandas as pd
import tensorflow as tf
import random
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import numpy as np
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
!unzip slack_assist_data.zip

unzip:  cannot find or open slack_assist_data.zip, slack_assist_data.zip.zip or slack_assist_data.zip.ZIP.


# Parameters

In [None]:
# Tokenizer settings
num_words = 50_000      # vocabulary cap passed to the Tokenizer
oov_token = "<oov>"     # placeholder token for out-of-vocabulary words

# Train/test split
train_size = 0.8        # threshold applied to the random row mask for the training split

# Sequence padding (passed to pad_sequences)
maxlen = 20             # every title is padded/truncated to this many tokens
padding = "pre"         # pad at the front of short sequences
truncating = "pre"      # drop tokens from the front of long sequences

In [None]:
def text_prepare(text):
    """Preform tokenization simple preprocessing"""
    
    replace_by_space_re = re.compile('[/(){}\[\]\|@,;]')
    bad_symbols_re = re.compile('[^0-9a-z #+_]')
    stopwords_set = set(stopwords.words('english'))
    
    text = text.lower()
    text = replace_by_space_re.sub(' ', text)
    text = bad_symbols_re.sub('', text)
    text = ' '.join([x for x in text.split() if x and x not in stopwords_set])
    return str(text.strip())
def map_tag(tag):
    """Return the integer label stored for `tag` in the global `label_dict`.

    Raises:
        KeyError: if `tag` is not a key of `label_dict`.
    """
    label = label_dict[tag]
    return label

In [None]:
# Load the tab-separated posts file; the 'tag' and 'title' columns are used below.
tagged_post_df = pd.read_csv('/content/tagged_posts.tsv', sep='\t')
#Define label dictionary: each distinct tag gets an integer id in order of first appearance
label_dict = {tag: idx for idx, tag in enumerate(tagged_post_df['tag'].unique())}
# Vectorised dictionary lookup instead of a Python-level call per row.
tagged_post_df['label'] = tagged_post_df['tag'].map(label_dict)
# .copy() detaches the two-column selection from the loaded frame, so the
# assignment below does not raise pandas' SettingWithCopyWarning.
tagged_post_df = tagged_post_df[['title', 'label']].copy()
tagged_post_df['title'] = tagged_post_df['title'].apply(text_prepare)

In [None]:
# Build the vocabulary from every preprocessed title.
tokenizer = Tokenizer(
    num_words=num_words,
    oov_token=oov_token,
)
tokenizer.fit_on_texts(tagged_post_df['title'].tolist())

In [None]:
# Persist the fitted tokenizer to disk with the newest pickle protocol.
with open('tokenizer_tag.pickle', mode='wb') as handle:
    pickle.dump(
        tokenizer,
        file=handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
# Persist the tag -> integer label mapping to disk with the newest pickle protocol.
with open('label_dict.pickle', mode='wb') as handle:
    pickle.dump(
        label_dict,
        file=handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
# Read the tokenizer back from the pickle file and rebind `tokenizer` to it.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
with open('tokenizer_tag.pickle', mode='rb') as handle:
    tokenizer = pickle.load(file=handle)

In [None]:
#Define Train test data
# A fixed seed makes the random row split identical on every
# Restart & Run All; previously each run produced a different partition.
split_rng = np.random.default_rng(42)
# Rows whose uniform draw is <= train_size go to the training set, the rest to test.
msk = split_rng.random(len(tagged_post_df)) <= train_size
train_data = tagged_post_df[msk]
test_data = tagged_post_df[~msk]

# One-hot encode the integer labels with one column per known tag.
num_classes = len(label_dict)
train_sentence = train_data['title'].values
train_label = tf.keras.utils.to_categorical(train_data['label'].values, num_classes)
test_sentences = test_data['title'].values
test_label = tf.keras.utils.to_categorical(test_data['label'].values, num_classes)

In [None]:
#Create data to sequence
# Both splits share the same padding/truncation settings.
pad_kwargs = dict(maxlen=maxlen, padding=padding, truncating=truncating)

# Titles -> lists of integer token ids.
train_sequence = tokenizer.texts_to_sequences(train_sentence)
test_sequence = tokenizer.texts_to_sequences(test_sentences)

# Token-id lists -> arrays of length `maxlen` per title.
train_padded = pad_sequences(train_sequence, **pad_kwargs)
test_padded = pad_sequences(test_sequence, **pad_kwargs)

In [None]:
# The output layer needs one unit per tag. Deriving the width from label_dict
# keeps it in step with the one-hot labels instead of hard-coding 10.
num_classes = len(label_dict)

# Embedding -> average over the sequence -> dense hidden layer -> softmax over tags.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=num_words+1, output_dim=50, input_length=maxlen),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])
# Softmax output with one-hot targets is single-label multi-class
# classification, so the matching loss is categorical cross-entropy.
# Binary cross-entropy would score every class as an independent yes/no problem.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 10 epochs in batches of 256, validating on the test split;
# the returned History object is kept in `history`.
history = model.fit(
    x=train_padded,
    y=train_label,
    batch_size=256,
    epochs=10,
    validation_data=(test_padded, test_label),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Write the trained model to 'tag_classifier.h5' in the working directory.
model.save(filepath='tag_classifier.h5')