In [20]:
import pandas as pd
import json
import re
import unicodedata
import tensorflow as tf
import numpy as np
from model_v2 import LanguageModel
from tqdm import tqdm_notebook
from utils import clean_text
from sklearn.model_selection import train_test_split

In [2]:
# Load the word- and character-level vocabularies (token -> integer id).
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('102/word2idx.json', 'r', encoding='utf-8') as inp:
    word2idx = json.load(inp)
with open('102/char2idx.json', 'r', encoding='utf-8') as inp:
    char2idx = json.load(inp)

# Reverse lookups (integer id -> token).
idx2word = {idx: word for word, idx in word2idx.items()}
idx2char = {idx: char for char, idx in char2idx.items()}

In [3]:
# fashion = pd.read_csv('fashion.csv')
# music = pd.read_csv('music.csv')
# travel = pd.read_csv('travel.csv')
# technology = pd.read_csv('technology.csv')
# lifestyle = pd.read_csv('lifestyle.csv')

In [4]:
# fashion['class'] = 'fashion'
# music['class'] = 'music'
# travel['class'] = 'travel'
# technology['class'] = 'technology'
# lifestyle['class'] = 'lifestyle'

In [5]:
# data = pd.concat([fashion, music, travel, technology, lifestyle], axis=0)

In [3]:
# --- TensorFlow session (allow_growth enabled on the GPU options) ---
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

# --- Model hyper-parameters stored next to the checkpoint ---
with open('102/checkpoints/model_configs.json', 'r') as inp:
    params = json.load(inp)

# --- Build the language model in inference / encoding mode ---
model = LanguageModel(**params, is_training=False, is_encoding=True)
model.build_model()

# --- Restore weights ---
# Variables registered in the 'LSTM_SAVED_STATE' collection are left out of
# the Saver, so they keep the values given by the initializer below instead
# of being read from the checkpoint.
lstm_saved_state = tf.get_collection('LSTM_SAVED_STATE')
restorable_vars = [var for var in tf.global_variables() if var not in lstm_saved_state]
saver = tf.train.Saver(restorable_vars)
sess.run(tf.global_variables_initializer())
saver.restore(sess, '102/checkpoints/test/model.cpkt-209107')

INFO:tensorflow:Restoring parameters from 102/checkpoints/test/model.cpkt-209107


In [4]:
# Sentence embedding tensor: the last entry of each of the model's
# `encode_outputs`, `layerwise_avg` and `layerwise_max` lists, concatenated
# along the final axis.
sent_emb = tf.concat(
    [
        model.encode_outputs[-1],
        model.layerwise_avg[-1],
        model.layerwise_max[-1],
    ],
    axis=-1,
)

In [5]:
def pad_sequence(words):
    """Right-pad variable-length id sequences with zeros into one dense array.

    Args:
        words: sequence of sequences of numeric ids, one inner sequence per
            word.

    Returns:
        A float64 ``numpy`` array of shape ``(len(words), 1, maxlen)`` where
        ``maxlen`` is the length of the longest inner sequence. Entry
        ``[i, 0, :len(words[i])]`` holds ``words[i]``; the rest is zero.
        An empty ``words`` yields an array of shape ``(0, 1, 0)`` instead of
        raising ``ValueError`` from ``max()``.
    """
    # `default=0` keeps max() from raising on an empty input.
    maxlen = max((len(word) for word in words), default=0)
    arr = np.zeros(shape=(len(words), 1, maxlen))
    # Each `row` is a view into `arr`, so the assignment writes in place.
    for row, word in zip(arr, words):
        row[0, :len(word)] = word
    return arr

def __embed_sequence(sentence):
    """Run the encoder on a tokenised sentence and return its embedding.

    Args:
        sentence: list of word tokens.

    Returns:
        The value of ``sent_emb`` evaluated by ``sess.run`` for this single
        sentence.
    """
    # Out-of-vocabulary words are spelled out as the characters of the
    # literal string '<UNK>'.
    # NOTE(review): confirm this per-character expansion is intended rather
    # than a single '<UNK>' entry in char2idx.
    unk_char_ids = [char2idx[ch] for ch in '<UNK>']

    char_ids = []
    for token in sentence:
        if token in word2idx:
            char_ids.append([char2idx[ch] for ch in token])
        else:
            char_ids.append(unk_char_ids)

    feed = {
        model.inputs: pad_sequence(char_ids),
        model.seq_lens: [len(char_ids)],
        model.reset_state: True,
    }
    return sess.run(sent_emb, feed_dict=feed)
# A URL must start with an explicit scheme or "www.". The previous pattern
# made the scheme optional and allowed spaces plus the `$-_` range (which
# spans digits and uppercase letters) in its character class, so it matched
# and erased any run of ASCII letters, digits or spaces, not just URLs.
_URL_RE = re.compile(r'(?:https?://|www\.)\S+', re.IGNORECASE)
# A hashtag is '#' followed by any run of non-whitespace characters.
_HASHTAG_RE = re.compile(r'#\S*')


def embed_sentence(sentence):
    """Strip URLs and hashtags from raw text, tokenise it, and embed it.

    Args:
        sentence: raw input string.

    Returns:
        Whatever ``__embed_sequence`` returns for the cleaned token list.
    """
    sentence = _URL_RE.sub(' ', sentence)
    sentence = _HASHTAG_RE.sub(' ', sentence)
    # clean_text is asked to add begin/end-of-sentence markers; the cleaned
    # string is then split on whitespace.
    tokens = clean_text(sentence, add_bos=True, add_eos=True).split()
    return __embed_sequence(tokens)

In [11]:
# Keep only the label/text columns and drop rows missing either one.
data = pd.read_csv('intent.csv')[['label', 'text']].dropna()

In [12]:
# Binary flag: 1 when the label string contains the letter 'P', else 0.
data['price'] = data['label'].str.contains('P', regex=False).astype('int64')
data['price'].value_counts()

1    3084
0    2997
Name: price, dtype: int64

In [13]:
# Binary flag: 1 when the label string contains the letter 'W', else 0.
data['where'] = data['label'].str.contains('W', regex=False).astype('int64')
data['where'].value_counts()

0    5070
1    1011
Name: where, dtype: int64

In [14]:
# Binary flag: 1 when the label string contains the letter 'T', else 0.
data['time'] = data['label'].str.contains('T', regex=False).astype('int64')
data['time'].value_counts()

0    6073
1       8
Name: time, dtype: int64

In [15]:
# Binary flag: 1 when the label string contains the letter 'A', else 0.
data['availability'] = data['label'].str.contains('A', regex=False).astype('int64')
data['availability'].value_counts()

0    4831
1    1250
Name: availability, dtype: int64

In [16]:
# Binary flag: 1 when the label string contains the letter 'I', else 0.
data['interest'] = data['label'].str.contains('I', regex=False).astype('int64')
data['interest'].value_counts()

0    5355
1     726
Name: interest, dtype: int64

In [17]:
# Binary flag: 1 when the label string contains the letter 'O', else 0.
data['other'] = data['label'].str.contains('O', regex=False).astype('int64')
data['other'].value_counts()

0    4975
1    1106
Name: other, dtype: int64

In [18]:
# Pack the six binary intent flags of every row into one int32 vector,
# in the fixed order given by `label_columns`.
label_columns = ['price', 'where', 'time', 'availability', 'interest', 'other']
label_matrix = data[label_columns].values.astype(np.int32)
data['vector'] = pd.Series(list(label_matrix), index=data.index)

In [22]:
# Embed every text (with a progress bar) and drop the leading axis of each
# result before storing it.
embedding = [
    np.squeeze(embed_sentence(text), axis=0)
    for text in tqdm_notebook(data['text'])
]
data['embedding'] = embedding

HBox(children=(IntProgress(value=0, max=6081), HTML(value='')))




In [25]:
# Each sample is an (embedding, raw text) pair; the target is the label vector.
samples = list(zip(data['embedding'], data['text']))

# Stage 1: hold out 50% of the data as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    samples, data['vector'], test_size=0.5, random_state=20
)

# Stage 2: carve 20% of the remainder off as the validation set.
# The stage-1 output uses its own names (`X_trainval`, `y_trainval`) so that
# re-running this cell does not re-split an already reduced training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=42
)

In [27]:
import pickle

In [28]:
# Persist every split to its own pickle file, in the same order as before.
pickle_targets = [
    ('classify data/X_train.pkl', X_train),
    ('classify data/y_train.pkl', y_train),
    ('classify data/X_test.pkl', X_test),
    ('classify data/y_test.pkl', y_test),
    ('classify data/X_val.pkl', X_val),
    ('classify data/y_val.pkl', y_val),
]
for pickle_path, payload in pickle_targets:
    with open(pickle_path, 'wb') as out:
        pickle.dump(payload, out)