In [1]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')]


In [2]:
epochs = 1
model_name = 'BiLSTM'

In [3]:
import pandas as pd
import numpy as np
import json

In [4]:
data_path = '/home/Danny/Data-Mining/lab2/kaggle/data/'

In [5]:
json_list = list()
with open(data_path + 'tweets_DM.json' , 'r') as file:
    for line in file:
        json_list.append(json.loads(line))

In [6]:
tweet_list = list()
for json in json_list:
    tweet_id = json['_source']['tweet']['tweet_id']
    hashtags = json['_source']['tweet']['hashtags']
    hashtag = ' '.join(hashtags)
    text = json['_source']['tweet']['text']
    text = text + ' ' + hashtag
    tweet_list.append([tweet_id, text])

In [7]:
text_df = pd.DataFrame(tweet_list, columns=['tweet_id', 'text'])

In [8]:
emotion_df = pd.read_csv(data_path + 'emotion.csv')
# emotion_df

In [9]:
identification_df = pd.read_csv(data_path + 'data_identification.csv')
# identification_df

In [10]:
test_df = identification_df[identification_df['identification'] == 'test']
# test_df

In [11]:
train_df = text_df.merge(emotion_df, left_on='tweet_id', right_on='tweet_id')
# train_df

In [12]:
x_list = train_df['text'].to_list()
y_list = train_df['emotion'].to_list()

In [13]:
x_list_list = list()
for x in x_list:
    x_list_list.append(x.split())
len(x_list_list)

1455563

In [14]:
x_list_list

[['People',
  'who',
  'post',
  '"add',
  'me',
  'on',
  '#Snapchat"',
  'must',
  'be',
  'dehydrated.',
  'Cuz',
  'man....',
  "that's",
  '<LH>',
  'Snapchat'],
 ['@brianklaas',
  'As',
  'we',
  'see,',
  'Trump',
  'is',
  'dangerous',
  'to',
  '#freepress',
  'around',
  'the',
  'world.',
  'What',
  'a',
  '<LH>',
  '<LH>',
  '#TrumpLegacy.',
  '#CNN',
  'freepress',
  'TrumpLegacy',
  'CNN'],
 ['Now', 'ISSA', 'is', 'stalking', 'Tasha', '😂😂😂', '<LH>'],
 ['@RISKshow',
  '@TheKevinAllison',
  'Thx',
  'for',
  'the',
  'BEST',
  'TIME',
  'tonight.',
  'What',
  'stories!',
  'Heartbreakingly',
  '<LH>',
  '#authentic',
  '#LaughOutLoud',
  'good!!',
  'authentic',
  'LaughOutLoud'],
 ['Still', 'waiting', 'on', 'those', 'supplies', 'Liscus.', '<LH>'],
 ['Love', 'knows', 'no', 'gender.', '😢😭', '<LH>'],
 ['@DStvNgCare',
  '@DStvNg',
  'More',
  'highlights',
  'are',
  'being',
  'shown',
  'than',
  'actual',
  'sports!',
  'Who',
  'watches',
  'triathlon',
  'highlights',
  

In [15]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(x_list_list, y_list, test_size=0.2, random_state=42)
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
print(len(train_x), len(train_y))
print(len(test_x), len(test_y))
print(len(valid_x), len(valid_y))

931560 931560
291113 291113
232890 232890


In [None]:
import kashgari
from kashgari.tasks.classification import BiLSTM_Model
kashgari.config.use_cudnn_cell = True
import logging
logging.basicConfig(level='DEBUG')
# from kashgari.embeddings import BertEmbedding
# bert_embed = BertEmbedding('/home/Danny/pretrain_model/bert_tiny')
# model = BiLSTM_Model(bert_embed, sequence_length=128)
model = BiLSTM_Model()
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")
history = model.fit(train_x, 
                    train_y, 
                    valid_x, 
                    valid_y,
                    epochs=epochs,
         )

Preparing text vocab dict: 100%|██████████| 931560/931560 [00:04<00:00, 196163.63it/s]
Preparing text vocab dict: 100%|██████████| 232890/232890 [00:01<00:00, 183625.64it/s]
2020-11-30 22:19:25,446 [DEBUG] kashgari - --- Build vocab dict finished, Total: 257059 ---
2020-11-30 22:19:25,446 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '<LH>', 'the', 'to', 'I', 'a', 'and']
Preparing classification label vocab dict: 100%|██████████| 931560/931560 [00:00<00:00, 1836207.54it/s]
Preparing classification label vocab dict: 100%|██████████| 232890/232890 [00:00<00:00, 1812864.84it/s]
Calculating sequence length: 100%|██████████| 931560/931560 [00:00<00:00, 1999218.08it/s]
Calculating sequence length: 100%|██████████| 232890/232890 [00:00<00:00, 1961768.18it/s]
2020-11-30 22:19:27,419 [DEBUG] kashgari - Calculated sequence length = 27
2020-11-30 22:19:28,692 [DEBUG] kashgari - Model: "model_1"
_________________________________________________________________
Layer (type)       



In [None]:
model.evaluate(test_x, test_y)
model_path = 'model/{}_epoch_{}'.format(model_name, epochs)
model.save(model_path)

In [None]:
from matplotlib import pyplot as plt

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# model_path = 'model/{}_epoch_{}'.format(model_name, epochs)
# model = kashgari.utils.load_model(model_path)
# model.evaluate(test_x, test_y)

In [None]:
test_df = test_df.merge(text_df, left_on='tweet_id', right_on='tweet_id')
test_df

In [None]:
text_list = list()
for text in test_df['text'].tolist():
    text_list.append(text.split())
text_list

In [None]:
predict_list = model.predict(text_list)
predict_list

In [None]:
test_df['predict'] = predict_list
test_df

In [None]:
output_df = test_df[['tweet_id', 'predict']]
output_df = output_df.rename(columns={'tweet_id':'id', 'predict':'emotion'})
# output_df

In [None]:
output_path = 'output/{}_epoch_{}.csv'.format(model_name, epochs)
output_df.to_csv(output_path, index=False, header=True)