In [1]:
import tensorflow as tf
# List the GPUs TensorFlow can see, then switch on memory growth for each of them.
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')]


In [2]:
# Run-level settings.
# `model_name` selects the pretrained-model folder and is embedded in the saved-model
# path and the submission file name; `epochs` is forwarded to model.fit().
model_name = 'bert_base'
epochs = 1

In [3]:
import pandas as pd
import numpy as np
import json

In [4]:
import os

# Folder that holds the input files (tweets_DM.json, emotion.csv, data_identification.csv).
# The original location stays the default; set the DM_DATA_PATH environment variable to
# point the notebook at another folder without editing the code.
# os.path.join(..., '') guarantees the trailing separator that the
# `data_path + 'file name'` concatenations in later cells rely on.
data_path = os.path.join(
    os.environ.get('DM_DATA_PATH') or '/home/Danny/Data-Mining/lab2/kaggle/data/',
    '',
)

In [5]:
# Read the raw tweet dump, which stores one JSON document per line.
# The encoding is stated explicitly so decoding does not depend on the machine's locale
# (assumes the dump is UTF-8 — TODO confirm); blank lines are skipped because
# json.loads() would raise on them.
with open(data_path + 'tweets_DM.json', 'r', encoding='utf-8') as tweet_file:
    json_list = [json.loads(line) for line in tweet_file if line.strip()]

In [6]:
# Flatten every raw record into a [tweet_id, text] pair.
# The loop variable is deliberately NOT called `json`: that name would rebind the
# imported `json` module and break any later call to json.loads().
tweet_list = list()
for record in json_list:
    tweet = record['_source']['tweet']
    # The hashtag list is appended to the tweet body as space-separated words.
    text = tweet['text'] + ' ' + ' '.join(tweet['hashtags'])
    tweet_list.append([tweet['tweet_id'], text])

In [7]:
# Load the label table; later cells join it to the tweets on `tweet_id`
# and read its `emotion` column as the training target.
emotion_path = '{}emotion.csv'.format(data_path)
emotion_df = pd.read_csv(emotion_path)
emotion_df

Unnamed: 0,tweet_id,emotion
0,0x3140b1,sadness
1,0x368b73,disgust
2,0x296183,anticipation
3,0x2bd6e1,joy
4,0x2ee1dd,anticipation
...,...,...
1455558,0x38dba0,joy
1455559,0x300ea2,joy
1455560,0x360b99,fear
1455561,0x22eecf,joy


In [8]:
# Load the table whose `identification` column is later filtered on the value 'test'
# to pick out the tweets that need a prediction.
identification_path = '{}data_identification.csv'.format(data_path)
identification_df = pd.read_csv(identification_path)
identification_df

Unnamed: 0,tweet_id,identification
0,0x28cc61,test
1,0x29e452,train
2,0x2b3819,train
3,0x2db41f,test
4,0x2a2acc,train
...,...,...
1867530,0x227e25,train
1867531,0x293813,train
1867532,0x1e1a7e,train
1867533,0x2156a5,train


In [9]:
# Emoji -> replacement word, applied by replace_word() to whitespace-separated tokens.
# A key only matches when the whole token equals it, which is why a few repeated-emoji
# runs are listed explicitly.
# Every key appears exactly once: a repeated key in a dict literal silently keeps only
# the last value, so the shadowed duplicates of "😒" and "😡" were dropped and the
# values that were actually in effect ("dissatisfied", "angry") are kept.
emoji_dict = {
    "😂": "lolface",
    "😂😂": "lolface",
    "😂😂😂": "lolface",
    "😇": "smile",
    "😀": "smile",
    "🎉": "party",
    "😳": "embarrassed",
    "😔": "sadface",
    "👀": "shifty",
    "🤷": "shrugging",
    "💔": "brokenhearted",
    "👻": "ghost",
    "😍": "heart",
    "🙄": "disdain",
    "💖": "heart",
    "✌": "victory",
    "🎶": "music",
    "😱": "shock",
    "😃": "smile",
    "😒": "dissatisfied",
    "👊": "brofist",
    "😄": "smile",
    "🌞": "smile",
    "🙌": "celebration",
    "😁": "smile",
    "🤗": "hugging",
    "🤣": "rofl",
    "🌈": "gaypride",
    "😉": "winking",
    "💞": "heart",
    "🙃": "irony",
    "😜": "winking",
    "😭": "bawling",
    "🤔": "thinker",
    "😎": "cool",
    "💛": "heart",
    "💚": "heart",
    "💃": "fun",
    "💗": "heart",
    "😬": "awkward",
    "😌": "relieved",
    "😅": "whew",
    "💋": "kiss",
    "🙈": "laugh",
    "😊": "^^",
    "👌": "okay",
    "😡": "angry",
    "😘": "kiss",
    "😩": "weary",
    "🔥": "excellent",
    "💙": "heart",
    "💕": "heart",
    "👏": "clapping",
    "👍": "thumbsup",
    "💯": "perfect",
    "💜": "heart",
    "🕘": "late",
    "😤": "angry",
    "😠": "angry",
    "😑": "annoy",
    "😡😡😡": "angry",
    "😡😡": "angry",
    "😰": "anxious",
    "😯": "surprise",
    "😨": "scared",
    "😲": "astonished",
    "💪": "strong",
    "🤦": "facepalm",
    "✨": "sparkle",
    "😢": "crying",
    "💓": "heart",
    "👑": "crown",
    "🤘": "rockon",
    "🌹": "rose",
    "😋": "delicious",
    "😏": "flirting",
    "😆": "XD",
    "😫": "exhausted",
    "😦": "frowning",
    "🙏": "please",
}

In [10]:
# Hashtag tokens that replace_word() rewrites; every one of them maps to the same
# replacement word, so the mapping is built from the list of keys.
frequent_name_dict = dict.fromkeys(
    [
        "#realdonaldtrump",
        "#fifthharmony",
        "#mostrequestlive",
        "#onairromeo",
        "#matthardybrand",
    ],
    "sadness",
)

In [11]:
def replace_word(text):
    """Swap known tokens of ``text`` for their dictionary replacements.

    ``text`` is split on whitespace. A token that is a key of ``frequent_name_dict``
    becomes that dictionary's value; otherwise a token that is a key of ``emoji_dict``
    becomes that value; any other token is kept as is. The tokens are joined back
    with single spaces, so runs of whitespace in the input are collapsed.

    Returns the rewritten string.
    """
    replaced = [
        frequent_name_dict.get(token, emoji_dict.get(token, token))
        for token in text.split()
    ]
    return ' '.join(replaced)

In [12]:
# Turn the [tweet_id, text] pairs into a two-column DataFrame.
text_columns = ['tweet_id', 'text']
text_df = pd.DataFrame(data=tweet_list, columns=text_columns)
text_df

Unnamed: 0,tweet_id,text
0,0x376b20,"People who post ""add me on #Snapchat"" must be ..."
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #..."
2,0x28b412,"Confident of your obedience, I write to you, k..."
3,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>
4,0x2de201,"""Trust is not the same as faith. A friend is s..."
...,...,...
1867530,0x316b80,When you buy the last 2 tickets remaining for ...
1867531,0x29d0cb,I swear all this hard work gone pay off one da...
1867532,0x2a6a4f,@Parcel2Go no card left when I wasn't in so I ...
1867533,0x24faed,"Ah, corporate life, where you can date <LH> us..."


In [13]:
%%time
import re
from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
text_df['text'] = text_df['text'].apply(lambda s : s.lower())
text_df['text'] = text_df['text'].apply(lambda s : replace_word(s))    
text_df['text'] = text_df['text'].apply(lambda s : re.sub('<lh>|<|>|@|#', '', s))
# text_df['text'] = text_df['text'].apply(lambda s : s.replace('<lh>',''))
# text_df['text'] = text_df['text'].apply(lambda s : s.replace('#',''))
# text_df['text'] = text_df['text'].apply(lambda s : s.replace('@',''))
text_df['token'] = text_df['text'].apply(lambda s : tweet_tokenizer.tokenize(s)) 
text_df

CPU times: user 3min 1s, sys: 10.8 s, total: 3min 12s
Wall time: 3min 13s


Unnamed: 0,tweet_id,text,token
0,0x376b20,"people who post ""add me on snapchat"" must be d...","[people, who, post, "", add, me, on, snapchat, ..."
1,0x2d5350,"brianklaas as we see, trump is dangerous to fr...","[brianklaas, as, we, see, ,, trump, is, danger..."
2,0x28b412,"confident of your obedience, i write to you, k...","[confident, of, your, obedience, ,, i, write, ..."
3,0x1cd5b0,now issa is stalking tasha lolface,"[now, issa, is, stalking, tasha, lolface]"
4,0x2de201,"""trust is not the same as faith. a friend is s...","["", trust, is, not, the, same, as, faith, ., a..."
...,...,...,...
1867530,0x316b80,when you buy the last 2 tickets remaining for ...,"[when, you, buy, the, last, 2, tickets, remain..."
1867531,0x29d0cb,i swear all this hard work gone pay off one da...,"[i, swear, all, this, hard, work, gone, pay, o..."
1867532,0x2a6a4f,parcel2go no card left when i wasn't in so i h...,"[parcel, 2go, no, card, left, when, i, wasn't,..."
1867533,0x24faed,"ah, corporate life, where you can date using ...","[ah, ,, corporate, life, ,, where, you, can, d..."


In [14]:
# Keep only the rows of the identification table that are flagged as 'test'.
is_test_row = identification_df['identification'].eq('test')
test_df = identification_df.loc[is_test_row]

In [15]:
# Inner-join the cleaned tweets with the label table on tweet_id, so only tweets
# that have an emotion label remain.
train_df = pd.merge(text_df, emotion_df, on='tweet_id', how='inner')

In [16]:
# Model inputs are the token lists; targets are the emotion labels.
x_list = train_df['token'].tolist()
y_list = train_df['emotion'].tolist()
# Print both lengths so a mismatch between inputs and labels is visible at a glance.
for values in (x_list, y_list):
    print(len(values))

1455563
1455563


In [17]:
# x_list_list = list()
# for x in x_list:
#     x_list_list.append(x.split())
# len(x_list_list)

In [18]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed: first hold out 20% of the labelled data as the
# test split, then hold out 20% of what is left as the validation split.
split_kwargs = dict(test_size=0.2, random_state=42)
train_x, test_x, train_y, test_y = train_test_split(x_list, y_list, **split_kwargs)
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, **split_kwargs)

# One line per split, in the order train / test / valid: "<n inputs> <n labels>".
for inputs, labels in ((train_x, train_y), (test_x, test_y), (valid_x, valid_y)):
    print(len(inputs), len(labels))

931560 931560
291113 291113
232890 232890


In [19]:
%%time
import kashgari
from kashgari.tasks.classification import BiLSTM_Model
kashgari.config.use_cudnn_cell = True
import logging
logging.basicConfig(level='DEBUG')
from kashgari.embeddings import BertEmbedding
bert_embed = BertEmbedding('/home/Danny/pretrain_model/{}'.format(model_name))
model = BiLSTM_Model(bert_embed, sequence_length='auto')
model = BiLSTM_Model()
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")
history = model.fit(train_x, 
                    train_y, 
                    valid_x, 
                    valid_y,
                    epochs=epochs,
         )

2020-12-01 17:08:21,215 [DEBUG] kashgari - ------------------------------------------------
2020-12-01 17:08:21,216 [DEBUG] kashgari - Loaded transformer model's vocab
2020-12-01 17:08:21,216 [DEBUG] kashgari - config_path       : /home/Danny/pretrain_model/bert_base/bert_config.json
2020-12-01 17:08:21,217 [DEBUG] kashgari - vocab_path      : /home/Danny/pretrain_model/bert_base/vocab.txt
2020-12-01 17:08:21,217 [DEBUG] kashgari - checkpoint_path : /home/Danny/pretrain_model/bert_base/bert_model.ckpt
2020-12-01 17:08:21,218 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused0]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '[unused27]', '[unused28]', '[unused29]', '[unused30]'

ResourceExhaustedError: in user code:

    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:541 train_step  **
        self.trainable_variables)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:1812 _minimize
        experimental_aggregate_gradients=False)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:478 apply_gradients
        self._create_all_weights(var_list)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:663 _create_all_weights
        self._create_slots(var_list)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/optimizer_v2/adam.py:156 _create_slots
        self.add_slot(var, 'm')
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:723 add_slot
        initial_value=initial_value)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py:261 __call__
        return cls._variable_v2_call(*args, **kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py:255 _variable_v2_call
        shape=shape)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py:66 getter
        return captured_getter(captured_previous, **kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2562 creator
        return next_creator(**kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py:66 getter
        return captured_getter(captured_previous, **kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2562 creator
        return next_creator(**kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py:66 getter
        return captured_getter(captured_previous, **kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2562 creator
        return next_creator(**kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py:66 getter
        return captured_getter(captured_previous, **kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py:494 variable_capturing_scope
        lifted_initializer_graph=lifted_initializer_graph, **kwds)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py:263 __call__
        return super(VariableMetaclass, cls).__call__(*args, **kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py:180 __init__
        initial_value() if init_from_fn else initial_value,
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops_v2.py:129 __call__
        return array_ops.zeros(shape, dtype)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py:2677 wrapped
        tensor = fun(*args, **kwargs)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py:2733 zeros
        output = fill(shape, constant(zero, dtype=dtype), name=name)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py:234 fill
        result = gen_array_ops.fill(dims, value, name=name)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py:3316 fill
        _ops.raise_from_not_ok_status(e, name)
    /home/Danny/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py:6653 raise_from_not_ok_status
        six.raise_from(core._status_to_exception(e.code, message), None)
    <string>:3 raise_from
        

    ResourceExhaustedError: OOM when allocating tensor with shape[155333,100] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Fill]


In [20]:
# Folder name for the saved model encodes the model name and the number of epochs.
model_path = 'model/{}_epoch_{}'.format(model_name, epochs)

# Score the model on the held-out test split, then persist it.
model.evaluate(test_x, test_y)
model.save(model_path)

2020-12-01 17:08:56,694 [DEBUG] kashgari - predict input shape (291113, 102) x: 
[[   2  109   11 ...    0    0    0]
 [   2  172   80 ...    0    0    0]
 [   2  806   56 ...    0    0    0]
 ...
 [   2    9 1616 ...    0    0    0]
 [   2   12  344 ...    0    0    0]
 [   2  976 5113 ...    0    0    0]]


InternalError:  Dst tensor is not initialized.
	 [[{{node model_1/layer_embedding/embedding_lookup/_6}}]] [Op:__inference_predict_function_12027]

Function call stack:
predict_function


In [None]:
from matplotlib import pyplot as plt

In [None]:
# Accuracy and loss are on different scales, so each metric gets its own panel with
# its own title and y-label instead of sharing one axis labelled "accuracy".
# Markers are drawn so that a single-epoch run (one point per curve) is still visible.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

acc_ax.plot(history.history['accuracy'], marker='o')
acc_ax.plot(history.history['val_accuracy'], marker='o')
acc_ax.set(title='model accuracy', xlabel='epoch', ylabel='accuracy')
acc_ax.legend(['accuracy', 'val_accuracy'])

loss_ax.plot(history.history['loss'], marker='o')
loss_ax.plot(history.history['val_loss'], marker='o')
loss_ax.set(title='model loss', xlabel='epoch', ylabel='loss')
loss_ax.legend(['loss', 'val_loss'])

fig.tight_layout()
plt.show()

In [None]:
# model_path = 'model/{}_epoch_{}'.format(model_name, epochs)
# model = kashgari.utils.load_model(model_path)
# model.evaluate(test_x, test_y)

In [None]:
# Attach the cleaned text and tokens to the test tweet ids.
# The frame is rebuilt from identification_df each time, so re-running this cell gives
# the same result instead of merging an already merged test_df a second time.
test_df = (
    identification_df[identification_df['identification'] == 'test']
    .merge(text_df, on='tweet_id')
)
test_df

In [None]:
# Feed the model the same kind of tokens it was trained on: the training inputs come
# from the `token` column produced by TweetTokenizer, whereas a plain str.split()
# would leave punctuation glued to the words.
text_list = test_df['token'].tolist()
# Show a small sample only; the full list is far too long to display.
text_list[:5]

In [None]:
# Predict a label for every test tweet.
predict_list = model.predict(text_list)
# Show the first few predictions only, rather than dumping one entry per test tweet.
predict_list[:10]

In [None]:
# Store the predictions next to their tweets in a `predict` column.
test_df = test_df.assign(predict=predict_list)
test_df

In [None]:
# Submission frame: keep the tweet id and the predicted label, renamed to the
# `id` / `emotion` headers written to the output CSV.
submission_columns = {'tweet_id': 'id', 'predict': 'emotion'}
output_df = test_df.loc[:, ['tweet_id', 'predict']].rename(columns=submission_columns)

In [None]:
import os

# The submission file name encodes the model name and the number of epochs.
output_path = 'output/{}_epoch_{}.csv'.format(model_name, epochs)
# to_csv() does not create missing folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
output_df.to_csv(output_path, index=False, header=True)