In [1]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')]


In [2]:
epochs = 5
model_name = 'BiLSTM_BERT_tiny'

In [3]:
import pandas as pd
import numpy as np
import json

In [4]:
data_path = '/home/Danny/Data-Mining/lab2/kaggle/data/'

In [5]:
json_list = list()
with open(data_path + 'tweets_DM.json' , 'r') as file:
    for line in file:
        json_list.append(json.loads(line))

In [6]:
tweet_list = list()
for json in json_list:
    tweet_id = json['_source']['tweet']['tweet_id']
    hashtags = json['_source']['tweet']['hashtags']
    hashtag = ' '.join(hashtags)
    text = json['_source']['tweet']['text']
    text = text + ' ' + hashtag
    tweet_list.append([tweet_id, text])

In [7]:
text_df = pd.DataFrame(tweet_list, columns=['tweet_id', 'text'])

In [8]:
emotion_df = pd.read_csv(data_path + 'emotion.csv')
# emotion_df

In [9]:
identification_df = pd.read_csv(data_path + 'data_identification.csv')
# identification_df

In [10]:
test_df = identification_df[identification_df['identification'] == 'test']
# test_df

In [11]:
train_df = text_df.merge(emotion_df, left_on='tweet_id', right_on='tweet_id')
# train_df

In [12]:
x_list = train_df['text'].to_list()
y_list = train_df['emotion'].to_list()

In [13]:
x_list_list = list()
for x in x_list:
    x_list_list.append(x.split())
len(x_list_list)

1455563

In [14]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(x_list_list, y_list, test_size=0.2, random_state=42)
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
print(len(train_x), len(train_y))
print(len(test_x), len(test_y))
print(len(valid_x), len(valid_y))

931560 931560
291113 291113
232890 232890


In [15]:
import kashgari
from kashgari.tasks.classification import BiGRU_Model
kashgari.config.use_cudnn_cell = True
import logging
logging.basicConfig(level='DEBUG')
from kashgari.embeddings import BertEmbedding
bert_embed = BertEmbedding('/home/Danny/pretrain_model/bert_tiny')

model = BiGRU_Model(bert_embed, sequence_length=128)
model.fit(train_x, 
          train_y, 
          valid_x, 
          valid_y,
          epochs=epochs,
         )

2020-11-30 17:09:04,989 [DEBUG] kashgari - ------------------------------------------------
2020-11-30 17:09:04,990 [DEBUG] kashgari - Loaded transformer model's vocab
2020-11-30 17:09:04,991 [DEBUG] kashgari - config_path       : /home/Danny/pretrain_model/bert_tiny/bert_config.json
2020-11-30 17:09:04,991 [DEBUG] kashgari - vocab_path      : /home/Danny/pretrain_model/bert_tiny/vocab.txt
2020-11-30 17:09:04,992 [DEBUG] kashgari - checkpoint_path : /home/Danny/pretrain_model/bert_tiny/bert_model.ckpt
2020-11-30 17:09:04,992 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused0]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '[unused27]', '[unused28]', '[unused29]', '[unused30]'

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f18cdab1470>

In [16]:
model.evaluate(test_x, test_y)
model_path = 'model/{}_epoch_{}'.format(model_name, epochs)
model.save(model_path)

2020-11-30 17:35:11,746 [DEBUG] kashgari - predict input shape (2, 291113, 49) x: 
(array([[ 101,  100, 1037, ...,    0,    0,    0],
       [ 101,  100, 2256, ...,    0,    0,    0],
       [ 101,  100, 2065, ...,    0,    0,    0],
       ...,
       [ 101,  100, 2499, ...,    0,    0,    0],
       [ 101,  100, 1998, ...,    0,    0,    0],
       [ 101,  100,  100, ...,    0,    0,    0]], dtype=int32), array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32))
2020-11-30 17:36:23,654 [DEBUG] kashgari - predict output shape (291113, 8)
2020-11-30 17:36:23,814 [DEBUG] kashgari - predict output argmax: [0 2 0 ... 0 0 0]


              precision    recall  f1-score   support

       anger     0.8379    0.1067    0.1893      7946
anticipation     0.5791    0.3603    0.4442     49984
     disgust     0.4318    0.1287    0.1983     27669
        fear     0.4027    0.1235    0.1891     12846
         joy     0.4482    0.8647    0.5904    102943
     sadness     0.3625    0.3436    0.3528     38745
    surprise     0.8611    0.1137    0.2009      9816
       trust     0.6290    0.1559    0.2499     41164

    accuracy                         0.4598    291113
   macro avg     0.5691    0.2746    0.3018    291113
weighted avg     0.5058    0.4598    0.4065    291113



2020-11-30 17:36:32,014 [INFO] kashgari - model saved to /home/Danny/Data-Mining/lab2/kaggle/kashgari/model/BiLSTM_BERT_epoch_5


'/home/Danny/Data-Mining/lab2/kaggle/kashgari/model/BiLSTM_BERT_epoch_5'

In [17]:
model_path = 'model/{}_epoch_{}'.format(model_name, epochs)
model = kashgari.utils.load_model(model_path)
# model.evaluate(test_x, test_y)

  
2020-11-30 17:36:32,160 [DEBUG] kashgari - ------------------------------------------------
2020-11-30 17:36:32,162 [DEBUG] kashgari - Loaded transformer model's vocab
2020-11-30 17:36:32,162 [DEBUG] kashgari - config_path       : /home/Danny/pretrain_model/bert_tiny/bert_config.json
2020-11-30 17:36:32,163 [DEBUG] kashgari - vocab_path      : /home/Danny/pretrain_model/bert_tiny/vocab.txt
2020-11-30 17:36:32,163 [DEBUG] kashgari - checkpoint_path : /home/Danny/pretrain_model/bert_tiny/bert_model.ckpt
2020-11-30 17:36:32,164 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused0]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '[unused27]', '[unused28]', '[unused29]', '[unused3

              precision    recall  f1-score   support

       anger     0.8379    0.1067    0.1893      7946
anticipation     0.5791    0.3603    0.4442     49984
     disgust     0.4318    0.1287    0.1983     27669
        fear     0.4027    0.1235    0.1891     12846
         joy     0.4482    0.8647    0.5904    102943
     sadness     0.3625    0.3436    0.3528     38745
    surprise     0.8611    0.1137    0.2009      9816
       trust     0.6290    0.1559    0.2499     41164

    accuracy                         0.4598    291113
   macro avg     0.5691    0.2746    0.3018    291113
weighted avg     0.5058    0.4598    0.4065    291113



{'detail': {'anger': {'precision': 0.8379446640316206,
   'recall': 0.10672036244651396,
   'f1-score': 0.1893279749944184,
   'support': 7946},
  'anticipation': {'precision': 0.5791149417894127,
   'recall': 0.36025528169014087,
   'f1-score': 0.44418954586940973,
   'support': 49984},
  'disgust': {'precision': 0.4318292091217855,
   'recall': 0.12866384762730854,
   'f1-score': 0.19825689861610005,
   'support': 27669},
  'fear': {'precision': 0.4026896726719107,
   'recall': 0.12354040168145726,
   'f1-score': 0.18907487937094178,
   'support': 12846},
  'joy': {'precision': 0.44819564659144123,
   'recall': 0.8646823970546808,
   'f1-score': 0.590377587499088,
   'support': 102943},
  'sadness': {'precision': 0.3625197450841549,
   'recall': 0.34355400696864113,
   'f1-score': 0.3527821581437261,
   'support': 38745},
  'surprise': {'precision': 0.8611111111111112,
   'recall': 0.1136919315403423,
   'f1-score': 0.20086393088552915,
   'support': 9816},
  'trust': {'precision': 0

In [18]:
test_df = test_df.merge(text_df, left_on='tweet_id', right_on='tweet_id')
# test_df

In [19]:
text_list = test_df['text'].tolist()
# text_list

In [20]:
predict_list = model.predict(text_list)
predict_list

TypeError: can only concatenate list (not "str") to list

In [None]:
test_df['predict'] = predict_list
# test_df

In [None]:
output_df = test_df[['tweet_id', 'predict']]
output_df = output_df.rename(columns={'tweet_id':'id', 'predict':'emotion'})
# output_df

In [None]:
output_path = 'output/{}_epoch_{}.csv'.format(model.name, epochs)
output_df.to_csv(output_path, index=False, header=True)