In [1]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')]


In [2]:
epochs = 5
model_name = 'BiGRU'

In [3]:
import pandas as pd
import numpy as np
import json

In [4]:
data_path = '/home/Danny/Data-Mining/lab2/kaggle/data/'

In [5]:
json_list = list()
with open(data_path + 'tweets_DM.json' , 'r') as file:
    for line in file:
        json_list.append(json.loads(line))

In [6]:
tweet_list = list()
for json in json_list:
    tweet_id = json['_source']['tweet']['tweet_id']
    hashtags = json['_source']['tweet']['hashtags']
    hashtag = ' '.join(hashtags)
    text = json['_source']['tweet']['text']
    text = text + ' ' + hashtag
    tweet_list.append([tweet_id, text])

In [7]:
text_df = pd.DataFrame(tweet_list, columns=['tweet_id', 'text'])

In [8]:
emotion_df = pd.read_csv(data_path + 'emotion.csv')
# emotion_df

In [9]:
identification_df = pd.read_csv(data_path + 'data_identification.csv')
# identification_df

In [10]:
test_df = identification_df[identification_df['identification'] == 'test']
# test_df

In [11]:
train_df = text_df.merge(emotion_df, left_on='tweet_id', right_on='tweet_id')
# train_df

In [12]:
x_list = train_df['text'].to_list()
y_list = train_df['emotion'].to_list()

In [13]:
x_list_list = list()
for x in x_list:
    x_list_list.append(x.split())
len(x_list_list)

1455563

In [14]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(x_list_list, y_list, test_size=0.2, random_state=42)
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
print(len(train_x), len(train_y))
print(len(test_x), len(test_y))
print(len(valid_x), len(valid_y))

931560 931560
291113 291113
232890 232890


In [15]:
import kashgari
from kashgari.tasks.classification import BiGRU_Model
kashgari.config.use_cudnn_cell = True
import logging
logging.basicConfig(level='DEBUG')

model = BiGRU_Model()
model.fit(train_x, 
          train_y, 
          valid_x, 
          valid_y,
          epochs=epochs,
         )

Preparing text vocab dict: 100%|██████████| 931560/931560 [00:05<00:00, 170267.18it/s]
Preparing text vocab dict: 100%|██████████| 232890/232890 [00:01<00:00, 157748.67it/s]
2020-11-30 13:09:37,083 [DEBUG] kashgari - --- Build vocab dict finished, Total: 257059 ---
2020-11-30 13:09:37,084 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '<LH>', 'the', 'to', 'I', 'a', 'and']
Preparing classification label vocab dict: 100%|██████████| 931560/931560 [00:00<00:00, 1537063.37it/s]
Preparing classification label vocab dict: 100%|██████████| 232890/232890 [00:00<00:00, 1448351.14it/s]
Calculating sequence length: 100%|██████████| 931560/931560 [00:00<00:00, 1581542.19it/s]
Calculating sequence length: 100%|██████████| 232890/232890 [00:00<00:00, 1505278.67it/s]
2020-11-30 13:09:39,396 [DEBUG] kashgari - Calculated sequence length = 27
2020-11-30 13:09:40,691 [DEBUG] kashgari - Model: "model_1"
_________________________________________________________________
Layer (type)       

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff17abae978>

In [16]:
model.evaluate(test_x, test_y)
model_path = 'model/{}_epoch_{}'.format(model_name, epochs)
model.save(model_path)

2020-11-30 16:40:47,386 [DEBUG] kashgari - predict input shape (291113, 49) x: 
[[   2  935    8 ...    0    0    0]
 [   2  788   61 ...    0    0    0]
 [   2 3465   80 ...    0    0    0]
 ...
 [   2    7 1600 ...    0    0    0]
 [   2    4    9 ...    0    0    0]
 [   2 2281 6163 ...    0    0    0]]
2020-11-30 16:41:23,641 [DEBUG] kashgari - predict output shape (291113, 8)
2020-11-30 16:41:23,834 [DEBUG] kashgari - predict output argmax: [0 1 1 ... 2 3 0]


              precision    recall  f1-score   support

       anger     0.3353    0.2557    0.2902      7946
anticipation     0.6033    0.5496    0.5752     49984
     disgust     0.4103    0.3514    0.3786     27669
        fear     0.4371    0.4450    0.4410     12846
         joy     0.6287    0.6543    0.6413    102943
     sadness     0.4191    0.5102    0.4602     38745
    surprise     0.2398    0.3020    0.2673      9816
       trust     0.4602    0.4035    0.4300     41164

    accuracy                         0.5209    291113
   macro avg     0.4417    0.4340    0.4355    291113
weighted avg     0.5223    0.5209    0.5200    291113



2020-11-30 16:41:48,052 [INFO] kashgari - model saved to /home/Danny/Data-Mining/lab2/kaggle/kashgari/model/BiGRU_epoch_5


'/home/Danny/Data-Mining/lab2/kaggle/kashgari/model/BiGRU_epoch_5'

In [17]:
model_path = 'model/{}_epoch_{}'.format(model_name, epochs)
model = kashgari.utils.load_model(model_path)
model.evaluate(test_x, test_y)

  
2020-11-30 16:41:57,909 [DEBUG] kashgari - predict input shape (291113, 49) x: 
[[   2  935    8 ...    0    0    0]
 [   2  788   61 ...    0    0    0]
 [   2 3465   80 ...    0    0    0]
 ...
 [   2    7 1600 ...    0    0    0]
 [   2    4    9 ...    0    0    0]
 [   2 2281 6163 ...    0    0    0]]
2020-11-30 16:42:34,437 [DEBUG] kashgari - predict output shape (291113, 8)
2020-11-30 16:42:34,635 [DEBUG] kashgari - predict output argmax: [0 1 1 ... 2 3 0]


              precision    recall  f1-score   support

       anger     0.3353    0.2557    0.2902      7946
anticipation     0.6033    0.5496    0.5752     49984
     disgust     0.4103    0.3514    0.3786     27669
        fear     0.4371    0.4450    0.4410     12846
         joy     0.6287    0.6543    0.6413    102943
     sadness     0.4191    0.5102    0.4602     38745
    surprise     0.2398    0.3020    0.2673      9816
       trust     0.4602    0.4035    0.4300     41164

    accuracy                         0.5209    291113
   macro avg     0.4417    0.4340    0.4355    291113
weighted avg     0.5223    0.5209    0.5200    291113



{'detail': {'anger': {'precision': 0.3353135313531353,
   'recall': 0.2557261515227788,
   'f1-score': 0.2901613594173925,
   'support': 7946},
  'anticipation': {'precision': 0.6033297458762547,
   'recall': 0.5495558578745199,
   'f1-score': 0.5751887177661679,
   'support': 49984},
  'disgust': {'precision': 0.4103051019116344,
   'recall': 0.35140409844952836,
   'f1-score': 0.3785772690106296,
   'support': 27669},
  'fear': {'precision': 0.4371367390639339,
   'recall': 0.44496341273548184,
   'f1-score': 0.4410153537535684,
   'support': 12846},
  'joy': {'precision': 0.6287477131015943,
   'recall': 0.6543329803872046,
   'f1-score': 0.6412852552659764,
   'support': 102943},
  'sadness': {'precision': 0.41912435068376974,
   'recall': 0.5102077687443541,
   'f1-score': 0.4602025375392853,
   'support': 38745},
  'surprise': {'precision': 0.23976702798899854,
   'recall': 0.3019559902200489,
   'f1-score': 0.26729191090269633,
   'support': 9816},
  'trust': {'precision': 0.460

In [18]:
test_df = test_df.merge(text_df, left_on='tweet_id', right_on='tweet_id')
# test_df

In [19]:
text_list = test_df['text'].tolist()
# text_list

In [20]:
predict_list = model.predict(text_list)
predict_list

TypeError: can only concatenate list (not "str") to list

In [None]:
test_df['predict'] = predict_list
# test_df

In [None]:
output_df = test_df[['tweet_id', 'predict']]
output_df = output_df.rename(columns={'tweet_id':'id', 'predict':'emotion'})
# output_df

In [None]:
output_path = 'output/{}_epoch_{}.csv'.format(model.name, epochs)
output_df.to_csv(output_path, index=False, header=True)