In [1]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')]


In [2]:
epochs = 20

In [3]:
import pandas as pd
import numpy as np
import json

In [4]:
data_path = '/home/Danny/Data-Mining/lab2/kaggle/data/'

In [5]:
json_list = list()
with open(data_path + 'tweets_DM.json' , 'r') as file:
    for line in file:
        json_list.append(json.loads(line))

In [6]:
tweet_list = list()
for json in json_list:
    tweet_id = json['_source']['tweet']['tweet_id']
    hashtags = json['_source']['tweet']['hashtags']
    hashtag = ' '.join(hashtags)
    text = json['_source']['tweet']['text']
    text = text + ' ' + hashtag
    tweet_list.append([tweet_id, text])

In [7]:
text_df = pd.DataFrame(tweet_list, columns=['tweet_id', 'text'])

In [8]:
emotion_df = pd.read_csv(data_path + 'emotion.csv')
# emotion_df

In [9]:
identification_df = pd.read_csv(data_path + 'data_identification.csv')
# identification_df

In [10]:
test_df = identification_df[identification_df['identification'] == 'test']
# test_df

In [11]:
train_df = text_df.merge(emotion_df, left_on='tweet_id', right_on='tweet_id')
# train_df

In [12]:
x_list = train_df['text'].to_list()
y_list = train_df['emotion'].to_list()

In [13]:
x_list_list = list()
for x in x_list:
    x_list_list.append(x.split())
len(x_list_list)

1455563

In [14]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(x_list_list, y_list, test_size=0.2, random_state=42)
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
print(len(train_x), len(train_y))
print(len(test_x), len(test_y))
print(len(valid_x), len(valid_y))

931560 931560
291113 291113
232890 232890


In [15]:
import kashgari
from kashgari.tasks.classification import BiGRU_Model
kashgari.config.use_cudnn_cell = True
import logging
logging.basicConfig(level='DEBUG')

model = BiGRU_Model()
model.fit(train_x, 
          train_y, 
          valid_x, 
          valid_y,
          epochs=epochs,
         )

Preparing text vocab dict: 100%|██████████| 931560/931560 [00:05<00:00, 166168.15it/s]
Preparing text vocab dict: 100%|██████████| 232890/232890 [00:01<00:00, 154068.22it/s]
2020-11-29 18:33:10,585 [DEBUG] kashgari - --- Build vocab dict finished, Total: 257059 ---
2020-11-29 18:33:10,586 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '<LH>', 'the', 'to', 'I', 'a', 'and']
Preparing classification label vocab dict: 100%|██████████| 931560/931560 [00:00<00:00, 1471960.05it/s]
Preparing classification label vocab dict: 100%|██████████| 232890/232890 [00:00<00:00, 1385436.15it/s]
Calculating sequence length: 100%|██████████| 931560/931560 [00:00<00:00, 1559388.46it/s]
Calculating sequence length: 100%|██████████| 232890/232890 [00:00<00:00, 1443168.46it/s]
2020-11-29 18:33:12,947 [DEBUG] kashgari - Calculated sequence length = 27
2020-11-29 18:33:14,389 [DEBUG] kashgari - Model: "model_1"
_________________________________________________________________
Layer (type)       

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


2020-11-30 08:27:59,098 [DEBUG] kashgari - predict input shape (291113, 49) x: 
[[   2  935    8 ...    0    0    0]
 [   2  788   61 ...    0    0    0]
 [   2 3465   80 ...    0    0    0]
 ...
 [   2    7 1600 ...    0    0    0]
 [   2    4    9 ...    0    0    0]
 [   2 2281 6163 ...    0    0    0]]
2020-11-30 08:28:32,940 [DEBUG] kashgari - predict output shape (291113, 8)
2020-11-30 08:28:33,109 [DEBUG] kashgari - predict output argmax: [3 1 0 ... 2 0 0]


              precision    recall  f1-score   support

       anger     0.3085    0.2508    0.2767      7946
anticipation     0.5868    0.5273    0.5554     49984
     disgust     0.3518    0.3871    0.3686     27669
        fear     0.4495    0.4007    0.4237     12846
         joy     0.6155    0.6261    0.6208    102943
     sadness     0.4380    0.4384    0.4382     38745
    surprise     0.2575    0.2760    0.2664      9816
       trust     0.3984    0.4241    0.4108     41164

    accuracy                         0.5009    291113
   macro avg     0.4258    0.4163    0.4201    291113
weighted avg     0.5034    0.5009    0.5016    291113



AttributeError: 'BiGRU_Model' object has no attribute 'name'

In [18]:
model.evaluate(test_x, test_y)
model_path = 'model/{}_epoch_{}'.format(str(model), epochs)
model.save(model_path)

2020-11-30 12:21:42,712 [DEBUG] kashgari - predict input shape (291113, 49) x: 
[[   2  935    8 ...    0    0    0]
 [   2  788   61 ...    0    0    0]
 [   2 3465   80 ...    0    0    0]
 ...
 [   2    7 1600 ...    0    0    0]
 [   2    4    9 ...    0    0    0]
 [   2 2281 6163 ...    0    0    0]]
2020-11-30 12:22:19,806 [DEBUG] kashgari - predict output shape (291113, 8)
2020-11-30 12:22:19,967 [DEBUG] kashgari - predict output argmax: [3 1 0 ... 2 0 0]


              precision    recall  f1-score   support

       anger     0.3085    0.2508    0.2767      7946
anticipation     0.5868    0.5273    0.5554     49984
     disgust     0.3518    0.3871    0.3686     27669
        fear     0.4495    0.4007    0.4237     12846
         joy     0.6155    0.6261    0.6208    102943
     sadness     0.4380    0.4384    0.4382     38745
    surprise     0.2575    0.2760    0.2664      9816
       trust     0.3984    0.4241    0.4108     41164

    accuracy                         0.5009    291113
   macro avg     0.4258    0.4163    0.4201    291113
weighted avg     0.5034    0.5009    0.5016    291113



2020-11-30 12:22:43,819 [INFO] kashgari - model saved to /home/Danny/Data-Mining/lab2/kaggle/kashgari/model/<kashgari.tasks.classification.bi_gru_model.BiGRU_Model object at 0x7fca3ca332e8>_epoch_20


'/home/Danny/Data-Mining/lab2/kaggle/kashgari/model/<kashgari.tasks.classification.bi_gru_model.BiGRU_Model object at 0x7fca3ca332e8>_epoch_20'

In [None]:
# model = kashgari.utils.load_model(model_path)
# model.evaluate(test_x, test_y)

In [None]:
test_df = test_df.merge(text_df, left_on='tweet_id', right_on='tweet_id')
# test_df

In [None]:
text_list = test_df['text'].tolist()
# text_list

In [None]:
predict_list = model.predict(text_list)
predict_list

In [None]:
test_df['predict'] = predict_list
# test_df

In [None]:
output_df = test_df[['tweet_id', 'predict']]
output_df = output_df.rename(columns={'tweet_id':'id', 'predict':'emotion'})
# output_df

In [None]:
output_path = 'output/{}_epoch_{}.csv'.format(model.name, epochs)
output_df.to_csv(output_path, index=False, header=True)