In [1]:
import tensorflow as tf
# List the GPUs TensorFlow can see, echo them, and turn on memory growth for
# each one so this has happened before any model is built.
tf_experimental = tf.config.experimental
gpus = tf_experimental.list_physical_devices('GPU')
print(gpus)
for device in gpus:
    tf_experimental.set_memory_growth(device, enable=True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')]


In [2]:
# Run configuration shared by the training, saving and submission cells.
# model_name is the sub-folder of the pretrained-model directory to load and
# is also embedded in the saved-model path.
model_name = 'roberta-base'
# Number of training epochs passed to model.fit.
epochs = 1

In [3]:
import json
import os

import numpy as np
import pandas as pd

In [4]:
# Directory holding tweets_DM.json, emotion.csv and data_identification.csv.
# Set the KAGGLE_DATA_PATH environment variable to run on another machine; the
# default is the original location. os.path.join(..., '') guarantees a trailing
# separator because later cells build file paths with plain `+` concatenation.
data_path = os.path.join(
    os.environ.get('KAGGLE_DATA_PATH', '/home/Danny/Data-Mining/lab2/kaggle/data/'),
    '',
)

In [5]:
# Load the raw tweet dump, which stores one JSON document per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the machine's
# locale, and blank lines are skipped so a stray empty line (e.g. at the end
# of the file) does not make json.loads raise.
with open(data_path + 'tweets_DM.json', 'r', encoding='utf-8') as file:
    json_list = [json.loads(line) for line in file if line.strip()]

In [6]:
# Flatten every raw record into a [tweet_id, text] pair, where text is the
# tweet body followed by a space and its hashtags joined by spaces.
# The loop variable is deliberately NOT called `json`: that name would rebind
# the imported json module and break the loader cell on any later re-run.
tweet_list = list()
for record in json_list:
    tweet = record['_source']['tweet']
    hashtag = ' '.join(tweet['hashtags'])
    tweet_list.append([tweet['tweet_id'], tweet['text'] + ' ' + hashtag])

In [7]:
# Two-column frame (tweet_id, text) covering every tweet in the dump; it is
# joined against the label and identification tables in later cells.
text_df = pd.DataFrame(
    data=tweet_list,
    columns=['tweet_id', 'text'],
)

In [8]:
# Label table: merged with the tweets on tweet_id below, where its `emotion`
# column becomes the training target.
emotion_df = pd.read_csv(filepath_or_buffer=data_path + 'emotion.csv')

In [9]:
# Split-assignment table: its `identification` column is used below to pick
# out the rows marked 'test'.
identification_df = pd.read_csv(filepath_or_buffer=data_path + 'data_identification.csv')

In [10]:
# Keep only the rows whose `identification` value is 'test'; these are the
# tweets that predictions are produced for at the end of the notebook.
test_df = identification_df.loc[
    identification_df['identification'].eq('test')
]

In [11]:
# Inner-join tweets with their labels on tweet_id; tweets without a label
# row drop out, leaving the labelled training set.
train_df = pd.merge(text_df, emotion_df, on='tweet_id')

In [12]:
# Pull the model inputs (text) and targets (emotion) out as plain Python lists.
x_list, y_list = (
    train_df[column].tolist() for column in ('text', 'emotion')
)

In [13]:
# Whitespace-tokenise every training text: one list of tokens per tweet.
x_list_list = [sentence.split() for sentence in x_list]
# Displayed as the cell output: number of tokenised samples.
len(x_list_list)

1455563

In [14]:
from sklearn.model_selection import train_test_split
# Two-stage split with a fixed seed: first hold out 20% of all samples as the
# test set, then hold out 20% of the remainder as the validation set.
train_x, test_x, train_y, test_y = train_test_split(
    x_list_list, y_list, test_size=0.2, random_state=42
)
train_x, valid_x, train_y, valid_y = train_test_split(
    train_x, train_y, test_size=0.2, random_state=42
)
# Print the (inputs, labels) sizes of each split: train, test, validation.
for inputs, labels in ((train_x, train_y), (test_x, test_y), (valid_x, valid_y)):
    print(len(inputs), len(labels))

931560 931560
291113 291113
232890 232890


In [15]:
# Build a BiGRU classifier on top of a pretrained transformer embedding and
# train it on the train split while monitoring the validation split.
import kashgari
from kashgari.tasks.classification import BiGRU_Model
# NOTE(review): presumably makes kashgari build its recurrent layers with
# cuDNN-backed cells — confirm against the kashgari documentation.
kashgari.config.use_cudnn_cell = True
import logging
# DEBUG level makes kashgari log the config/vocab/checkpoint paths it loads.
logging.basicConfig(level='DEBUG')
from kashgari.embeddings import BertEmbedding
# The folder is resolved from model_name. Per the logged paths, kashgari looks
# for bert_config.json, vocab.txt and bert_model.ckpt inside it.
# NOTE(review): the recorded run stopped here with NotFoundError because
# bert_model.ckpt was not found under roberta-base — confirm the checkpoint is
# available in that layout before re-running.
bert_embed = BertEmbedding('/home/Danny/pretrain_model/{}'.format(model_name))
# assumes sequence_length=64 caps the number of tokens per sample — TODO confirm
model = BiGRU_Model(bert_embed, sequence_length=64)
# Positional arguments are train inputs/labels followed by validation
# inputs/labels; the epoch count comes from the configuration cell.
model.fit(train_x, 
          train_y, 
          valid_x, 
          valid_y,
          epochs=epochs,
         )

2020-12-01 14:33:50,565 [DEBUG] kashgari - ------------------------------------------------
2020-12-01 14:33:50,565 [DEBUG] kashgari - Loaded transformer model's vocab
2020-12-01 14:33:50,566 [DEBUG] kashgari - config_path       : /home/Danny/pretrain_model/roberta-base/bert_config.json
2020-12-01 14:33:50,566 [DEBUG] kashgari - vocab_path      : /home/Danny/pretrain_model/roberta-base/vocab.txt
2020-12-01 14:33:50,567 [DEBUG] kashgari - checkpoint_path : /home/Danny/pretrain_model/roberta-base/bert_model.ckpt
2020-12-01 14:33:50,567 [DEBUG] kashgari - Top 50 words    : ['13 850314647', '262 800385005', '11 800251374', '284 432911125', '290 394899794', '286 386139013', '257 357878752', '287 311196488', '12 215156821', '329 155236946', '326 154060431', '319 147178919', '318 142591644', '447 130810923', '338 116498242', '351 114784681', '383 108664122', '373 100357189', '366 93880741', '379 93284459', '340 88803471', '355 85749070', '531 85009762', '247 82642284', '307 77095226', '82 763

NotFoundError: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for /home/Danny/pretrain_model/roberta-base/bert_model.ckpt

In [None]:
# Save location; the folder name records the pretrained model and epoch count.
model_path = 'model/{}_epoch_{}'.format(model_name, epochs)
# Score the freshly trained model on the held-out test split, then persist it.
model.evaluate(test_x, test_y)
model.save(model_path)

In [None]:
# Rebuild the save location from the same model_name/epochs settings and load
# the model stored there, replacing whatever `model` currently refers to.
model_path = 'model/{}_epoch_{}'.format(model_name, epochs)
model = kashgari.utils.load_model(model_path)
# Optional sanity check of the reloaded model on the held-out test split:
# model.evaluate(test_x, test_y)

In [None]:
# Attach the tweet text to the rows marked 'test'.
# The frame is rebuilt from identification_df rather than merged onto the
# existing test_df, so the cell is idempotent: merging test_df into itself a
# second time would produce text_x/text_y columns and break the cells that
# read test_df['text'].
test_df = (
    identification_df[identification_df['identification'] == 'test']
    .merge(text_df, on='tweet_id')
)

In [None]:
# Raw (untokenised) text of every test tweet, in test_df row order.
text_list = test_df['text'].to_list()

In [None]:
# The model was fitted on whitespace-split token lists, so the test texts must
# be tokenised the same way before prediction; handing over raw strings would
# presumably make each string be consumed character by character.
predict_tokens = [text.split() for text in text_list]
predict_list = model.predict(predict_tokens)
# Show only a small preview instead of dumping every prediction into the output.
predict_list[:10]

In [None]:
# Add the predicted labels as a `predict` column, aligned with test_df's rows.
test_df = test_df.assign(predict=predict_list)

In [None]:
# Shape the submission table: keep the id and prediction columns and rename
# them to the `id` / `emotion` headers written to the CSV.
output_df = (
    test_df[['tweet_id', 'predict']]
    .rename(columns={'tweet_id':'id', 'predict':'emotion'})
)

In [None]:
# Write the submission CSV. The file name is built from the model_name setting,
# matching how the saved-model path is built, rather than from an attribute of
# the model object. The target folder is created first because to_csv does not
# create missing directories.
output_path = 'output/{}_epoch_{}.csv'.format(model_name, epochs)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
output_df.to_csv(output_path, index=False, header=True)