In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
json_list = list()
with open('tweets_DM.json' , 'r') as file:
    for line in file:
        json_list.append(json.loads(line))

In [3]:
tweet_list = list()
for json in json_list:
    tweet_id = json['_source']['tweet']['tweet_id']
    hashtags = json['_source']['tweet']['hashtags']
    hashtag = ' '.join(hashtags)
    text = json['_source']['tweet']['text']
    text = text + ' ' + hashtag
    tweet_list.append([tweet_id, text])

In [4]:
text_df = pd.DataFrame(tweet_list, columns=['tweet_id', 'text'])

In [5]:
emotion_df = pd.read_csv('emotion.csv')
# emotion_df

In [6]:
identification_df = pd.read_csv('data_identification.csv')
# identification_df

In [7]:
test_df = identification_df[identification_df['identification'] == 'test']
# test_df

In [8]:
train_df = text_df.merge(emotion_df, left_on='tweet_id', right_on='tweet_id')
# train_df

In [9]:
x_list = train_df['text'].to_list()
y_list = train_df['emotion'].to_list()

In [10]:
# x_list = list()
# for i in x:
#     x_list.append([i])

In [11]:
# y_list = list()
# for i in y:
#     y_list.append([i])

In [12]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(x_list, y_list, test_size=0.2, random_state=42)
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [13]:
print(len(train_x), len(train_y))
print(len(test_x), len(test_y))
print(len(valid_y), len(valid_y))

931560 931560
291113 291113
232890 232890


In [14]:
import kashgari
from kashgari.tasks.classification import BiLSTM_Model
kashgari.config.use_cudnn_cell = True
import logging
logging.basicConfig(level='DEBUG')

model = BiLSTM_Model()
model.fit(train_x, 
          train_y, 
          valid_x, 
          valid_y,
          epochs=50,
         )
model.save('model/BiLSTM_epoch_50')

╭─────────────────────────────────────────────────────────────────────────╮
│ ◎ ○ ○ ░░░░░░░░░░░░░░░░░░░░░  Important Message  ░░░░░░░░░░░░░░░░░░░░░░░░│
├─────────────────────────────────────────────────────────────────────────┤
│                                                                         │
│              We renamed again for consistency and clarity.              │
│                   From now on, it is all `kashgari`.                    │
│  Changelog: https://github.com/BrikerMan/Kashgari/releases/tag/v1.0.0   │
│                                                                         │
│         | Backend          | pypi version   | desc           |          │
│         | ---------------- | -------------- | -------------- |          │
│         | TensorFlow 2.x   | kashgari 2.x.x | coming soon    |          │
│         | TensorFlow 1.14+ | kashgari 1.x.x |                |          │
│         | Keras            | kashgari 0.x.x | legacy version |          │
│           

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 165)]             0         
_________________________________________________________________
layer_embedding (Embedding)  (None, 165, 100)          258800    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               234496    
_________________________________________________________________
dense (Dense)                (None, 8)                 2056      
Total params: 495,352
Trainable params: 495,352
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22

In [15]:
model = kashgari.utils.load_model('model/BiLSTM_epoch_50')
model.evaluate(test_x, test_y)



              precision    recall  f1-score   support

       anger     0.6351    0.2576    0.3666      7946
anticipation     0.6842    0.6008    0.6398     49984
     disgust     0.4485    0.4757    0.4617     27669
        fear     0.7059    0.4441    0.5452     12846
         joy     0.6008    0.8117    0.6905    102943
     sadness     0.5139    0.5318    0.5227     38745
    surprise     0.7198    0.2298    0.3484      9816
       trust     0.6304    0.3713    0.4673     41164

    accuracy                         0.5931    291113
   macro avg     0.6173    0.4654    0.5053    291113
weighted avg     0.6028    0.5931    0.5794    291113



In [16]:
# model = kashgari.utils.load_model('model/BERT_base_BiLSTM_Model')
# model.evaluate(test_x, test_y)

In [17]:
# train_df

In [18]:
# test_df

In [19]:
test_df = test_df.merge(text_df, left_on='tweet_id', right_on='tweet_id')
# test_df

In [20]:
text_list = test_df['text'].tolist()
# text_list

In [21]:
predict_list = model.predict(text_list)
predict_list

['joy',
 'sadness',
 'joy',
 'joy',
 'disgust',
 'joy',
 'joy',
 'disgust',
 'anticipation',
 'disgust',
 'joy',
 'joy',
 'disgust',
 'trust',
 'joy',
 'joy',
 'disgust',
 'joy',
 'joy',
 'joy',
 'sadness',
 'joy',
 'disgust',
 'joy',
 'joy',
 'sadness',
 'disgust',
 'disgust',
 'joy',
 'sadness',
 'disgust',
 'joy',
 'joy',
 'trust',
 'disgust',
 'joy',
 'anticipation',
 'anticipation',
 'sadness',
 'sadness',
 'sadness',
 'disgust',
 'joy',
 'sadness',
 'sadness',
 'joy',
 'anticipation',
 'anticipation',
 'joy',
 'trust',
 'disgust',
 'joy',
 'joy',
 'joy',
 'joy',
 'disgust',
 'sadness',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'trust',
 'joy',
 'joy',
 'disgust',
 'joy',
 'joy',
 'trust',
 'trust',
 'joy',
 'sadness',
 'joy',
 'joy',
 'sadness',
 'disgust',
 'anticipation',
 'disgust',
 'joy',
 'sadness',
 'joy',
 'sadness',
 'joy',
 'joy',
 'disgust',
 'sadness',
 'joy',
 'joy',
 'joy',
 'joy',
 'sadness',
 'anticipation',
 'joy',
 'anticipation',
 'joy',
 'sadness',
 'sadness',


In [22]:
test_df['predict'] = predict_list
# test_df

In [23]:
output_df = test_df[['tweet_id', 'predict']]
output_df = output_df.rename(columns={'tweet_id':'id', 'predict':'emotion'})
# output_df

In [24]:
output_df.to_csv('../output/BiLSTM_epoch_50.csv', index=False, header=True)