In [1]:
import tensorflow as tf
# List the GPUs TensorFlow can see and enable memory growth on each of them.
tf_config = tf.config.experimental
gpus = tf_config.list_physical_devices('GPU')
print(gpus)
for device in gpus:
    tf_config.set_memory_growth(device, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')]


In [2]:
# Run settings: the epoch count passed to training and the model label that
# is embedded in the saved-model and output file names.
epochs, model_name = 5, 'BiLSTM'

In [3]:
import json
import os

import numpy as np
import pandas as pd

In [4]:
# Directory holding the competition data files. It defaults to the original
# location but can be overridden with the KAGGLE_DATA_PATH environment
# variable, so the notebook is not tied to one machine's home directory.
# os.path.join(..., '') guarantees the trailing separator that the
# `data_path + filename` concatenations in later cells rely on.
data_path = os.path.join(
    os.environ.get('KAGGLE_DATA_PATH', '/home/Danny/Data-Mining/lab2/kaggle/data/'),
    '',
)

In [5]:
# Read the tweets file, which stores one JSON document per line.
# UTF-8 is requested explicitly because the tweets contain emoji and the
# default text encoding depends on the machine's locale. Blank lines are
# skipped because json.loads rejects an empty document.
with open(data_path + 'tweets_DM.json', 'r', encoding='utf-8') as file:
    json_list = [json.loads(line) for line in file if line.strip()]

In [6]:
# Flatten every raw record into a [tweet_id, text] pair. The hashtag words are
# appended to the tweet text so they become ordinary whitespace-separated tokens.
tweet_list = list()
# The loop variable must not be called `json`: that would rebind the imported
# json module to a dict and break any later (re-)run of the loading cell.
for record in json_list:
    tweet = record['_source']['tweet']
    text = tweet['text'] + ' ' + ' '.join(tweet['hashtags'])
    tweet_list.append([tweet['tweet_id'], text])

In [7]:
# Tabulate the [tweet_id, text] pairs so they can be joined with the other tables.
text_columns = ['tweet_id', 'text']
text_df = pd.DataFrame(data=tweet_list, columns=text_columns)

In [8]:
# Load the emotion table; it is later joined to the tweets on `tweet_id`.
emotion_csv = data_path + 'emotion.csv'
emotion_df = pd.read_csv(emotion_csv)

In [9]:
# Load the table whose `identification` column marks which tweet ids are test rows.
identification_csv = data_path + 'data_identification.csv'
identification_df = pd.read_csv(identification_csv)

In [10]:
# Keep only the rows flagged as 'test' in the identification table.
is_test = identification_df['identification'] == 'test'
test_df = identification_df[is_test]

In [11]:
# Pair each tweet's text with its emotion label by joining on tweet_id
# (pandas' default join type, so only ids present in both tables remain).
train_df = pd.merge(text_df, emotion_df, left_on='tweet_id', right_on='tweet_id')

In [12]:
# Pull the model inputs (tweet text) and targets (emotion labels) out as plain lists.
x_list, y_list = (train_df[column].tolist() for column in ('text', 'emotion'))

In [13]:
# Whitespace-tokenise every training tweet into a list of words,
# then report how many tweets were tokenised.
x_list_list = [sentence.split() for sentence in x_list]
len(x_list_list)

1455563

In [30]:
# Preview a handful of tokenised tweets. Echoing the whole list would dump
# every training tweet into the cell output and bloat the notebook.
x_list_list[:5]

[['People',
  'who',
  'post',
  '"add',
  'me',
  'on',
  '#Snapchat"',
  'must',
  'be',
  'dehydrated.',
  'Cuz',
  'man....',
  "that's",
  '<LH>',
  'Snapchat'],
 ['@brianklaas',
  'As',
  'we',
  'see,',
  'Trump',
  'is',
  'dangerous',
  'to',
  '#freepress',
  'around',
  'the',
  'world.',
  'What',
  'a',
  '<LH>',
  '<LH>',
  '#TrumpLegacy.',
  '#CNN',
  'freepress',
  'TrumpLegacy',
  'CNN'],
 ['Now', 'ISSA', 'is', 'stalking', 'Tasha', '😂😂😂', '<LH>'],
 ['@RISKshow',
  '@TheKevinAllison',
  'Thx',
  'for',
  'the',
  'BEST',
  'TIME',
  'tonight.',
  'What',
  'stories!',
  'Heartbreakingly',
  '<LH>',
  '#authentic',
  '#LaughOutLoud',
  'good!!',
  'authentic',
  'LaughOutLoud'],
 ['Still', 'waiting', 'on', 'those', 'supplies', 'Liscus.', '<LH>'],
 ['Love', 'knows', 'no', 'gender.', '😢😭', '<LH>'],
 ['@DStvNgCare',
  '@DStvNg',
  'More',
  'highlights',
  'are',
  'being',
  'shown',
  'than',
  'actual',
  'sports!',
  'Who',
  'watches',
  'triathlon',
  'highlights',
  

In [14]:
# y_list_list = list()
# for y in y_list:
#     y_list_list.append(y.split())
# len(y_list_list)

In [15]:
# x_list = list()
# for i in x:
#     x_list.append([i])

In [16]:
# y_list = list()
# for i in y:
#     y_list.append([i])

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled tweets for testing, then carve 20% of the
# remainder off as a validation set. The fixed seed keeps both splits repeatable.
split_options = dict(test_size=0.2, random_state=42)
train_x, test_x, train_y, test_y = train_test_split(x_list_list, y_list, **split_options)
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, **split_options)

# Sanity check: inputs and labels of each split must have matching sizes.
for features, labels in ((train_x, train_y), (test_x, test_y), (valid_x, valid_y)):
    print(len(features), len(labels))

931560 931560
291113 291113
232890 232890


In [18]:
import logging

import kashgari
from kashgari.tasks.classification import BiLSTM_Model

# NOTE(review): presumably selects the cuDNN-backed LSTM cell - confirm against kashgari docs.
kashgari.config.use_cudnn_cell = True
# DEBUG level so kashgari's vocabulary / sequence-length messages are shown.
logging.basicConfig(level='DEBUG')

# Train the BiLSTM classifier on the training split, validating on the
# validation split. The cell's value is the History object returned by fit.
model = BiLSTM_Model()
model.fit(train_x, train_y, valid_x, valid_y, epochs=epochs)

Preparing text vocab dict: 100%|██████████| 931560/931560 [00:04<00:00, 216533.27it/s]
Preparing text vocab dict: 100%|██████████| 232890/232890 [00:01<00:00, 199047.17it/s]
2020-11-30 13:09:25,342 [DEBUG] kashgari - --- Build vocab dict finished, Total: 257059 ---
2020-11-30 13:09:25,343 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '<LH>', 'the', 'to', 'I', 'a', 'and']
Preparing classification label vocab dict: 100%|██████████| 931560/931560 [00:00<00:00, 1944928.12it/s]
Preparing classification label vocab dict: 100%|██████████| 232890/232890 [00:00<00:00, 1920770.07it/s]
Calculating sequence length: 100%|██████████| 931560/931560 [00:00<00:00, 2062095.32it/s]
Calculating sequence length: 100%|██████████| 232890/232890 [00:00<00:00, 1996226.38it/s]
2020-11-30 13:09:27,318 [DEBUG] kashgari - Calculated sequence length = 27
2020-11-30 13:09:28,444 [DEBUG] kashgari - Model: "model_1"
_________________________________________________________________
Layer (type)       

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fbf67f44080>

In [19]:
# Score the trained model on the held-out test split, then save it under a
# directory named after the model and the epoch count.
path_template = 'model/{}_epoch_{}'
model_path = path_template.format(model_name, epochs)
model.evaluate(test_x, test_y)
model.save(model_path)

2020-11-30 16:35:14,238 [DEBUG] kashgari - predict input shape (291113, 49) x: 
[[   2  935    8 ...    0    0    0]
 [   2  788   61 ...    0    0    0]
 [   2 3465   80 ...    0    0    0]
 ...
 [   2    7 1600 ...    0    0    0]
 [   2    4    9 ...    0    0    0]
 [   2 2281 6163 ...    0    0    0]]
2020-11-30 16:36:06,599 [DEBUG] kashgari - predict output shape (291113, 8)
2020-11-30 16:36:06,795 [DEBUG] kashgari - predict output argmax: [0 1 0 ... 3 0 0]


              precision    recall  f1-score   support

       anger     0.3537    0.2479    0.2915      7946
anticipation     0.6124    0.5627    0.5865     49984
     disgust     0.3973    0.3853    0.3912     27669
        fear     0.4638    0.4454    0.4545     12846
         joy     0.6198    0.6873    0.6518    102943
     sadness     0.4419    0.4608    0.4512     38745
    surprise     0.2743    0.2784    0.2763      9816
       trust     0.4720    0.4118    0.4398     41164

    accuracy                         0.5317    291113
   macro avg     0.4544    0.4350    0.4429    291113
weighted avg     0.5270    0.5317    0.5279    291113



2020-11-30 16:36:16,171 [INFO] kashgari - model saved to /home/Danny/Data-Mining/lab2/kaggle/kashgari/model/BiLSTM_epoch_5


'/home/Danny/Data-Mining/lab2/kaggle/kashgari/model/BiLSTM_epoch_5'

In [20]:
# Reload the saved model from disk and evaluate it on the same test split,
# replacing the in-memory `model` with the reloaded one.
path_template = 'model/{}_epoch_{}'
model_path = path_template.format(model_name, epochs)
model = kashgari.utils.load_model(model_path)
model.evaluate(test_x, test_y)

  
2020-11-30 16:36:28,975 [DEBUG] kashgari - predict input shape (291113, 49) x: 
[[   2  935    8 ...    0    0    0]
 [   2  788   61 ...    0    0    0]
 [   2 3465   80 ...    0    0    0]
 ...
 [   2    7 1600 ...    0    0    0]
 [   2    4    9 ...    0    0    0]
 [   2 2281 6163 ...    0    0    0]]
2020-11-30 16:37:22,795 [DEBUG] kashgari - predict output shape (291113, 8)
2020-11-30 16:37:23,008 [DEBUG] kashgari - predict output argmax: [0 1 0 ... 3 0 0]


              precision    recall  f1-score   support

       anger     0.3537    0.2479    0.2915      7946
anticipation     0.6124    0.5627    0.5865     49984
     disgust     0.3973    0.3853    0.3912     27669
        fear     0.4638    0.4454    0.4545     12846
         joy     0.6198    0.6873    0.6518    102943
     sadness     0.4419    0.4608    0.4512     38745
    surprise     0.2743    0.2784    0.2763      9816
       trust     0.4720    0.4118    0.4398     41164

    accuracy                         0.5317    291113
   macro avg     0.4544    0.4350    0.4429    291113
weighted avg     0.5270    0.5317    0.5279    291113



{'detail': {'anger': {'precision': 0.35374393966600826,
   'recall': 0.2479234835137176,
   'f1-score': 0.291527931927488,
   'support': 7946},
  'anticipation': {'precision': 0.6124199799677742,
   'recall': 0.5627000640204866,
   'f1-score': 0.5865081847565426,
   'support': 49984},
  'disgust': {'precision': 0.3972577219717575,
   'recall': 0.3853409953377426,
   'f1-score': 0.3912086299258824,
   'support': 27669},
  'fear': {'precision': 0.46384565499351493,
   'recall': 0.44543048419741554,
   'f1-score': 0.454451592407275,
   'support': 12846},
  'joy': {'precision': 0.6197642030026452,
   'recall': 0.6873318244076819,
   'f1-score': 0.6518016295961936,
   'support': 102943},
  'sadness': {'precision': 0.44194969798989997,
   'recall': 0.4607820363917925,
   'f1-score': 0.45116943177366975,
   'support': 38745},
  'surprise': {'precision': 0.27428743476515455,
   'recall': 0.2784229828850856,
   'f1-score': 0.2763397371081901,
   'support': 9816},
  'trust': {'precision': 0.4719

In [21]:
# Attach the tweet text to every test-set id. The guard keeps the cell safe to
# re-run: merging a second time would produce `text_x` / `text_y` columns
# instead of `text` and break the later `test_df['text']` lookup.
if 'text' not in test_df.columns:
    test_df = test_df.merge(text_df, on='tweet_id')
test_df

Unnamed: 0,tweet_id,identification,text
0,0x28cc61,test,@Habbo I've seen two separate colours of the e...
1,0x2db41f,test,@FoxNews @KellyannePolls No serious self respe...
2,0x2466f6,test,"Looking for a new car, and it says 1 lady owne..."
3,0x23f9e9,test,@cineworld “only the brave” just out and fount...
4,0x1fb4e1,test,Felt like total dog 💩 going into open gym and ...
...,...,...,...
411967,0x2c4dc2,test,6 year old walks in astounded. Mum! Look how b...
411968,0x31be7c,test,Only one week to go until the #inspiringvolunt...
411969,0x1ca58e,test,"I just got caught up with the manga for ""My He..."
411970,0x35c8ba,test,Speak only when spoken to and make hot ass mus...


In [33]:
# Whitespace-tokenise the test tweets, the same way the training tweets were split.
text_list = [text.split() for text in test_df['text'].tolist()]
# Preview only: echoing the whole list would dump every test tweet into the output.
text_list[:5]

[['@Habbo',
  "I've",
  'seen',
  'two',
  'separate',
  'colours',
  'of',
  'the',
  'elegant',
  'furni',
  'on',
  'your',
  'homepage?',
  '<LH>'],
 ['@FoxNews',
  '@KellyannePolls',
  'No',
  'serious',
  'self',
  'respecting',
  'individual',
  'believes',
  'much',
  'less',
  'agrees',
  'w',
  'her',
  '<LH>'],
 ['Looking',
  'for',
  'a',
  'new',
  'car,',
  'and',
  'it',
  'says',
  '1',
  'lady',
  'owner.',
  'That',
  'means',
  'needs',
  'new',
  'clutch,',
  'gearbox',
  'and',
  'brakes.',
  '#womendrivers',
  '<LH>',
  'womendrivers'],
 ['@cineworld',
  '“only',
  'the',
  'brave”',
  'just',
  'out',
  'and',
  'fountain',
  'park',
  'have',
  'only',
  '3',
  'showings',
  'per',
  'day!',
  '<LH>',
  '#robbingmembers',
  'robbingmembers'],
 ['Felt',
  'like',
  'total',
  'dog',
  '💩',
  'going',
  'into',
  'open',
  'gym',
  'and',
  'had',
  'a',
  'migraine',
  'Played',
  'through',
  'and',
  'sweated',
  'it',
  'out',
  'and',
  'my',
  'sweat',
  'ha

In [34]:
# Predict an emotion label for every tokenised test tweet.
predict_list = model.predict(text_list)
# Show a short sample instead of echoing every prediction.
predict_list[:10]

2020-11-30 16:52:21,142 [DEBUG] kashgari - predict input shape (411972, 63) x: 
[[   2    1  187 ...    0    0    0]
 [   2  766 4411 ...    0    0    0]
 [   2  652   10 ...    0    0    0]
 ...
 [   2    7   32 ...    0    0    0]
 [   2 4451   59 ...    0    0    0]
 [   2 1811   47 ...    0    0    0]]
2020-11-30 16:53:40,263 [DEBUG] kashgari - predict output shape (411972, 8)
2020-11-30 16:53:40,507 [DEBUG] kashgari - predict output argmax: [0 4 0 ... 0 0 0]


In [36]:
# Add the predicted labels as a `predict` column next to each test tweet.
test_df = test_df.assign(predict=predict_list)
test_df

Unnamed: 0,tweet_id,identification,text,predict
0,0x28cc61,test,@Habbo I've seen two separate colours of the e...,joy
1,0x2db41f,test,@FoxNews @KellyannePolls No serious self respe...,disgust
2,0x2466f6,test,"Looking for a new car, and it says 1 lady owne...",joy
3,0x23f9e9,test,@cineworld “only the brave” just out and fount...,sadness
4,0x1fb4e1,test,Felt like total dog 💩 going into open gym and ...,disgust
...,...,...,...,...
411967,0x2c4dc2,test,6 year old walks in astounded. Mum! Look how b...,joy
411968,0x31be7c,test,Only one week to go until the #inspiringvolunt...,joy
411969,0x1ca58e,test,"I just got caught up with the manga for ""My He...",joy
411970,0x35c8ba,test,Speak only when spoken to and make hot ass mus...,joy


In [37]:
# Reduce the frame to the two output columns and rename them to `id` / `emotion`.
output_names = {'tweet_id': 'id', 'predict': 'emotion'}
output_df = test_df[['tweet_id', 'predict']].rename(columns=output_names)

In [39]:
# Write the predictions to a CSV named after the model and the epoch count.
output_path = 'output/{}_epoch_{}.csv'.format(model_name, epochs)
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists rather than failing at the very last step.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
output_df.to_csv(output_path, index=False, header=True)