In [1]:
# Notebook-wide hyperparameters.
#   max_features - reused further down as the training batch size and embedded
#                  in the names of the saved model / submission file
#   maxlen       - number of token positions every tweet is padded to in word2vec
#   epochs       - number of training epochs passed to model.fit
max_features, maxlen, epochs = 128, 128, 10

In [2]:
import os
# Expose only the GPU with index 2 to CUDA-based libraries loaded later in this kernel.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})

In [3]:
import pandas as pd
import numpy as np
import json

In [4]:
# Load the raw tweet dump: the file holds one JSON document per line.
# The encoding is pinned to UTF-8 because the tweets contain emoji and the
# platform default encoding is not guaranteed to be UTF-8; blank lines are
# skipped so a trailing newline cannot crash json.loads.
with open('../data/tweets_DM.json', 'r', encoding='utf-8') as file:
    json_list = [json.loads(line) for line in file if line.strip()]

In [5]:
# Flatten every raw record into [tweet_id, text], where text is the tweet body
# followed by its hashtags (space separated).
# The loop variable is deliberately NOT called `json`: that name would rebind
# the imported json module and break any later re-run of the loading cell.
tweet_list = list()
for record in json_list:
    tweet = record['_source']['tweet']
    tweet_id = tweet['tweet_id']
    hashtags = tweet['hashtags']
    hashtag = ' '.join(hashtags)
    text = tweet['text']
    text = text + ' ' + hashtag
    tweet_list.append([tweet_id, text])

In [6]:
# Tabular view of the parsed tweets: one row per tweet, id plus combined text.
text_df = pd.DataFrame(
    data=tweet_list,
    columns=['tweet_id', 'text'],
)

In [7]:
# Emotion label per tweet_id (columns: tweet_id, emotion).
emotion_df = pd.read_csv(filepath_or_buffer='../data/emotion.csv')
emotion_df

Unnamed: 0,tweet_id,emotion
0,0x3140b1,sadness
1,0x368b73,disgust
2,0x296183,anticipation
3,0x2bd6e1,joy
4,0x2ee1dd,anticipation
...,...,...
1455558,0x38dba0,joy
1455559,0x300ea2,joy
1455560,0x360b99,fear
1455561,0x22eecf,joy


In [8]:
# Split assignment per tweet_id: the identification column holds 'train' or 'test'.
identification_df = pd.read_csv(filepath_or_buffer='../data/data_identification.csv')
identification_df

Unnamed: 0,tweet_id,identification
0,0x28cc61,test
1,0x29e452,train
2,0x2b3819,train
3,0x2db41f,test
4,0x2a2acc,train
...,...,...
1867530,0x227e25,train
1867531,0x293813,train
1867532,0x1e1a7e,train
1867533,0x2156a5,train


In [9]:
# Keep only the rows flagged as belonging to the test split.
test_df = identification_df.loc[identification_df['identification'].eq('test')]
test_df

Unnamed: 0,tweet_id,identification
0,0x28cc61,test
3,0x2db41f,test
15,0x2466f6,test
23,0x23f9e9,test
31,0x1fb4e1,test
...,...,...
1867495,0x2c4dc2,test
1867496,0x31be7c,test
1867500,0x1ca58e,test
1867515,0x35c8ba,test


In [10]:
# Training set = tweets that have an emotion label (inner join on tweet_id).
train_df = pd.merge(text_df, emotion_df, on='tweet_id')
train_df

Unnamed: 0,tweet_id,text,emotion
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",anticipation
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",sadness
2,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,fear
3,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,joy
4,0x2c91a8,Still waiting on those supplies Liscus. <LH>,anticipation
...,...,...,...
1455558,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,joy
1455559,0x38959e,In every circumtance I'd like to be thankful t...,joy
1455560,0x2cbca6,there's currently two girls walking around the...,joy
1455561,0x24faed,"Ah, corporate life, where you can date <LH> us...",joy


In [11]:
from sklearn import preprocessing

# Map each emotion string to an integer class id; `le` is kept so predictions
# can be converted back to emotion names later on.
le = preprocessing.LabelEncoder()
train_df['label'] = le.fit_transform(train_df['emotion'])
train_df

Unnamed: 0,tweet_id,text,emotion,label
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",anticipation,1
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",sadness,5
2,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,fear,3
3,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,joy,4
4,0x2c91a8,Still waiting on those supplies Liscus. <LH>,anticipation,1
...,...,...,...,...
1455558,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,joy,4
1455559,0x38959e,In every circumtance I'd like to be thankful t...,joy,4
1455560,0x2cbca6,there's currently two girls walking around the...,joy,4
1455561,0x24faed,"Ah, corporate life, where you can date <LH> us...",joy,4


In [12]:
# Attach the tweet text to every test row (inner join on tweet_id).
# Only the two identification columns are taken from the current test_df, so
# re-running this cell rebuilds the same frame instead of merging the text in
# a second time and producing text_x / text_y columns.
test_df = test_df[['tweet_id', 'identification']].merge(text_df, on='tweet_id')
test_df

Unnamed: 0,tweet_id,identification,text
0,0x28cc61,test,@Habbo I've seen two separate colours of the e...
1,0x2db41f,test,@FoxNews @KellyannePolls No serious self respe...
2,0x2466f6,test,"Looking for a new car, and it says 1 lady owne..."
3,0x23f9e9,test,@cineworld “only the brave” just out and fount...
4,0x1fb4e1,test,Felt like total dog 💩 going into open gym and ...
...,...,...,...
411967,0x2c4dc2,test,6 year old walks in astounded. Mum! Look how b...
411968,0x31be7c,test,Only one week to go until the #inspiringvolunt...
411969,0x1ca58e,test,"I just got caught up with the manga for ""My He..."
411970,0x35c8ba,test,Speak only when spoken to and make hot ass mus...


In [13]:
# Locations of the pretrained BERT files; the individual file paths are
# derived from the checkpoint directory so it only has to be edited once.
pretrained_path = '/home/Danny/Data-Mining/lab2/kaggle/embedding/bert_tiny'
config_path = pretrained_path + '/bert_config.json'
checkpoint_path = pretrained_path + '/bert_model.ckpt'
dict_path = pretrained_path + '/vocab.txt'

In [14]:
from keras_bert import extract_embeddings

def word2vec(sentence_list, maxlen):
    """Embed sentences with the pretrained BERT model and pad them to a fixed length.

    Parameters
    ----------
    sentence_list : sequence of str
        Sentences to embed (a list or any iterable such as a pandas Series).
    maxlen : int
        Number of token positions kept per sentence.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(sentence_list), maxlen, embedding_dim). Sentences
        with fewer than `maxlen` token vectors are zero-padded at the end;
        longer ones are truncated to their first `maxlen` token vectors.
        An empty input yields an array of shape (0, maxlen, 0).

    Notes
    -----
    The result is a dense float64 array holding every sentence at once.
    NOTE(review): for inputs with millions of sentences this is presumably
    far too large for memory; consider batching - confirm.
    """
    sentences = list(sentence_list)
    if not sentences:
        # No sentence means no embedding to read the feature width from.
        return np.zeros((0, maxlen, 0))

    # One 2-D array (tokens, embedding_dim) per sentence.
    embeddings = extract_embeddings(pretrained_path, sentences)
    embedding_dim = embeddings[0].shape[1]

    x_train = np.zeros((len(sentences), maxlen, embedding_dim))
    for i, token_vectors in enumerate(embeddings):
        # Truncate over-long sentences rather than leaving the row all-zero.
        n_tokens = min(len(token_vectors), maxlen)
        x_train[i, :n_tokens] = token_vectors[:n_tokens]
    return x_train

In [None]:
# Padded BERT embeddings for every training tweet.
x_array = word2vec(sentence_list=train_df['text'].tolist(), maxlen=maxlen)

In [None]:
# Integer class ids aligned row-for-row with x_array.
y_array = np.asarray(train_df['label'])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled tweets for validation; the seed is fixed so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_array,
    y_array,
    test_size=0.3,
    random_state=40,
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
# one-hot encoding
from keras.utils import to_categorical 
# The one-hot width is fixed to the number of classes known to the label
# encoder, so both splits get the same number of columns even if one of them
# happens not to contain the highest class id.
y_train = to_categorical(y_train, num_classes=len(le.classes_))
y_test = to_categorical(y_test, num_classes=len(le.classes_))

In [None]:
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
from keras_bert import extract_embeddings
import re, os
import codecs

In [None]:
from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
from tensorflow.python import keras

In [None]:
from keras.models import Sequential

# Feed-forward classifier over the padded token embeddings.
# word2vec returns a 3-D array (samples, maxlen, embedding_dim), so each sample
# is flattened into a single vector before the first Dense layer; a Dense layer
# declared with a scalar input_dim would reject the 3-D training array.
# The input shape is read from x_train and the number of output units from the
# label encoder, so neither has to be kept in sync by hand.
model = Sequential()
model.add(Flatten(input_shape=x_train.shape[1:]))
model.add(Dense(512, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(le.classes_), activation='softmax'))

model.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])
model.summary()

In [None]:
# Train with the held-out split as validation data, then persist the model.
# Note: max_features doubles as the batch size here.
model.fit(x_train, y_train, batch_size=max_features, epochs=epochs, validation_data=(x_test, y_test))
model_path = 'model/keras_bert_len_{}_epo_{}'.format(max_features, epochs)
# Make sure the target directory exists before saving.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

In [None]:
# from keras.models import Sequential

# model = Sequential()
# model.add(Dense(512, activation='relu', input_dim=max_features))
# model.add(Dense(256, activation='relu'))
# model.add(Dense(128, activation='relu'))
# model.add(Dense(64, activation='relu'))
# model.add(Dense(8, activation='softmax'))

# model.compile(optimizer='adam', 
#               loss='categorical_crossentropy', 
#               metrics=['accuracy'])
# model.summary()

In [None]:
# model.fit(x_train, y_train, batch_size=max_features, epochs=epochs, validation_data=(x_test, y_test))
# model_path = 'model/keras_bert_len_{}_epo_{}'.format(max_features, epochs)
# model.save(model_path)

In [None]:
from tensorflow import keras
# Reload the model that the training cell saved (model_path), so the evaluation
# and the predictions below belong to the model described by the output file
# names, rather than to a 'model/test' artefact this notebook never writes.
model = keras.models.load_model(model_path)
model.evaluate(x_test, y_test)

In [None]:
# Predicted class id = index of the largest softmax output.
# Sequential.predict_classes is deprecated and absent from newer Keras
# releases; argmax over predict() is the documented replacement.
y_pred = np.argmax(model.predict(x_test), axis=-1)
y_pred.shape

In [None]:
# Collapse the one-hot validation labels back to integer class ids so they are
# comparable with y_pred.
y_test = y_test.argmax(axis=1)
y_test.shape

In [None]:
# Emotion names ordered by class id, used as row labels in the report.
label_count = train_df['label'].nunique()
target_names = le.inverse_transform(list(range(label_count)))
target_names

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names))

In [None]:
# Embed the test tweets. A plain list is passed, matching the training call.
# Note: this rebinds x_array, which previously held the training features.
x_array = word2vec(test_df['text'].tolist(), maxlen)
x_array.shape

In [None]:
# Class ids for the test tweets; argmax over predict() replaces the deprecated
# Sequential.predict_classes.
y_pred = np.argmax(model.predict(x_array, batch_size=10), axis=-1)
y_pred.shape

In [None]:
# Translate predicted class ids back into emotion names.
predict_array = le.inverse_transform(y=y_pred)
predict_array.shape

In [None]:
# Store the predicted emotion next to each test tweet.
test_df = test_df.assign(predict=predict_array)
test_df

In [None]:
# Submission frame: keep the id and the prediction, renamed to id / emotion.
output_df = (
    test_df[['tweet_id', 'predict']]
    .rename(columns={'tweet_id':'id', 'predict':'emotion'})
)
output_df

In [None]:
# Write the submission CSV; the file name records max_features and epochs.
output_path = '../../output/keras_bert_len_{}_epo_{}.csv'.format(max_features, epochs)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
output_df.to_csv(output_path, index=False, header=True)