In [1]:
import pandas as pd
import numpy as np
import nltk
import gensim
import warnings
import logging
import ssl
import urllib.request
import gensim.downloader as api
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn import preprocessing


warnings.filterwarnings('ignore')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
#Raw Data loading
tweets = pd.read_json('tweets_DM.json', lines=True)
emotion = pd.read_csv('emotion.csv')
trainOrTest = pd.read_csv('data_identification.csv')

In [3]:
#Use dict to store 'idenification' and 'emotion' based on 'tweet_id'
trainOrTestDict = {}; emoDict = {}
for id, type in zip(trainOrTest['tweet_id'].values, trainOrTest['identification'].values): trainOrTestDict[id] = type
for id, emo in zip(emotion['tweet_id'].values, emotion['emotion'].values): emoDict[id] = emo

In [5]:
label = ['score', 'hashtag', 'text', 'emotion'] #dataframe column label
train_score_list = []
train_hashtag_list = []
train_text_list = []
train_emotion_list = [] #train dataframe columns, including 'score', 'hashtag', 'text', 'emotion'

test_id_list = []
test_score_list = []
test_hashtag_list = []
test_text_list = [] #test dataframe columns, including 'id', 'score', 'text', 'hashtag'

#however, I only used 'text', 'emotion' in train df and 'id', 'text' in test df

for score, source in zip(tweets['_score'], tweets['_source']):
    id = source['tweet']['tweet_id']
    if trainOrTestDict[id] == 'train':
        train_score_list.append(score)
        train_hashtag_list.append(source['tweet']['hashtags'])
        train_text_list.append(source['tweet']['text'].replace('<LH>','')) # Remove "<LH>" in text, because that was generated from Linux system,
                                                                           # which should not occur in a normal sentence.
        train_emotion_list.append(emoDict[id])
    elif trainOrTestDict[id] == 'test':
        test_id_list.append(id)
        test_score_list.append(score)
        test_hashtag_list.append(source['tweet']['hashtags'])
        test_text_list.append(source['tweet']['text'].replace('<LH>','')) # Same operation as above
        
train_df = pd.DataFrame(np.array([train_score_list, train_hashtag_list, train_text_list, train_emotion_list]).T, columns=label)
test_df = pd.DataFrame(np.array([test_id_list, test_score_list, test_hashtag_list, test_text_list]).T, columns=['id'] + label[:-1])


In [42]:
#tokenizing both text data
train_df['tokenized'] = train_df['text'].apply(lambda x:nltk.word_tokenize(x))
test_df['tokenized'] = test_df['text'].apply(lambda x:nltk.word_tokenize(x))

In [90]:
#use LabelBinarizer to generate one-hot expression of emotion
mlb = preprocessing.LabelBinarizer()
mlb.fit(train_df['emotion'])
train_df['one_hot_emotion'] = mlb.transform(train_df['emotion']).tolist()

In [10]:
#save file, so next time won't do above again
train_df.to_json('train.json')
test_df.to_json('test.json')

# First Try
##### Sum the word embeddings of a sentence using pretrained glove-twitter-200 model, then train it with fully-connected NN
##### Idea: Words with similar emotion should have similar word embeddings, so adding them up may outstand some features.

In [None]:
#load data generated from above
train_df = pd.read_json('train.json')
test_df = pd.read_json('test.json')

In [None]:
#load pretrain model
glove_twitter_200_model = api.load("glove-twitter-200")

In [40]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

NUM_CLASS = len(train_df['one_hot_emotion'][0])
BATCH_SIZE = 256

#fully connected neural network
def model(input_shape):
    m = tf.keras.Sequential([
        tf.keras.layers.Dense(512, input_shape = input_shape, activation='relu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(NUM_CLASS, activation='softmax'),
    ])
    m.compile(optimizer='adam',
              loss=tf.keras.losses.categorical_crossentropy,
              metrics=[tf.keras.metrics.CategoricalAccuracy(),
                       tf.keras.metrics.Precision(),
                       tf.keras.metrics.Recall()])
    m.summary()
    return m

#Summation of word embeddings
def handleInput(w2v_model, tokenized): 
    ret = []
    sentenceVec = np.zeros(w2v_model['.'].shape)
    for words in tokenized:
        for word in words:
            try:
                sentenceVec = w2v_model[word.lower()]
            except:
                continue
        ret.append(sentenceVec)
    return np.array(ret)

#function to generate output that will submit to Kaggle
def generateOutput(X, out_path, m = None, model_path = 'Trial_fc'):
    if not m:
        m = tf.keras.models.load_model(model_path)
    res = m.predict(X, batch_size=BATCH_SIZE).tolist()
    mlb = preprocessing.LabelBinarizer()
    mlb.fit(train_df['emotion'])
    for i in range(len(res)):
        top = max(res[i])
        for j in range(len(res[i])):
            if res[i][j] != top: res[i][j] = 0
            else: res[i][j] = 1
                
    res = mlb.inverse_transform(np.array(res))
    out_df = pd.DataFrame(np.concatenate([test_df['id'].to_numpy().astype(np.str_)[np.newaxis, :], res[np.newaxis, :]], axis = 0).T,\
                          columns=['id', 'emotion'])
    out_df.to_csv(out_path, index=False)
    

In [18]:
#caculate the summation word embeddings based on their tokenized sentence
X = handleInput(glove_twitter_200_model, train_df['tokenized'])
tX = handleInput(glove_twitter_200_model, test_df['tokenized'])
# save them, it takes some times and I don't want to wait for them every time.
# np.save('X', X)
# np.save('tX', tX)

In [5]:
#Loaddddddd
X = np.load('X.npy')
tX = np.load('tX.npy')

In [33]:
#train_test_splittttt
X_train, X_test, y_train, y_test = train_test_split(X, np.array(train_df['one_hot_emotion'].to_list()), test_size=0.2, random_state=42)

In [41]:
#build model
m = model(X_train[0].shape)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 512)               102912    
                                                                 
 dropout_4 (Dropout)         (None, 512)               0         
                                                                 
 dense_7 (Dense)             (None, 256)               131328    
                                                                 
 dropout_5 (Dropout)         (None, 256)               0         
                                                                 
 dense_8 (Dense)             (None, 8)                 2056      
                                                                 
Total params: 236,296
Trainable params: 236,296
Non-trainable params: 0
_________________________________________________________________


In [None]:
#train it!
m.fit(x=X_train, y=y_train, epochs=20, batch_size=BATCH_SIZE)

In [None]:
m.evaluate(x=X_test, y=y_test)

In [57]:
generateOutput(tX, 'sub.csv')



##### First try is somehow better (scoring 0.32886) than predicting all output as 'joy'(scoring 0.30765), and I think maybe fined-tune the word2vec model before extracting word embedding from it might get a more power solution.
##### Note that the reason predicting all output as 'joy' is that 'joy' holding a great proportion in raw data.

# Second try
##### Train a roBERTa model mentioned in the class, and predicting the output by a fully-connected neural network.
##### Idea: roBERTa sounds a powerful model!

In [None]:
import numpy as np
import regex as re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import math
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.keras.backend as K
import tokenizers
from transformers import RobertaTokenizer, TFRobertaModel

from collections import Counter

import warnings
tf.keras.mixed_precision.set_global_policy('mixed_float16')
warnings.filterwarnings("ignore")
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# Detect hardware, return appropriate distribution strategy (you can see that it is pretty easy to set up).
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is set (always set in Kaggle)
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print('Running on TPU ', tpu.master())
except ValueError:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print('Number of replicas:', strategy.num_replicas_in_sync)

Number of replicas: 1


In [None]:
MODEL_NAME = 'roberta-base'
MAX_LEN = 64

BATCH_SIZE = 64 * strategy.num_replicas_in_sync
EPOCHS = 3

### Load data

In [None]:
df = pd.read_json('train.json')
df.head()

Unnamed: 0,score,hashtag,text,emotion,tokenized,one_hot_emotion
0,391,[Snapchat],"People who post ""add me on #Snapchat"" must be ...",anticipation,"[People, who, post, ``, add, me, on, #, Snapch...","[0, 1, 0, 0, 0, 0, 0, 0]"
1,433,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #...",sadness,"[@, brianklaas, As, we, see, ,, Trump, is, dan...","[0, 0, 0, 0, 0, 1, 0, 0]"
2,376,[],Now ISSA is stalking Tasha 😂😂😂,fear,"[Now, ISSA, is, stalking, Tasha, 😂😂😂]","[0, 0, 0, 1, 0, 0, 0, 0]"
3,120,"[authentic, LaughOutLoud]",@RISKshow @TheKevinAllison Thx for the BEST TI...,joy,"[@, RISKshow, @, TheKevinAllison, Thx, for, th...","[0, 0, 0, 0, 1, 0, 0, 0]"
4,1021,[],Still waiting on those supplies Liscus.,anticipation,"[Still, waiting, on, those, supplies, Liscus, .]","[0, 1, 0, 0, 0, 0, 0, 0]"


In [None]:
X_data = df[['text']].to_numpy().reshape(-1)
y_data = df[['emotion']].to_numpy().reshape(-1)

### Tokenize&Encode

In [None]:
def roberta_encode(texts, tokenizer):
    ct = len(texts)
    input_ids = np.ones((ct, MAX_LEN), dtype='int32')
    attention_mask = np.zeros((ct, MAX_LEN), dtype='int32')
    token_type_ids = np.zeros((ct, MAX_LEN), dtype='int32') # Not used in text classification

    for k, text in enumerate(texts):
        #tokenize
        tok_text = tokenizer.tokenize(text)
        
        #truncate and convert tokens to numerical IDs
        enc_text = tokenizer.convert_tokens_to_ids(tok_text[:(MAX_LEN-2)])
        
        input_length = len(enc_text) + 2
        input_length = input_length if input_length < MAX_LEN else MAX_LEN
        
        #add tokens [CLS] and [SEP] at the beginning and the end
        input_ids[k,:input_length] = np.asarray([0] + enc_text + [2], dtype='int32')
        
        #set to 1s in the attention input
        attention_mask[k,:input_length] = 1

    return {
        'input_word_ids': input_ids,
        'input_mask': attention_mask,
        'input_type_ids': token_type_ids
    }

In [None]:
#transform categories into numbers
category_to_id = {}
category_to_name = {}

for index, c in enumerate(y_data):
    if c in category_to_id:
        category_id = category_to_id[c]
    else:
        category_id = len(category_to_id)
        category_to_id[c] = category_id
        category_to_name[category_id] = c
    
    y_data[index] = category_id

n_categories = len(list(category_to_name.keys()))
category_to_name

{0: 'anticipation',
 1: 'sadness',
 2: 'fear',
 3: 'joy',
 4: 'anger',
 5: 'trust',
 6: 'disgust',
 7: 'surprise'}

In [None]:
#import tokenizer from HuggingFace
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

#split into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=777) # random_state to reproduce results

X_train = roberta_encode(X_train, tokenizer)
X_test = roberta_encode(X_test, tokenizer)

y_train = np.asarray(y_train, dtype='int32')
y_test = np.asarray(y_test, dtype='int32')

In [None]:
#save them 
np.save('roberta_xtrain', X_train)
np.save('roberta_xtest', X_test)
np.save('roberta_ytrain', y_train)
np.save('roberta_ytest', y_test)

In [None]:
n_categories = 8
X_train = np.load('roberta_xtrain.npy', allow_pickle=True).item()
X_test = np.load('roberta_xtest.npy', allow_pickle=True).item()
y_train = np.load('roberta_ytrain.npy')
y_test = np.load('roberta_ytest.npy')

### Build model

In [None]:
def build_model(n_categories):
    with strategy.scope():
        input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
        input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')
        input_type_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_type_ids')

        # Import RoBERTa model from HuggingFace
        roberta_model = TFRobertaModel.from_pretrained(MODEL_NAME)
        x = roberta_model(input_word_ids, attention_mask=input_mask, token_type_ids=input_type_ids)

        # Huggingface transformers have multiple outputs, embeddings are the first one,
        # so let's slice out the first position
        x = x[0]

        x = tf.keras.layers.Dropout(0.1)(x)
        x = tf.keras.layers.Flatten()(x)
        x = tf.keras.layers.Dense(256, activation='relu')(x)
        x = tf.keras.layers.Dense(n_categories, activation='softmax')(x)

        model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=x)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(lr=1e-5),
            loss=tf.metrics.sparse_categorical_crossentropy,
            metrics=['accuracy'])

        return model
    

In [None]:
with strategy.scope():
    model = build_model(n_categories)
    model.summary()

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 64)]         0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 64)]         0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 64)]         0           []                               
                                                                                                  
 tf_roberta_model (TFRobertaMod  TFBaseModelOutputWi  124645632  ['input_word_ids[0][0]',         
 el)                            thPoolingAndCrossAt               'input_mask[0][0]',         

### Train it

In [None]:
with strategy.scope():
    print('Training...')
    history = model.fit(X_train,
                        y_train,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        verbose=1,
                        validation_data=(X_test, y_test))

### Evaluation

In [None]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))

Accuracy: 68.20%


### Predict and generate output file

In [None]:

def generatePrediciton(X_dut, id_dut, model, out_path):
    y_pred = [category_to_name[np.argmax(i)] for i in model.predict(X_dut)]
    df_out = pd.DataFrame(np.concatenate([id_dut[np.newaxis, :], np.array(y_pred)[np.newaxis, :]], axis=0).T, columns=['id', 'emotion'])
    df_out.to_csv(out_path, index=False)

In [None]:
df_dut = pd.read_json('dut.json')
id_dut = df_dut['id'].to_numpy().reshape(-1)
#X_dut = roberta_encode(df_dut['text'].to_numpy().reshape(-1), tokenizer) #used for first time
#np.save('roberta_xdut', X_dut)
X_dut = np.load('roberta_xdut.npy', allow_pickle=True).item()

In [None]:
generatePrediciton(X_dut, id_dut, model, 'submission.csv')

##### This method scored 0.56233 which is my best submission and the truth it was ranking third at that time satisfied me. 