## Install required libraries

In [1]:
# !pip install matplotlib==3.1.0
# !pip install hazm
# !pip install parsivar
# !pip install keras-self-attention
# !pip install np_utils

# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fa.vec
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fa.zip
# !gunzip cc.fa.300.vec.gz

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import Libraries

In [5]:
import tensorflow as tf
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.models import Model, Sequential
from keras.layers import *
from keras.utils import to_categorical
from keras.utils import pad_sequences
from keras.metrics import categorical_accuracy
import sklearn.metrics as skm

# from parsivar import Normalizer, FindStems
# from parsivar import Tokenizer as Tokenizer_Parsivar

from keras.preprocessing.text import Tokenizer
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from collections import Counter
# from keras_self_attention import SeqSelfAttention


# Import & Analyze Dataset

In [6]:
# Convert dataframes to numpy arrays

dataset_train = pd.read_excel('drive/MyDrive/ShortPersianEmo 2023 Code/Data/ShortPersianEmo/train_fa.xlsx', header=None).dropna()
x_train = dataset_train[0]
y_train = dataset_train[1]

dataset_test = pd.read_excel('drive/MyDrive/ShortPersianEmo 2023 Code/Data/ShortPersianEmo/test_fa.xlsx', header=None).dropna()
x_test = dataset_test[0]
y_test = dataset_test[1]

x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

x_test = np.asarray(x_test)
y_test = np.asarray(y_test)

le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Prepare labels for categorical prediction
categorical_y_train = to_categorical(y_train, len(dataset_test[1].unique()))
categorical_y_test = to_categorical(y_test, len(dataset_test[1].unique()))


# Preprocess

In [7]:
import re

def preprocess_text(text):
  text = text.replace('_',' ')
  text = text.replace('.',' ')

  for ch in ['\\','`','*','%','^','?','…','{','}','[',']','(',')','>','#','+','-','.','$','/','\'','؛' ,'،','!','«','»','؟',':','\"','♥']:
      if ch in text:
          text = text.replace(ch,' ')

  text = re.sub('\d+', ' ', text)   # remove numbers
  text = re.sub('\s+', ' ', text)   # remove whitespaces

  return text

# Prepare FastText Model

In [None]:
EMBEDDING_FILE = 'drive/MyDrive/cc.fa.300.vec'

ft_model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE)

In [15]:
ft_words = []
for ft_word in ft_model.index_to_key:
    ft_words.append(ft_word)

In [16]:
embedding_list = list()
for w in ft_words:
  embedding_list.append(ft_model[w])

all_embedding = np.stack(embedding_list)
emb_mean, emb_std = all_embedding.mean(), all_embedding.std()

# Prepare data for Deep Learning model

In [17]:
# Apply preprocessing step to training data
train_docs = np.empty_like(x_train)
for index, document in enumerate(x_train):
  train_docs[index] = preprocess_text(document)

# Applying preprocessing step to test data
test_docs = np.empty_like(x_test)
for index, document in enumerate(x_test):
  test_docs[index] = preprocess_text(document)

In [18]:
num_words = 2000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_docs)

In [19]:
max_length = 100

In [20]:
# Embed training sequences
encoded_docs = tokenizer.texts_to_sequences(train_docs)
x_train_padded = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# Embed testing sequences
encoded_docs = tokenizer.texts_to_sequences(test_docs)
x_test_padded = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [22]:
embed_size = 300
nb_words = len(tokenizer.word_index)
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
embeddedCount = 0
N_embeddedCount = []
for word, i in tokenizer.word_index.items():
    i -= 1
    if word in ft_model.index_to_key:
        embedding_vector = ft_model[word]
        embedding_matrix[i] = embedding_vector
        embeddedCount += 1
    else:
        embedding_vector = ft_model['غیره']
        embedding_matrix[i] = embedding_vector
        N_embeddedCount.append(word)
print('total embedded:', embeddedCount, 'common words')
print('Embedding matrix shape:', embedding_matrix.shape)

total embedded: 9964 common words
Embedding matrix shape: (10927, 300)


In [23]:
len(N_embeddedCount)

963

# Deep Learning Models

In [24]:
import math
def ceiltoup(x):
  return math.ceil(x * 100) / 100.0

f1_macro = []
accuracy = []
for i in range(10):
  Fs_Model = Sequential()
  Fs_Model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], weights=[embedding_matrix], trainable=True))
  Fs_Model.add(GlobalMaxPool1D())
  Fs_Model.add(Dense(len(dataset_test[1].unique()), activation='softmax'))
  Fs_Model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  EarlyStopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=1e-3, patience=10, verbose=0, mode='auto', restore_best_weights=True)
  hist_blstm = Fs_Model.fit(x_train_padded, categorical_y_train, validation_data=(x_test_padded, categorical_y_test), callbacks=[EarlyStopping_callback], epochs=100, verbose=0)
  y_pred = np.argmax(Fs_Model.predict(x_test_padded), axis=1)
  f1_macro.append(skm.f1_score(y_test, y_pred, average="macro"))
  accuracy.append(skm.accuracy_score(y_test, y_pred, normalize=True))
print('f1 macro : ' + str(f1_macro) + '\nmean f1 macro : ' + str(ceiltoup(np.mean(f1_macro))))
print('accuracy : ' + str(accuracy) + '\nmean accuracy : ' + str(ceiltoup(np.mean(accuracy))))

f1 macro : [0.6137590497868128, 0.6277459932941163, 0.6104200308377825, 0.6302025958902978, 0.6064297765782783, 0.6162540493046403, 0.5992575554209606, 0.6126203255609913, 0.6256916262117099, 0.5980285364040134]
mean f1 macro : 0.62
accuracy : [0.6423357664233577, 0.6551094890510949, 0.6368613138686131, 0.6514598540145985, 0.6332116788321168, 0.6496350364963503, 0.6405109489051095, 0.6405109489051095, 0.6551094890510949, 0.6368613138686131]
mean accuracy : 0.65
