In [1]:
# Install gdown (used by the following cells to download the dataset files) via conda, non-interactively (-y).
!conda install -y gdown

Download the clean dataset

In [2]:
# Download one clean-dataset file from Google Drive by file id (which file this id maps to is not visible here).
# NOTE(review): recent gdown releases deprecate the `--id` flag — confirm against the installed version.
!gdown --id 1FVn-onNGHAxs_Xw8Nh0mkvxCQdak0Hgj

In [3]:
# Download the second clean-dataset file from Google Drive by file id (which file this id maps to is not visible here).
# NOTE(review): recent gdown releases deprecate the `--id` flag — confirm against the installed version.
!gdown --id 1-0RazmVNm7eB707h9otYoaJk8zOHrNca

# install packages

In [4]:
!pip install parsivar

In [5]:
!pip install hazm

In [6]:
!pip install googletrans==4.0.0-rc1

In [7]:
!pip install finglish

In [8]:
!pip install langdetect

In [13]:
# NOTE(review): the imports cell does `from textblob import Word`, but the installation
# below is commented out — re-enable it if textblob is not already available in the environment.
# !pip install -U textblob
# !python -m textblob.download_corpora

In [9]:
!pip install transformers

# imports

In [10]:
from googletrans import Translator
import parsivar
from finglish import f2p
import hazm
import requests
import io
from textblob import Word
import os
from langdetect import detect

In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Input, Conv1D, MaxPooling1D, LSTM, Dense, Dropout, GlobalMaxPooling1D
from tensorflow.keras import Model
import keras

In [86]:
from transformers import BertTokenizer, TFBertModel
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel

In [14]:
# Show the TensorFlow and Keras versions in use.
tf.__version__, keras.__version__

# utility

In [15]:
def get_stopwords():
    """
    Download the Persian stopword list and return it as a set of words.

    The list is fetched from a raw GitHub URL (one word per line). Blank lines
    are dropped and surrounding whitespace (including a trailing carriage
    return) is stripped so entries compare equal to tokenizer output.

    :return: set of stopword strings
    :raises requests.RequestException: on network failure, timeout or a
        non-2xx HTTP status (so an error page is never mistaken for the list)
    """
    url = "https://raw.githubusercontent.com/zolfaShefreie/Sentiment-Analysis-of-SnappFood-Comments/main/persian_stopword"
    # A timeout prevents the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    content = response.content.decode("utf-8")
    return {line.strip() for line in content.splitlines() if line.strip()}


In [16]:
class TextCleaner:
    """
    Class-level text-cleaning pipeline for comments (no instance is needed).

    clean_text() is the entry point: normalize -> remove_punc -> word_cleaning,
    followed by zero-width-non-joiner removal and whitespace collapsing.
    All resources below are created once, when the class body is executed.
    """
    # Characters that remove_punc() replaces with a space. The backslash is
    # written as an escaped "\\" so the literal no longer relies on an
    # invalid escape sequence; the resulting characters are unchanged.
    punc = '''!()-[]{.};:'"\\,<>/?@#$%^&*_~`|’“”…—–'''
    normalizer = parsivar.Normalizer(date_normalizing_needed=True,
                                     statistical_space_correction=True)
    stemmer = parsivar.FindStems()
    lemmatizer = hazm.Lemmatizer()
    tokenizer = parsivar.Tokenizer()
    translator = Translator()
    # Fetched over the network by get_stopwords() at class-definition time.
    stopwords = get_stopwords()

    @classmethod
    def normalize(cls, text):
        """
        Convert the text to Persian and normalize it.

        English text is translated to Persian, text detected as any other
        non-Persian language is passed through f2p, and the result is run
        through the parsivar normalizer.

        :param text: raw comment string
        :return: normalized string
        """
        # Local import: only this method needs the exception type.
        from langdetect.lang_detect_exception import LangDetectException

        # Detect exactly once: the original called detect() twice, so the two
        # branch conditions could be evaluated against different answers.
        try:
            language = detect(text)
        except LangDetectException:
            # Raised when the text has no usable features (e.g. empty or
            # digits only); skip conversion and only normalize.
            language = None

        if language == "en":
            text = cls.translator.translate(text, src="en", dest="fa").text
        elif language is not None and language != "fa":
            text = f2p(text)
        return cls.normalizer.normalize(text)

    @classmethod
    def remove_punc(cls, text):
        """
        Replace every character listed in ``punc`` with a single space.
        """
        for each in cls.punc:
            text = text.replace(each, ' ')
        return text

    @classmethod
    def _stem(cls, word):
        """
        Stem a word with parsivar; if the result contains '&', keep the part before it.
        """
        return cls.stemmer.convert_to_stem(word).split('&')[0]

    @classmethod
    def _lemmatize(cls, word):
        """
        Lemmatize a word with hazm; if the result contains '#', keep the part before it.
        """
        return cls.lemmatizer.lemmatize(word).split('#')[0]

    @classmethod
    def stem_lemmet_word(cls, word):
        """
        Apply stemming and then lemmatization to a single (stripped) word.
        """
        return cls._lemmatize(cls._stem(word.strip()))

    @classmethod
    def word_cleaning(cls, text, use_stemm=True, use_lemmet=True,
                      remove_stopword=True):
        """
        Tokenize the text and optionally stem, lemmatize and drop stopwords.

        :param text: input string
        :param use_stemm: stem every token when True
        :param use_lemmet: lemmatize every token when True
        :param remove_stopword: drop tokens found in ``stopwords`` when True
            (the check is made on the token after stemming/lemmatization)
        :return: the kept tokens, each preceded by one space
        """
        kept = []
        for token in cls.tokenizer.tokenize_words(text):
            if use_stemm:
                token = cls._stem(token)
            if use_lemmet:
                token = cls._lemmatize(token)
            if remove_stopword and token in cls.stopwords:
                continue
            kept.append(token)
        # Same output format as before (leading space per token), built in one pass.
        return "".join(" " + token for token in kept)

    @classmethod
    def clean_text(cls, text, use_stemm=False, use_lemmet=False,
                   remove_stopword=False):
        """
        Run the full cleaning pipeline on one comment.

        :param text: raw comment string
        :param use_stemm: forwarded to word_cleaning
        :param use_lemmet: forwarded to word_cleaning
        :param remove_stopword: forwarded to word_cleaning
        :return: cleaned text with single spaces between tokens
        """
        text = cls.normalize(text)
        text = cls.remove_punc(text)
        text = cls.word_cleaning(text, use_stemm, use_lemmet, remove_stopword)
        # Zero-width non-joiner (U+200C), written as an escape so the
        # invisible character cannot be lost when the file is edited.
        text = text.replace('\u200c', ' ')
        text = " ".join(text.split())
        return text


In [17]:
def load_vectors(fname):
    """
    Load word vectors from a text file into a dict.

    The first line is a header with two integers; every following line is a
    word followed by its space-separated vector components.

    Vectors are stored as float32 numpy arrays. The original stored lazy
    ``map`` objects, which can be consumed only once (a second lookup of the
    same word yielded an empty vector) and left the file handle open.

    :param fname: path of the vector file
    :return: dict mapping word -> 1-D float32 numpy array
    """
    data = {}
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header line is parsed (and thereby validated) but its values are not needed.
        _n, _d = map(int, fin.readline().split())
        for line in fin:
            line = line.rstrip()
            if not line:
                continue
            tokens = line.split(' ')
            data[tokens[0]] = np.array([float(t) for t in tokens[1:]], dtype='float32')
    return data

In [18]:
def get_embedding_matrix(word_index, embedding_index, embed_dim, check_word_clean=True):
    """
    Create the embedding matrix based on word_index.

    Row ``i`` holds the vector of the word whose index is ``i``; rows of words
    without a usable vector stay all-zero.

    :param word_index: dict mapping word -> integer row index
    :param embedding_index: dict mapping word -> iterable of vector components
    :param embed_dim: expected vector length
    :param check_word_clean: when True, a word that is not found is looked up
        again by its stemmed/lemmatized form (TextCleaner.stem_lemmet_word)
    :return: tuple (embedding_matrix, miss_word) where miss_word is the set of
        words for which no vector of length ``embed_dim`` was available
    """
    embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
    miss_word = set()
    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is None and check_word_clean:
            embedding_vector = embedding_index.get(TextCleaner.stem_lemmet_word(word))
        if embedding_vector is None:
            miss_word.add(word)
            continue

        # np.fromiter accepts lists, arrays and one-shot iterators alike.
        try:
            vector = np.fromiter(embedding_vector, dtype='float32')
        except (TypeError, ValueError):
            vector = None
        # A malformed or wrong-sized vector is reported as missing instead of
        # being silently swallowed by a bare except.
        if vector is None or vector.shape != (embed_dim,):
            miss_word.add(word)
            continue
        embedding_matrix[i] = vector
    return embedding_matrix, miss_word

# load dataset

In [18]:
# Paths of the training and validation CSV files, relative to the working directory.
train_file_path = "./train.csv"
validation_file_path = "./dev.csv"
# need_clean_train switches the loading cell to a tab delimiter when True.
need_clean_train = False
# NOTE(review): this name is misspelled ("cleean"); kept as-is because other cells may reference it.
need_cleean_validation = False

In [19]:
# Read both splits; a file flagged as needing cleaning is read as tab-separated,
# otherwise pandas' default separator is used. The first column is the index.
train_df = pd.read_csv(train_file_path, delimiter='\t' if need_clean_train else None, index_col=0)
# The validation delimiter is driven by the validation flag (the original reused the
# train flag here, so the two files could not be configured independently).
validation_df = pd.read_csv(validation_file_path, delimiter='\t' if need_cleean_validation else None, index_col=0)

In [20]:
# Inspect the loaded training frame.
train_df

In [21]:
# Inspect the loaded validation frame.
validation_df

# Models using mBERT and XLM-RoBERTa

In [22]:
def tokenize(sentences, tokenizer, max_len=128):
    """
    Encode every sentence with the given tokenizer, padded/truncated to max_len.

    :param sentences: iterable of sentences to encode
    :param tokenizer: object exposing ``encode_plus``
    :param max_len: fixed length of every encoded sequence
    :return: three int32 arrays, in this order: input ids, attention masks,
        token type ids
    """
    encoded = [
        tokenizer.encode_plus(text,
                              add_special_tokens=True,
                              max_length=max_len,
                              padding='max_length',
                              return_attention_mask=True,
                              return_token_type_ids=True,
                              truncation=True)
        for text in sentences
    ]

    def _column(key):
        # Gather one field of every encoding into a single int32 array.
        return np.asarray([item[key] for item in encoded], dtype='int32')

    return _column('input_ids'), _column('attention_mask'), _column('token_type_ids')

## using mBert

In [23]:
# Fixed sequence length used both for tokenization and for the model input layers.
max_len = 128

In [24]:
# Load the tokenizer matching the multilingual uncased BERT checkpoint used by the model below.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

In [25]:
# Encode the pre-cleaned comments of both splits; the third return value (token type ids)
# is not passed to the model in the cells below.
train_ids, train_mask, train_segments = tokenize(train_df.clean_comment, tokenizer, max_len)
val_ids, val_mask, val_segment = tokenize(validation_df.clean_comment, tokenizer, max_len)

In [26]:
# Sanity check: shapes of the encoded training and validation id arrays.
train_ids.shape, val_ids.shape

In [27]:
# File the mBERT classifier is saved to after training and loaded from when it already exists.
save_model_path = "./mbert_model.h5"

In [63]:
# Delete a previously saved model so that the next cells build and train a new one.
# Guarded so a fresh run (no saved file yet) does not raise FileNotFoundError.
if os.path.exists(save_model_path):
    os.remove(save_model_path)

In [65]:
def get_mbert_model(max_len, save_model_path):
    """
    Load the model if it exists, else create a model; return the compiled model.

    :param max_len: length of the token-id and attention-mask inputs
    :param save_model_path: path of the saved Keras model to look for
    :return: tuple (model, need_train); need_train is False when the model was
        loaded from ``save_model_path`` and True when it was newly built
    """
    if os.path.exists(save_model_path):
        model = tf.keras.models.load_model(save_model_path, custom_objects={'TFBertModel': TFBertModel})
        return model, False

    # The pretrained weights are only loaded when a new model really has to be
    # built (previously they were loaded even when a saved model was returned).
    mbert_model = TFBertModel.from_pretrained("bert-base-multilingual-uncased")

    input_ids = Input(shape=(max_len,), name='input_token', dtype='int32')
    input_mask_ids = Input(shape=(max_len,), name='input_token_mask', dtype='int32')
    # First model output, sliced to position 0 of the sequence axis
    # (presumably the [CLS] token representation — confirm).
    x = mbert_model(input_ids, attention_mask=input_mask_ids)[0][:, 0, :]
    x = Dropout(0.5)(x)
    preds = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=[input_ids, input_mask_ids], outputs=preds)
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                  metrics=['accuracy'])
    return model, True

In [66]:
# Load the saved mBERT classifier or build a new one; need_train is True only for a newly built model.
model, need_train = get_mbert_model(max_len, save_model_path)

In [67]:
# Print the layer summary of the model.
model.summary()

### train model

In [68]:
# Train only when a new model was built; a model loaded from disk is used as-is.
# Training stops early once val_loss fails to improve for one epoch, then the model is saved.
# NOTE(review): EarlyStopping is used without restore_best_weights, so the saved weights are
# presumably those of the last epoch run rather than the best one — confirm this is intended.
if need_train:
    history = model.fit([train_ids, train_mask], train_df.label_id, 
              validation_data=([val_ids, val_mask], validation_df.label_id),
              epochs=10, 
              batch_size=64,
              callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, mode='min')])
    model.save(save_model_path)

In [69]:
# Plot training vs. validation accuracy per epoch.
# `history` is only assigned when the model was trained in this session, so the
# plot is skipped for a model loaded from disk (avoids a NameError / stale curves).
if need_train:
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
else:
    print("Model was loaded from disk; no training history to plot.")

In [70]:
# Plot training vs. validation loss per epoch.
# `history` is only assigned when the model was trained in this session, so the
# plot is skipped for a model loaded from disk (avoids a NameError / stale curves).
if need_train:
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
else:
    print("Model was loaded from disk; no training history to plot.")

### evaluate

In [75]:
# Evaluation file and whether it is the raw (tab-separated) version.
# NOTE(review): this is the same file as validation_file_path ("./dev.csv"), i.e. the data
# used for early stopping — confirm that no separate held-out test file is intended.
test_path = "./dev.csv"
raw_version = False

In [77]:
# Load the evaluation set: tab-separated when it is the raw version, pandas' default separator otherwise.
test_df = pd.read_csv(test_path, delimiter='\t' if raw_version else None, index_col=0)
# Clean only when needed. The already-clean file ships a `clean_comment` column (the same one
# the train/validation cells use); re-running TextCleaner on it is slow, needs network access
# for translation, and can make evaluation preprocessing differ from training preprocessing.
if raw_version or 'clean_comment' not in test_df.columns:
    test_df['clean_comment'] = test_df['comment'].apply(TextCleaner.clean_text)
test_ids, test_mask, test_segments = tokenize(test_df.clean_comment, tokenizer, max_len)

In [78]:
# Run the model on the encoded test ids and attention masks.
y_pred = model.predict([test_ids, test_mask])

In [79]:
# Turn the model outputs into 0/1 labels (int32) using a 0.5 threshold.
y_pred = np.array(y_pred > 0.5).astype("int32")

In [80]:
# Per-class precision/recall/F1 of the predictions against the true label ids.
print(classification_report(test_df.label_id, y_pred))

In [81]:
# Macro-averaged precision, recall and F1 of the predictions.
print("precision_score", precision_score(test_df.label_id, y_pred , average="macro"))
print("recall_score", recall_score(test_df.label_id, y_pred , average="macro"))
print("f1_score", f1_score(test_df.label_id, y_pred , average="macro"))

## using XLM-Roberta

In [87]:
# Replace the tokenizer with the one matching the xlm-roberta-base checkpoint used below.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

In [88]:
# Fixed sequence length used both for tokenization and for the model input layers.
max_len = 128

In [89]:
# Re-encode both splits with the XLM-RoBERTa tokenizer (overwrites the mBERT encodings);
# the third return value (token type ids) is not passed to the model in the cells below.
train_ids, train_mask, train_segments = tokenize(train_df.clean_comment, tokenizer, max_len)
val_ids, val_mask, val_segment = tokenize(validation_df.clean_comment, tokenizer, max_len)

In [90]:
# File the XLM-RoBERTa classifier is saved to after training and loaded from when it already exists.
save_model_path = "./xml_roberta_model.h5"

In [95]:
def get_xml_roberta_model(max_len, save_model_path):
    """
    Provide the compiled XLM-RoBERTa classifier together with a "needs training" flag.

    :param max_len: length of the token-id and attention-mask inputs
    :param save_model_path: path of the saved Keras model to look for
    :return: tuple (model, need_train); need_train is False when the model was
        loaded from ``save_model_path`` and True when it was newly built
    """
    saved_model_found = os.path.exists(save_model_path)
    if saved_model_found:
        restored = tf.keras.models.load_model(
            save_model_path,
            custom_objects={'TFXLMRobertaModel': TFXLMRobertaModel},
        )
        return restored, False

    # No saved model: assemble a new classifier on top of the pretrained encoder.
    backbone = TFXLMRobertaModel.from_pretrained("xlm-roberta-base")

    token_input = Input(shape=(max_len,), name='input_token', dtype='int32')
    mask_input = Input(shape=(max_len,), name='input_token_mask', dtype='int32')

    # First encoder output, sliced to position 0 of the sequence axis.
    sequence_output = backbone(token_input, mask_input)[0]
    first_position = sequence_output[:, 0, :]
    probability = Dense(1, activation='sigmoid')(first_position)

    classifier = Model(inputs=[token_input, mask_input], outputs=probability)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        metrics=['accuracy'],
    )
    return classifier, True

In [96]:
# Load the saved XLM-RoBERTa classifier or build a new one; need_train is True only for a newly built model.
model, need_train = get_xml_roberta_model(max_len, save_model_path)

In [97]:
# Print the layer summary of the model.
model.summary()

### train model

In [98]:
# Train only when a new model was built; a model loaded from disk is used as-is.
# Training stops early once val_loss fails to improve for one epoch, then the model is saved.
# Unlike the mBERT training cell, no batch_size is passed here.
# NOTE(review): EarlyStopping is used without restore_best_weights, so the saved weights are
# presumably those of the last epoch run rather than the best one — confirm this is intended.
if need_train:
    history = model.fit([train_ids, train_mask], train_df.label_id, 
              validation_data=([val_ids, val_mask], validation_df.label_id),
              epochs=50, 
              callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, mode='min')])
    model.save(save_model_path)

In [99]:
# Plot training vs. validation accuracy per epoch.
# `history` is only (re)assigned when this model was trained in this session; skipping the
# plot otherwise avoids a NameError or showing the curves of a previously trained model.
if need_train:
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
else:
    print("Model was loaded from disk; no training history to plot.")

In [100]:
# Plot training vs. validation loss per epoch.
# `history` is only (re)assigned when this model was trained in this session; skipping the
# plot otherwise avoids a NameError or showing the curves of a previously trained model.
if need_train:
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
else:
    print("Model was loaded from disk; no training history to plot.")

### evaluate model

In [101]:
# Evaluation file and whether it is the raw (tab-separated) version.
# NOTE(review): this is the same file as validation_file_path ("./dev.csv"), i.e. the data
# used for early stopping — confirm that no separate held-out test file is intended.
test_path = "./dev.csv"
raw_version = False

In [102]:
# Load the evaluation set: tab-separated when it is the raw version, pandas' default separator otherwise.
test_df = pd.read_csv(test_path, delimiter='\t' if raw_version else None, index_col=0)
# Clean only when needed. The already-clean file ships a `clean_comment` column (the same one
# the train/validation cells use); re-running TextCleaner on it is slow, needs network access
# for translation, and can make evaluation preprocessing differ from training preprocessing.
if raw_version or 'clean_comment' not in test_df.columns:
    test_df['clean_comment'] = test_df['comment'].apply(TextCleaner.clean_text)
test_ids, test_mask, test_segments = tokenize(test_df.clean_comment, tokenizer, max_len)

In [103]:
# Run the model on the encoded test ids and attention masks.
y_pred = model.predict([test_ids, test_mask])

In [104]:
# Turn the model outputs into 0/1 labels (int32) using a 0.5 threshold.
y_pred = np.array(y_pred > 0.5).astype("int32")

In [105]:
# Per-class precision/recall/F1 of the predictions against the true label ids.
print(classification_report(test_df.label_id, y_pred))

In [106]:
# Macro-averaged precision, recall and F1 of the predictions.
print("precision_score", precision_score(test_df.label_id, y_pred , average="macro"))
print("recall_score", recall_score(test_df.label_id, y_pred , average="macro"))
print("f1_score", f1_score(test_df.label_id, y_pred , average="macro"))