In [None]:
%matplotlib inline       # import packages and libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = stopwords.words('russian')
stop_words.extend(['наш', 'ваш', 'твой', 'свой', 'это'])
stop_words.remove('не') # exclude 'не' and 'нельзя' from stop_words list
stop_words.remove('нельзя')
np.random.seed(5)
plt.style.use("ggplot")

import tensorflow as tf
print('Tensorflow version:', tf.__version__)
print('GPU detected:', tf.config.list_physical_devices('GPU'))

In [None]:
!python -m spacy download ru_core_news_lg 

## Data Preprocessing Steps

In [None]:
data = pd.read_csv("path_to_data", encoding='utf-8', sep=",")  # load dataset
# Forward-fill missing cells: every NA takes the last value seen above it in its column.
# `DataFrame.ffill()` replaces `fillna(method="ffill")`, which is deprecated in recent pandas.
data = data.ffill()
data.head(7)  # show first 7 rows

In [None]:
 data["Tag"] = data["Tag"].apply(lambda row: re.sub(r'^[BI]_', '', row)) # remove BIO annotation
data["Tag"].value_counts() # show tags' distribution

In [None]:
nlp = spacy.load("ru_core_news_lg", disable=['parser', 'ner']) # load the ru_core_news_lg pipeline with its 'parser' and 'ner' components disabled

In [None]:
# Lemmatize every entry of the "Word" column. `nlp.pipe` streams the entries through
# the pipeline in batches instead of making one `nlp(...)` call per row; each entry is
# still processed as its own document, and its token lemmas are joined with spaces.
data["Word"] = [" ".join(token.lemma_ for token in doc) for doc in nlp.pipe(data["Word"])]
data.head(5) # show the first 5 rows after lemmatization

In [None]:
# Keep only the rows whose word is not in the stop_words list, restricted to the three working columns.
not_stop = ~data["Word"].isin(stop_words)
data = data.loc[not_stop, ['id', 'Word', 'Tag']]

In [None]:
# Report the number of distinct ids and distinct words left in the corpus.
for label, column in (("Unique id in corpus:", 'id'), ("Unique words in corpus:", 'Word')):
    print(label, data[column].nunique())

In [None]:
# Vocabulary: the distinct entries of the "Word" column plus the padding token.
words = set(data['Word']) | {'PADword'}
n_words = len(words)  # vocabulary size, padding token included
n_words

In [None]:
tags = sorted(set(data["Tag"]))  # distinct tags, in ascending order
n_tags = len(tags)  # number of distinct tags
n_tags

In [None]:
import tqdm # building vocabulary
def sentence_integrate(data):
    """Group the token rows of `data` into one list of (word, tag) pairs per id.

    Args:
        data: DataFrame with the columns 'id', 'Word' and 'Tag'.

    Returns:
        A list with one entry per distinct 'id', in the order `groupby('id')`
        yields the groups (ascending id by default). Each entry holds the
        (Word, Tag) tuples of that id in their original row order.
        An empty frame yields an empty list.
    """
    # Iterate over the groups directly rather than through `groupby(...).apply(...)`:
    # this avoids the deprecated "apply operated on the grouping columns" path and
    # also works when there are no groups at all.
    return [
        list(zip(group["Word"].tolist(), group["Tag"].tolist()))
        for _, group in data.groupby('id')
    ]

In [None]:
sentences=sentence_integrate(data) # one list of (word, tag) pairs per id
sentences[1] # peek at the second entry

In [None]:
tags2index = dict(zip(tags, range(len(tags))))  # map every tag to its position in the `tags` list

In [None]:
max_len = 60 # set maximum length
X = [[w[0] for w in s] for s in sentences] # words of every sentence (tags dropped)
# Bring every sequence to exactly `max_len` tokens: keep its first `max_len` words and
# fill the remainder with "PADword". A negative pad count yields an empty list, so
# longer sequences are simply cut. (Replaces index probing guarded by a bare `except:`.)
new_X = [seq[:max_len] + ["PADword"] * (max_len - len(seq)) for seq in X]
#new_X[1]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
y = [[[tags2index[w[1]]] for w in s] for s in sentences] # tag index of every word, each wrapped in a 1-element list
# Pad short sequences at the end with the index of tag "O".
# truncating="post" is required: the word sequences keep their FIRST `max_len` tokens,
# whereas pad_sequences by default drops tokens from the FRONT of over-long sequences,
# which would shift the labels away from their words.
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", truncating="post", value=tags2index["O"])
#y[1]

## Pre-trained ELMo Usage (Embeddings Extraction)

In [None]:
import time

In [None]:
!pip install --upgrade simple_elmo # install simple_elmo

In [None]:
from simple_elmo import ElmoModel
model = ElmoModel() # create the ElmoModel object; the pre-trained model is loaded into it with model.load(...)

In [None]:
model.load("path_to_elmo") # load the pre-trained ELMo model ("path_to_elmo" is a placeholder for the real location)

In [None]:
# Time the embedding extraction for the padded word sequences.
start = time.time()
elmo_vectors = model.get_elmo_vectors(new_X, layers='average')  # get embeddings for words
end = time.time()

processing_time = int(end - start)  # elapsed wall-clock time in whole seconds

print(
    f"ELMo embeddings for your input are ready in {processing_time} seconds",
    f"Tensor shape: {elmo_vectors.shape}",
    sep="\n",
)


In [None]:
elmo_vectors[0] # show the embeddings computed for the first sequence in new_X

## Train model

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the sequences as a test set; random_state=1 makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(elmo_vectors, y, test_size=0.2, random_state=1) # split data into train and test sets

In [None]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, Flatten
from tensorflow.keras.layers import InputLayer, TimeDistributed, Dropout, Bidirectional
from tensorflow import keras

In [None]:
x_train.shape # shape of the training embeddings

In [None]:
y_test.shape # shape of the held-out label array

In [None]:
# create model (3rd architecture)
# Take the input dimensions from the training data and the output width from the tag
# list instead of repeating 60 / 1024 / 8 by hand, so the network stays consistent
# if `max_len`, the ELMo model or the tag set changes.
seq_len, emb_dim = x_train.shape[1:]
model_lstm = keras.Sequential()
model_lstm.add(InputLayer(input_shape=(seq_len, emb_dim)))
model_lstm.add(Bidirectional(LSTM(1024, return_sequences=True)))  # one output vector per token
model_lstm.add(Dense(512, activation='relu'))
# model_lstm.add(Dropout(0.1))
model_lstm.add(Dense(n_tags, activation='softmax'))  # per-token distribution over the tags


model_lstm.summary()

In [None]:
# Draw the architecture of model_lstm (top-to-bottom, with shapes and layer names)
# and write the picture to model.png.
tf.keras.utils.plot_model(
    model_lstm, to_file='model.png', show_shapes=True, show_dtype=False, # visualise
    show_layer_names=True, rankdir='TB', expand_nested=True, dpi=100,
)

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# (the labels are integer tag indexes), accuracy as the reported metric.
model_lstm.compile(optimizer="adam", 
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

Note: the cell below takes a long time to run.


In [None]:
# Train for 3 epochs in batches of 32; the per-epoch metrics are kept in `history`.
# NOTE(review): the held-out test split is also passed as validation data, so the
# validation metrics reported during training are computed on the test set — confirm
# this is intended.
history = model_lstm.fit(    
    x=x_train,
    y=y_train,
    validation_data=(x_test,y_test),            # train model
    batch_size=32,
    epochs=3,
    verbose=1
    
)

## Perform ABSA on new data


In [None]:
data_df = pd.read_csv("path_to_new_data", usecols=["id", "text"]) # load new data
data_df = data_df.dropna()  # drop rows with a missing id or text

data_dict = data_df.set_index('id')['text'].to_dict()  # and save as dictionary: id -> review text

# preprocess data

from nltk.tokenize import sent_tokenize

max_s_len = 60   # set the maximum length (every sentence is cut / padded to this many tokens)
tokenizer = nltk.RegexpTokenizer(r"\w+")  # keeps runs of word characters, i.e. drops punctuation
split_dict = {} # create dictionary for preprocessed data: id -> list of padded token lists

# NOTE(review): the training words were lemmatized with spaCy before the stop-word
# filter, while the tokens here are used as they appear in the text — confirm whether
# the same lemmatization should be applied to new data.

for idx, r in data_dict.items():
    # Split into sentences with the Russian Punkt model (the default is English).
    snts = sent_tokenize(r, language="russian")

    final_snts = [] # sentences that are tokenized, stop-word-free and padded
    for s in snts:
        # split into words and keep those that are not in the stop_words list
        kept = [w for w in tokenizer.tokenize(s) if w not in stop_words]
        # keep the first max_s_len tokens, fill shorter sentences with pads
        final_snts.append(kept[:max_s_len] + ["PADword"] * (max_s_len - len(kept)))
    split_dict[idx] = final_snts

Note: the cell below takes a long time to run.

In [None]:
vector_dict = {}  # review id -> ELMo embeddings of its padded sentences

# get embeddings for each review, printing the id as a progress marker
for idx, padded_sentences in split_dict.items():
    print(idx)
    vector_dict[idx] = model.get_elmo_vectors(padded_sentences, layers='average')

print("done!")

Run this cell only if you want to use a previously saved (reloaded) model; otherwise skip it.


In [None]:
import tensorflow_hub as hub

# Location of the saved Keras model ("path_to_model" is a placeholder).
export_path_keras = "path_to_model"
 
# Load the saved model into `reloaded`; it can be used for prediction in place of model_lstm.
reloaded = tf.keras.models.load_model(
  export_path_keras,
  # `custom_objects` tells keras how to load a `hub.KerasLayer`
  custom_objects={'KerasLayer': hub.KerasLayer})
 
reloaded.summary()

In [None]:
# creating sets with tags as they will be needed later
pos, neg = "s_positive", "s_negative"
s_tags = {pos, neg}  # sentiment tags
a_tags = {"a_exercise", "a_material", "a_presentation", "a_course_arrangement", "a_general"}  # aspect tags

# every tag, including the neutral "O", in ascending order
tags = sorted(s_tags | a_tags | {"O"})
tags

In [None]:
all_data = {}  # review id -> one list of predicted tags per sentence

# predicting tags for embeddings
for id_r, sent_vectors in vector_dict.items():
    if len(sent_vectors) == 0:
        # nothing to predict for a review without sentences
        all_data[id_r] = []
        continue

    # if you use reloaded model then change 'model_lstm' into 'reloaded'
    probs = model_lstm.predict(sent_vectors)
    # Most probable tag index for every token of EVERY sentence. Keeping the whole
    # batch (not just element [0]) gives each sentence its own predictions instead
    # of a copy of the first sentence's.
    pred_idx = np.argmax(probs, axis=-1)

    all_data[id_r] = [[tags[j] for j in sentence_pred] for sentence_pred in pred_idx]

In [None]:
from collections import deque

# this function gets sentiment of aspect in each sentence, k - is the sliding window's size (can be changed)
# the output is not normalized

def count_sentence_sentiment(snt, k=3):
    """Score the aspects of one sentence with a sliding window over its tags.

    Every run of `k` consecutive tags is one window. A window that contains at
    least one aspect tag and at least one sentiment tag has the score
    (#positive tags - #negative tags); that score is added to every aspect
    occurrence inside the window. Overlapping windows are scored independently,
    so the totals are not normalized.

    Args:
        snt: sequence of tags for one sentence.
        k: window size. A sentence shorter than `k` is scored as one single
           window covering the whole sentence.

    Returns:
        dict mapping every tag in `a_tags` to its accumulated score
        (0 when the aspect never shares a window with a sentiment tag).

    Raises:
        ValueError: if `k` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")

    asp2sent = {a: 0 for a in a_tags}

    # Shrink the window for short sentences so they are not skipped entirely.
    win = min(k, len(snt))
    if win == 0:
        return asp2sent

    for start in range(len(snt) - win + 1):
        window = snt[start:start + win]
        present = set(window)
        if not ((a_tags & present) and (s_tags & present)):
            continue

        # +1 for every positive tag, -1 for every negative tag in the window
        curr_s = sum((t == pos) - (t == neg) for t in window)

        for t in window:
            if t in a_tags:
                asp2sent[t] += curr_s

    return asp2sent

# normalization function

def normalize_sent(a_sent_dict):
    """Clip every aspect score to 1, -1 or 0.

    Scores of 1 or more become 1, scores of -1 or less become -1,
    everything in between becomes 0.

    Args:
        a_sent_dict: mapping aspect -> numeric score.

    Returns:
        A new dict with the same keys and the clipped scores.
    """
    return {
        aspect: (score >= 1) - (score <= -1)  # bool arithmetic: 1, -1 or 0
        for aspect, score in a_sent_dict.items()
    }


In [None]:
from collections import Counter

# this function gets sentiment for aspects in the whole review 
# the output is not normalized 

def count_review_sent(review):
    """Sum the normalized per-sentence aspect sentiments of a whole review.

    Args:
        review: iterable of sentences, each a sequence of tags.

    Returns:
        dict mapping aspect -> sum of its normalized sentence scores.
        The sum itself is not normalized.
    """
    totals = Counter()
    # score each sentence, clip the scores, then accumulate them
    clipped_scores = (normalize_sent(count_sentence_sentiment(sentence)) for sentence in review)
    for sentence_scores in clipped_scores:
        totals.update(sentence_scores)
    return dict(totals)

In [None]:
# One entry per review, all three lists in the iteration order of all_data.
final_ids = list(all_data)
texts = [data_dict[review_id] for review_id in final_ids]
# here values are normalized for the review
res_sent = [normalize_sent(count_review_sent(review)) for review in all_data.values()]

In [None]:
# creating a table: review id and text on the left, one column per aspect on the right
df_sent = pd.DataFrame(res_sent)
df_texts = pd.DataFrame({"id": final_ids, "text": texts})
df_res = pd.concat([df_texts, df_sent], axis="columns")

df_res

Saving the results

In [None]:
df_res.to_excel("absa_results.xlsx", index=False) # write the result table to absa_results.xlsx without the index column