# Implementation of the Aspect-Based Sentiment Analysis (ABSA) system for hotels

# Download data

In [None]:
# Clone the repo without printing progress (quiet mode)
!git clone -q https://github.com/thinhntr/CS221.M11.KHCL-Aspect-Based-Sentiment-Analysis

# Check the current working directory
!pwd

/content


In [None]:
# List the files/subdirectories in the current directory
!ls

CS221.M11.KHCL-Aspect-Based-Sentiment-Analysis	sample_data


# Load data

In [None]:
import pandas as pd
from pathlib import Path

# Locate the CSV splits inside the cloned repository.
repo_dir = Path('./CS221.M11.KHCL-Aspect-Based-Sentiment-Analysis')
data_dir = repo_dir.joinpath('data', 'csv')

train_fp, dev_fp, test_fp = (
    data_dir / file_name for file_name in ('train.csv', 'dev.csv', 'test.csv')
)

# Check that every split file actually exists before trying to read it.
assert all(fp.is_file() for fp in (train_fp, dev_fp, test_fp))

train_df, dev_df, test_df = map(pd.read_csv, (train_fp, dev_fp, test_fp))

# Number of rows per split (displayed as the cell output).
len(train_df), len(dev_df), len(test_df)

(2961, 1290, 500)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

# Shared label encoder; it is fitted when get_X_yA_yAS is called with
# not_fitted=True and reused by every later call.
enc = OrdinalEncoder()


def get_X_yA_yAS(df, not_fitted=False):
    """Split a review DataFrame into model inputs and two views of the labels.

    Parameters
    ----------
    df : DataFrame
        Must have a ``review`` column; every other column is treated as a
        label column.
    not_fitted : bool, default False
        When True, fit the module-level ``enc`` on this frame's labels before
        transforming them.

    Returns
    -------
    X : copy of the ``review`` column values.
    yA : boolean array, True where a label cell is not missing.
    yAS : the labels with missing cells replaced by ``'dne'`` and then passed
        through ``enc.transform``.
    """
    reviews = df.review.values.copy()
    labels = df.drop(columns='review')
    presence = labels.notna().values
    filled = labels.fillna('dne')
    if not_fitted:
        enc.fit(filled)
    return reviews, presence, enc.transform(filled)


# Fit the label encoder on the training split only, then reuse it for dev/test.
X_train, yA_train, yAS_train = get_X_yA_yAS(train_df, not_fitted=True)
X_dev, yA_dev, yAS_dev = get_X_yA_yAS(dev_df)
X_test, yA_test, yAS_test = get_X_yA_yAS(test_df)

# Sanity-check the shapes; each line reports the arrays of its own split
# (the dev/test lines previously printed yAS_train.shape by mistake).
print(X_train.shape, yA_train.shape, yAS_train.shape)
print(X_dev.shape, yA_dev.shape, yAS_dev.shape)
print(X_test.shape, yA_test.shape, yAS_test.shape)

(2961,) (2961, 12) (2961, 12)
(1290,) (1290, 12) (2961, 12)
(500,) (500, 12) (2961, 12)


# Model

In [None]:
!pip install -q emoji

import emoji
import numpy as np
from sklearn.preprocessing import FunctionTransformer


def remove_emoji(texts):
    """Strip every emoji from each string in ``texts``.

    Parameters
    ----------
    texts : iterable of str

    Returns
    -------
    numpy.ndarray
        The cleaned strings, in input order.
    """
    # `emoji.get_emoji_regexp()` was removed in emoji 2.0; `replace_emoji` is
    # its replacement. Keep the regexp path only as a fallback for old releases.
    if hasattr(emoji, 'replace_emoji'):
        return np.array([emoji.replace_emoji(text, replace='') for text in texts])

    # Build the pattern once instead of once per text.
    emoji_pattern = emoji.get_emoji_regexp()
    return np.array([emoji_pattern.sub('', text) for text in texts])


# Wrap remove_emoji in a FunctionTransformer so it can be used as a pipeline step.
emoji_remover = FunctionTransformer(remove_emoji)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier


# Fixed seed so the random forest, and therefore the scores reported below,
# are reproducible across "Restart & Run All".
RANDOM_STATE = 42

# Text features: strip emoji, then TF-IDF.
input_preprocess = make_pipeline(emoji_remover,
                                 TfidfVectorizer())

# Full model: preprocessing followed by a random forest trained on the
# encoded aspect/sentiment labels.
pipeline = make_pipeline(
    input_preprocess,
    RandomForestClassifier(random_state=RANDOM_STATE)
)

pipeline.fit(X_train, yAS_train)

Pipeline(memory=None,
         steps=[('pipeline',
                 Pipeline(memory=None,
                          steps=[('functiontransformer',
                                  FunctionTransformer(accept_sparse=False,
                                                      check_inverse=True,
                                                      func=<function remove_emoji at 0x7fc303ecb050>,
                                                      inv_kw_args=None,
                                                      inverse_func=None,
                                                      kw_args=None,
                                                      validate=False)),
                                 ('tfidfvectorizer',
                                  TfidfVectorizer(analyzer='word', binary=False,
                                                  decode_error='strict',
                                                  dtype...
                 RandomForestClassifier(bootstrap=T

In [None]:
from sklearn.metrics import f1_score


def multioutput_to_multilabel(y, n_classes=4):
    """One-hot encode a multi-output class matrix into a flat multilabel matrix.

    Each output column ``j`` owns ``n_classes`` consecutive slots in the
    result; for every row, the slot ``j * n_classes + y[i, j]`` is set to True.

    Parameters
    ----------
    y : array-like of shape (n_rows, n_cols)
        Class index per (row, column). Values are truncated to int, so float
        codes such as ``2.0`` are accepted.
    n_classes : int, default 4
        Number of slots reserved per column.

    Returns
    -------
    numpy.ndarray of bool, shape (n_rows, n_classes * n_cols)
    """
    y = np.asarray(y)
    nrow = y.shape[0]
    ncol = y.shape[1]
    # Use the builtin `bool`: the `np.bool` alias was removed in NumPy 1.24.
    multilabel = np.zeros((nrow, n_classes * ncol), dtype=bool)
    # Flat slot index for every (row, column) cell, computed in one shot.
    positions = (np.arange(ncol) * n_classes + y).astype(int)
    row_idx = np.repeat(np.arange(nrow), ncol)
    multilabel[row_idx, positions.ravel()] = True
    return multilabel


def custom_f1_score(y_true, y_pred, **kwargs):
    """F1 score for multi-output predictions.

    Both label matrices are first expanded with ``multioutput_to_multilabel``;
    the expanded matrices and any extra keyword arguments (e.g. ``average``,
    ``zero_division``) are then handed to ``f1_score``.
    """
    return f1_score(
        multioutput_to_multilabel(y_true),
        multioutput_to_multilabel(y_pred),
        **kwargs,
    )

In [None]:
# Predict the encoded labels for the dev split, then score them with the
# sample-averaged F1.
y_pred = pipeline.predict(X_dev)
custom_f1_score(yAS_dev, y_pred, average='samples')

0.8587855297157624

In [None]:
# Micro-averaged F1 on the same dev predictions.
# NOTE(review): multioutput_to_multilabel sets exactly one label per
# (row, column), so every row has the same number of true and predicted
# labels; that is presumably why this equals the 'samples' score.
custom_f1_score(yAS_dev, y_pred, average='micro')

0.8587855297157624

In [None]:
# Weighted-average F1 on the dev predictions; zero_division=0 is forwarded to
# f1_score so that undefined per-label scores count as 0.
custom_f1_score(yAS_dev, y_pred, average='weighted', zero_division=0)

0.8233522651652272

In [None]:
# Macro-average F1 on the dev predictions (unweighted mean over the expanded
# label columns); zero_division=0 is forwarded to f1_score as above.
custom_f1_score(yAS_dev, y_pred, average='macro', zero_division=0)

0.30169544051464864

In [None]:
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')

# def preprocess(df):
#     tokens = tokenizer(df.review.values.tolist(), 
#                        padding="max_length", 
#                        truncation=True, 
#                        return_tensors="np")

#     X = tokens.input_ids
#     y = df.drop('review', axis=1).notna().values
#     return X, y

# X_train, y_train = preprocess(train_df)
# X_dev, y_dev = preprocess(dev_df)
# X_test, y_test = preprocess(test_df)

# X_train.shape, y_train.shape, X_dev.shape, y_dev.shape, X_test.shape, y_test.shape

# Draft

```
import re
import json
import requests
import numpy as np
import pandas as pd


# Fetch the aspect list from the project repository (needs network access).
aspects_url = 'https://raw.githubusercontent.com/thinhntr/CS221.M11.KHCL-Aspect-Based-Sentiment-Analysis/main/data/csv/aspects.json'
# The helpers below use `aspects` as an ordered list (len(), .index(), column
# names) -- presumably the file is a JSON array of aspect names; TODO confirm.
aspects = json.loads(requests.get(aspects_url).text)

def label_encoder(label):
    """Turn one annotation string into a per-aspect sentiment row.

    ``label`` is scanned for ``{aspect, sentiment}`` pairs. The returned list
    has one slot per entry of the module-level ``aspects`` list: the sentiment
    string where the aspect was found, NaN everywhere else.

    Raises ValueError (from ``list.index``) when a parsed aspect is not in
    ``aspects``.
    """
    encoded = [np.nan for _ in aspects]

    for match in re.finditer('{(.+?), ([a-z]+)}', label):
        aspect, sentiment = match.groups()
        encoded[aspects.index(aspect)] = sentiment

    return encoded

def txt2df(filepath):
    """Parse a raw annotation text file into a DataFrame.

    The file is read as UTF-8 (a leading BOM is tolerated) and split on
    newlines. Every 4th line starting at index 1 is taken as a review, and
    every 4th line starting at index 2 as its label string, which is decoded
    with ``label_encoder``.

    Returns a DataFrame with a ``review`` column plus one column per entry of
    the module-level ``aspects`` list.
    """
    with open(filepath, 'r', encoding='utf-8-sig') as txt:
        lines = txt.read().split('\n')

    reviews = lines[1::4]
    encoded_labels = list(map(label_encoder, lines[2::4]))

    df = pd.DataFrame()
    df['review'] = reviews
    df[aspects] = encoded_labels

    return df

def label_decoder(encoded_label):
    """Render the non-missing entries of a Series as ``{aspect, sentiment}`` text.

    ``encoded_label`` is a Series indexed by aspect name. Missing entries are
    skipped; the remaining (index, value) pairs are formatted as
    ``{aspect, sentiment}`` and joined with ``', '``.
    """
    present = encoded_label[encoded_label.notna()]
    pairs = (f'{{{aspect}, {sentiment}}}' for aspect, sentiment in present.items())
    return ', '.join(pairs)

def csv2str(filepath):
    """Convert a label CSV back into the 4-lines-per-review text layout.

    For every CSV row this emits four lines: ``#<row number>`` (1-based), the
    review text (first column), the decoded labels (remaining columns, via
    ``label_decoder``), and an empty separator line. All lines are joined
    with newlines and returned as one string.
    """
    df = pd.read_csv(filepath)
    rows = []
    # `row_idx` instead of `id`, to avoid shadowing the builtin.
    for row_idx, row in df.iterrows():
        # Use .iloc for positional access: `row[0]` on a label-indexed Series
        # is deprecated (pandas 2.1+) and is treated as a label lookup later.
        review = row.iloc[0]
        labels = label_decoder(row.iloc[1:])
        rows.extend((f'#{row_idx+1}', review, labels, ''))
    return '\n'.join(rows)


# The draft's own import list omits pathlib, so bring Path into scope here.
from pathlib import Path

root_dir = Path('CS221.M11.KHCL-Aspect-Based-Sentiment-Analysis/data')

# Raw annotation files (input).
train_txt_fp = root_dir/'original/1-VLSP2018-SA-Restaurant-train (7-3-2018).txt'
dev_txt_fp = root_dir/'original/2-VLSP2018-SA-Restaurant-dev (7-3-2018).txt'
test_txt_fp = root_dir/'original/3-VLSP2018-SA-Restaurant-test (8-3-2018).txt'

# Converted CSV files (output).
train_csv_fp = root_dir/'csv/train.csv'
dev_csv_fp = root_dir/'csv/dev.csv'
test_csv_fp = root_dir/'csv/test.csv'

assert train_txt_fp.is_file()
assert dev_txt_fp.is_file()
assert test_txt_fp.is_file()

# Parse the raw .txt files that were just checked. The previous version passed
# train_fp/dev_fp/test_fp, which are not the .txt inputs defined above.
train_df = txt2df(train_txt_fp)
dev_df = txt2df(dev_txt_fp)
test_df = txt2df(test_txt_fp)

train_df.to_csv(train_csv_fp, index=False)
dev_df.to_csv(dev_csv_fp, index=False)
test_df.to_csv(test_csv_fp, index=False)

# Round-trip check: render the written CSVs back into the text layout.
print(csv2str(train_csv_fp))
print(csv2str(dev_csv_fp))
print(csv2str(test_csv_fp))
```


In [None]:
!pip install -Uq scikit-learn

[K     |████████████████████████████████| 23.2 MB 1.4 MB/s 
[?25h

In [None]:
from sklearn.feature_extraction.text import (CountVectorizer,
                                             HashingVectorizer,
                                             TfidfVectorizer,
                                             TfidfTransformer)

# Toy corpus for comparing the vectorizers side by side.
corpus = [
    'is First document.',
    'is is second document',
    # 'this is the first document?',
    # 'and this is the third document',
]

# Raw unigram counts as a dense array.
cvectorizer = CountVectorizer(ngram_range=(1, 1))
X1 = cvectorizer.fit_transform(corpus).toarray()

# TF-IDF computed from the counts above.
X2 = TfidfTransformer().fit_transform(X1).toarray()

# TF-IDF computed directly from the raw text.
tvectorizer = TfidfVectorizer()
X3 = tvectorizer.fit_transform(corpus).todense()