# Install packages

In [2]:
!pip install -Uq emoji \
                 optuna \
                 flashtext \
                 underthesea \
                 scikit-learn \

# Read data

## TextCleaner class

In [3]:
import re
import unicodedata
import pandas as pd
from functools import partial
from emoji import get_emoji_regexp
from flashtext import KeywordProcessor
from sklearn.base import BaseEstimator, TransformerMixin

HASHTAG = 'hashtag'

class TextCleanerBase(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

        # Find emojis
        emoji = get_emoji_regexp()

        # Create preprocessing function
        self.remove_emoji      = partial(emoji.sub, '')
        self.normalize_unicode = partial(unicodedata.normalize, 'NFC')

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.Series):
            X = pd.Series(X)

        return X.apply(str.lower) \
                .apply(self.remove_emoji) \
                .apply(self.normalize_unicode)
        

class TextCleaner(TextCleanerBase):
    def __init__(self):
        super().__init__()

        # Find hashtag
        hashtag = re.compile('#\S+')

        # Find price tags
        pricetag = '((?:(?:\d+[,\.]?)+) ?(?:nghìn đồng|đồng|k|vnd|d|đ))'
        pricetag = re.compile(pricetag)

        # Find special characters
        specialchar = r"[\"#$%&'()*+,\-.\/\\:;<=>@[\]^_`{|}~\n\r\t]"
        specialchar = re.compile(specialchar)

        # Spelling correction
        rules = {
            "òa":["oà"], "óa":["oá"], "ỏa":["oả"], "õa":["oã"], "ọa":["oạ"],
            "òe":["oè"], "óe":["oé"], "ỏe":["oẻ"], "õe":["oẽ"], "ọe":["oẹ"],
            "ùy":["uỳ"], "úy":["uý"], "ủy":["uỷ"], "ũy":["uỹ"], "ụy":["uỵ"],
            "ùa":["uà"], "úa":["uá"], "ủa":["uả"], "ũa":["uã"], "ụa":["uạ"],
            "xảy":["xẩy"], "bảy":["bẩy"], "gãy":["gẫy"],
            "không":["k", "hông", "ko", "khong"]}

        kp = KeywordProcessor(case_sensitive=False)
        kp.add_keywords_from_dict(rules)

        # Create preprocessing functions
        self.autocorrect          = kp.replace_keywords
        self.normalize_pricetag   = partial(pricetag.sub, 'giá_tiền')
        self.normalize_hashtag    = partial(hashtag.sub, HASHTAG)
        self.remove_specialchar   = partial(specialchar.sub, '')

    def transform(self, X):
        X = super().transform(X)

        return X.apply(self.autocorrect) \
                .apply(self.normalize_pricetag) \
                .apply(self.normalize_hashtag) \
                .apply(self.remove_specialchar)


## mo2ml - Multioutput to multilabel

In [4]:
aspects = ['FOOD#PRICES',
           'FOOD#QUALITY',
           'FOOD#STYLE&OPTIONS',
           'DRINKS#PRICES',
           'DRINKS#QUALITY',
           'DRINKS#STYLE&OPTIONS',
           'RESTAURANT#PRICES',
           'RESTAURANT#GENERAL',
           'RESTAURANT#MISCELLANEOUS',
           'SERVICE#GENERAL',
           'AMBIENCE#GENERAL',
           'LOCATION#GENERAL']

sentiments = ['-', 'o', '+']

def mo2ml(y):
    """Convert multi-output to multi-label data
    """
    newcols = [f'{a} {s}' for a in aspects for s in sentiments]

    nrows, ncols = len(y), len(newcols)
    ml = pd.DataFrame(np.zeros((nrows, ncols), dtype='bool'),
                      columns=newcols)
    
    for i, a in enumerate(aspects):
        for j in range(1, 4):
            indices = y[a] == j
            ml.iloc[indices, i * 3 + j - 1] = True

    return ml

## mo2df - Multioutput to DataFrame

In [5]:
def mo2df(y):
    if isinstance(y, pd.DataFrame):
        return y
    return pd.DataFrame(y, columns=aspects)

## Download csv

In [6]:
import numpy as np
import pandas as pd

root = 'https://raw.githubusercontent.com/thinhntr/absa/main/data/csv/'
train_url = root + 'train.csv'
dev_url = root + 'dev.csv'
test_url = root + 'test.csv'

def read_csv(url):
    df = pd.read_csv(url)

    X = df.pop('review')
    y = df.replace({np.nan: 0, 
                    'negative': 1, 
                    'neutral': 2, 
                    'positive': 3}).astype(np.uint8)

    print('X.shape:', X.shape, 'y.shape:', y.shape)
    return X, y

Xtrain, ytrain = read_csv(train_url)
Xdev,   ydev   = read_csv(dev_url)
Xtest,  ytest  = read_csv(test_url)

# Basic text cleanup
cleaner_base  = TextCleanerBase()

xtrain_basecl = cleaner_base.transform(Xtrain)
xdev_basecl   = cleaner_base.transform(Xdev)
xtest_basecl  = cleaner_base.transform(Xtest)

# Advanced text cleanup
cleaner       = TextCleaner()

xtrain        = cleaner.transform(Xtrain)
xdev          = cleaner.transform(Xdev)
xtest         = cleaner.transform(Xtest)

X.shape: (2961,) y.shape: (2961, 12)
X.shape: (1290,) y.shape: (1290, 12)
X.shape: (500,) y.shape: (500, 12)


In [7]:
# y target for phase a
ytrain_a  = ytrain != 0
ydev_a    = ydev   != 0
ytest_a   = ytest  != 0

# y target for phase b
ytrain_b  = ytrain.copy()
ydev_b    = ydev  .copy()
ytest_b   = ytest .copy()

# y target for evaluation
ytrain_ml = mo2ml(ytrain)
ydev_ml   = mo2ml(ydev)
ytest_ml  = mo2ml(ytest)

In [8]:
# import requests

# def download_img(url, save_path):
#     with open(save_path, 'wb') as f:
#         response = requests.get(url)
#         f.write(response.content)

# download_img('https://image.flaticon.com/icons/png/512/24/24208.png', 'vn.png')

# EDA

## References
- https://developers.google.com/machine-learning/guides/text-classification/step-2
- https://github.com/google/eng-edu/blob/main/ml/guides/text_classification/explore_data.py

## Utils

In [9]:
def get_num_words_per_sample(sample_texts):
    """Gets the median number of words per sample given corpus.

    # Arguments
        sample_texts: list, sample texts.

    # Returns
        int, median number of words per sample.
    """
    num_words = [len(s.split()) for s in sample_texts]
    return np.median(num_words)

## Key metrics

In [10]:
num_samples = len(xtrain)
num_aspects = len(aspects)
num_classes = num_aspects * 3
num_words_per_sample = get_num_words_per_sample(xtrain)
sw_ratio = num_samples / num_words_per_sample


print("Xtrain key metrics")
print("Number of samples:", num_samples)
print("Number of aspects:", num_aspects)
print("Number of classes:", num_classes)
print("Number of words per sample:", num_words_per_sample)
print("Number of samples/number of words per sample ratio", sw_ratio)

Xtrain key metrics
Number of samples: 2961
Number of aspects: 12
Number of classes: 36
Number of words per sample: 49.0
Number of samples/number of words per sample ratio 60.42857142857143


## Class distribution

In [11]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def get_pie(df, name):
    count = df.sum(axis=0)
    return go.Pie(labels=count.index, values=count, 
                  textposition='inside', name=name)


names = ('Train', 'Dev', 'Test')
phaseA = (ytrain_a, ydev_a, ytest_a)

fig = make_subplots(cols=3, subplot_titles=names,
                    specs=[[{'type': 'pie'}] * 3])

for i, (df, name) in enumerate(zip(phaseA, names), 1):
    fig.add_trace(get_pie(df, name), row=1, col=i)

fig.update_layout(title='# of samples per aspect')

In [12]:
phaseML = (ytrain_ml, ydev_ml, ytest_ml)

fig = make_subplots(cols=3, subplot_titles=names,
                    specs=[[{'type': 'pie'}] * 3])

for i, (df, name) in enumerate(zip(phaseML, names), 1):
    fig.add_trace(get_pie(df, name), row=1, col=i)

fig.update_layout(title='# of samples per class (entity, sentiment)')

## Sample length distribution

In [13]:
count0 = [len(s) for s in xtrain]
count1 = [len(s) for s in xdev]
count2 = [len(s) for s in xtest]

fig = go.Figure()
fig.add_trace(go.Histogram(x=count0, name='train'))
fig.add_trace(go.Histogram(x=count1, name='dev'))
fig.add_trace(go.Histogram(x=count2, name='test'))

fig.update_layout(title='Sample length distribution', barmode='overlay')
fig.update_traces(opacity=0.5)

# Feature extraction (Convert reviews to vectors)

##  Basic features (1, 2, 3 grams)


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                             min_df=2, max_df=0.9)

# x data using basic clean up class and basic features extrator
xtrain_basecl_basef = vectorizer.fit_transform(xtrain_basecl)
xdev_basecl_basef   = vectorizer.transform(xdev_basecl)
xtest_basecl_basef  = vectorizer.transform(xtest_basecl)

# x data using advanced clean up class and basic features extrator
xtrain_basef = vectorizer.fit_transform(xtrain)
xdev_basef   = vectorizer.transform(xdev)
xtest_basef  = vectorizer.transform(xtest)

In [15]:
xtrain_basecl_basef.shape, xtrain_basef.shape

((2961, 33802), (2961, 32971))

## More features (pos tag, result from phase a)

In [75]:
from underthesea import pos_tag, word_tokenize
from scipy import sparse
from sklearn.feature_extraction.text import (CountVectorizer,
                                             TfidfTransformer,
                                             TfidfVectorizer)

from sklearn.pipeline import make_pipeline

class FeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

        self.for_phase_a = True

        # POS tags encoding
        self.allow_tags = {tag: i for i, tag in enumerate('NVA', 1)}

    def _nva_extractor(self, X):
        """Extract noun, verb, adjective tokens and tags
        """
        reviews = []
        for x in X:
            review = [item for item in pos_tag(x) \
                      if item[1] in self.allow_tags]
            reviews.append(review)
        return reviews

    def _postag_vtrz(self, reviews):
        """Convert pos tags to a feature matrix
        """
        vocab = self.tfidf_nva[0].vocabulary
        features = np.zeros((len(reviews), len(vocab)))
        for review, feature in zip(reviews, features):
            for token, tag in review:
                try:
                    feature[vocab[token]] = self.allow_tags[tag]
                except KeyError:
                    pass
        return features

    def fit(self, X, y=None):
        reviews = self._nva_extractor(X)

        # Create the vocabulary
        vocab = {item[0] for review in reviews
                 for item in review}
        vocab = {word: i for i, word in enumerate(vocab)}

        # Tfidf for noun, verb, adjective tokens
        count_nva = CountVectorizer(tokenizer=word_tokenize,
                                    vocabulary=vocab,
                                    min_df=2, max_df=0.9)
        tfidf_vec = TfidfTransformer()
        self.tfidf_nva = make_pipeline(count_nva, tfidf_vec).fit(X)

        # 1, 2, 3 grams
        self.tfidf_123 = TfidfVectorizer(ngram_range=(1, 3),
                                         min_df=2, max_df=0.9).fit(X)
        
        # 2, 3 grams
        self.tfidf_23 = TfidfVectorizer(ngram_range=(2, 3),
                                        min_df=2, max_df=0.9).fit(X)

        return self

    def transform(self, X):
        reviews = self._nva_extractor(X)

        if self.for_phase_a:
            features = [self.tfidf_123.transform(X),
                        self.tfidf_nva.transform(X),
                        self._postag_vtrz(reviews)] 
            return sparse.hstack(features)

        features = [self.tfidf_23.transform(X),
                    self.tfidf_nva.transform(X),
                    [['!' in text or '?' in text] for text in X]]
        return sparse.hstack(features)

In [76]:
# x data using advanced clean up class and advanced feature extractor
fe = FeatureExtractor().fit(xtrain)

fe.for_phase_a = True
xtrain_a = fe.transform(xtrain)
xdev_a   = fe.transform(xdev)
xtest_a  = fe.transform(xtest)

fe.for_phase_a = False
xtrain_b = fe.transform(xtrain)
xdev_b   = fe.transform(xdev)
xtest_b  = fe.transform(xtest)

In [77]:
xtrain_a.shape, xtrain_b.shape

((2961, 49621), (2961, 38205))

# Model Architectures

## End-to-End Architecture

In [19]:
from sklearn.multioutput import MultiOutputClassifier as MOC

## TwoStageSimple Architecture

In [55]:
import scipy

def add_features(X, y):
    return scipy.sparse.hstack((X, y))

class TwoStageSimple:
    def __init__(self, model_a, model_b):
        self.model_a = model_a 
        self.model_b = model_b

    def fit(self, X, y_a, y_b):
        self.model_a.fit(X, y_a)
        X = add_features(X, y_a)
        self.model_b.fit(X, y_b)
        return self
    
    def predict(self, X):
        y_a = self.model_a.predict(X) 
        X = add_features(X, y_a)
        return self.model_b.predict(X)

## TwoStageAdvanced Architecture

In [79]:
class TwoStageAdvanced(TwoStageSimple):
    def fit(self, X_a, X_b, y_a, y_b):
        self.model_a.fit(X_a, y_a)
        X = add_features(X_b, y_a)
        self.model_b.fit(X, y_b)
        return self
    
    def predict(self, X_a, X_b):
        y_a = self.model_a.predict(X_a)
        X = add_features(X_b, y_a)
        pred = self.model_b.predict(X)
        return pred

# Evaluation functions

In [62]:
from sklearn.metrics import f1_score, classification_report


def quick_f1(y_true, y_pred):
    y_pred = mo2ml(mo2df(y_pred))
    return round(f1_score(y_true, y_pred, average='micro', zero_division=0), 4)

def evaluate(model, X, y, average='micro'):
    yb_true  = mo2ml(y)

    yb_pred  = mo2df(model.predict(X))
    yb_pred  = mo2ml(yb_pred)

    return classification_report(yb_true, yb_pred, zero_division=0)

# Compare models

## Base Model

In [23]:
from sklearn.svm import LinearSVC

In [24]:
clf0 = MOC(LinearSVC(random_state=5))
clf0.fit(xtrain_basecl_basef, ytrain_b)

print(quick_f1(ytrain_ml, clf0.predict(xtrain_basecl_basef)))
print(quick_f1(ydev_ml  , clf0.predict(xdev_basecl_basef)))
print(quick_f1(ytest_ml , clf0.predict(xtest_basecl_basef)))
print(evaluate(clf0, xtest_basecl_basef, ytest_b))

1.0
0.658
0.6063
              precision    recall  f1-score   support

           0       0.25      0.04      0.06        28
           1       0.51      0.37      0.43       175
           2       0.47      0.66      0.55       128
           3       0.00      0.00      0.00        11
           4       0.69      0.21      0.32        43
           5       0.85      0.99      0.91       403
           6       0.00      0.00      0.00        16
           7       0.00      0.00      0.00        53
           8       0.74      0.96      0.83       334
           9       0.00      0.00      0.00         3
          10       0.00      0.00      0.00        45
          11       0.00      0.00      0.00        28
          12       0.00      0.00      0.00         6
          13       0.00      0.00      0.00        11
          14       0.91      0.19      0.31        54
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          

In [25]:
clf1 = MOC(LinearSVC(random_state=5))
clf1.fit(xtrain_basef, ytrain_b)

print(quick_f1(ytrain_ml, clf1.predict(xtrain_basef)))
print(quick_f1(ydev_ml  , clf1.predict(xdev_basef)))
print(quick_f1(ytest_ml , clf1.predict(xtest_basef)))
print(evaluate(clf1, xtest_basef, ytest_b))

0.9989
0.6631
0.6105
              precision    recall  f1-score   support

           0       0.20      0.04      0.06        28
           1       0.58      0.43      0.49       175
           2       0.48      0.66      0.56       128
           3       0.00      0.00      0.00        11
           4       0.69      0.21      0.32        43
           5       0.84      0.99      0.91       403
           6       0.00      0.00      0.00        16
           7       0.00      0.00      0.00        53
           8       0.74      0.96      0.84       334
           9       0.00      0.00      0.00         3
          10       0.00      0.00      0.00        45
          11       0.00      0.00      0.00        28
          12       0.00      0.00      0.00         6
          13       0.00      0.00      0.00        11
          14       0.92      0.20      0.33        54
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
      

## Linear SVC hyperparameter tuning

In [32]:
import optuna
from optuna.samplers import TPESampler

In [33]:
def callback(study, trial):
    if study.best_trial.number == trial.number:
        study.set_user_attr(key='best_model', value=trial.user_attrs['model'])

In [45]:
def linearsvc_objective(trial):
    params = dict(
        C=trial.suggest_float('C', 1e-9, 1e2, log=True),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        loss=trial.suggest_categorical('loss', ['hinge', 'squared_hinge']),
        max_iter=2000,
        random_state=5
    )

    clf = MOC(LinearSVC(**params))
    clf.fit(xtrain_basef, ytrain_b)
    trial.set_user_attr(key="model", value=clf)
    
    y_pred = clf.predict(xdev_basef)
    return quick_f1(ydev_ml, y_pred)

sampler = TPESampler(seed=22)
linearsvc_study = optuna.create_study(sampler=sampler, direction='maximize')
linearsvc_study.optimize(linearsvc_objective, n_trials=50, callbacks=[callback])


clf2 = linearsvc_study.user_attrs['best_model']

print(evaluate(clf2, xtest_basef, ytest_b))

print('train:', quick_f1(ytrain_ml, clf2.predict(xtrain_basef)))
print('dev:  ', quick_f1(ydev_ml  , clf2.predict(xdev_basef)))
print('test: ', quick_f1(ytest_ml , clf2.predict(xtest_basef)))

print(clf2.estimators_[0].get_params())
print(linearsvc_study.best_params)

              precision    recall  f1-score   support

           0       0.38      0.21      0.27        28
           1       0.55      0.59      0.57       175
           2       0.51      0.55      0.53       128
           3       0.00      0.00      0.00        11
           4       0.59      0.30      0.40        43
           5       0.86      0.98      0.91       403
           6       0.50      0.06      0.11        16
           7       0.00      0.00      0.00        53
           8       0.73      0.99      0.84       334
           9       0.00      0.00      0.00         3
          10       0.00      0.00      0.00        45
          11       0.00      0.00      0.00        28
          12       0.00      0.00      0.00         6
          13       0.00      0.00      0.00        11
          14       0.82      0.17      0.28        54
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.75    

## Non-Linear SVM

In [69]:
from sklearn.svm import SVC

def svc_objective(trial):
    params = dict(
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        kernel=trial.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid']),
        gamma=trial.suggest_categorical('gamma', ['auto', 'scale']),
        max_iter=3000,
        random_state=5
    )

    clf = MOC(SVC(**params))
    clf.fit(xtrain_basef, ytrain_b)
    trial.set_user_attr(key="model", value=clf)

    y_pred = clf.predict(xdev_basef)
    return quick_f1(ydev_ml, y_pred)

sampler = TPESampler(seed=22)
svc_study = optuna.create_study(direction='maximize')
svc_study.optimize(svc_objective, n_trials=10, callbacks=[callback])


clf3 = svc_study.user_attrs['best_model']

print(evaluate(clf3, xtest_basef, ytest_b))

print('train:', quick_f1(ytrain_ml, clf3.predict(xtrain_basef)))
print('dev:  ', quick_f1(ydev_ml  , clf3.predict(xdev_basef)))
print('test: ', quick_f1(ytest_ml , clf3.predict(xtest_basef)))

print(clf3.estimators_[0].get_params())
print(svc_study.best_params)

[32m[I 2021-12-23 09:16:24,645][0m A new study created in memory with name: no-name-56f678e4-01c9-4ac2-84db-be36052bb17b[0m

Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.

[32m[I 2021-12-23 09:19:16,461][0m Trial 0 finished with value: 0.6474 and parameters: {'class_weight': 'balanced', 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.52      0.63      0.57       175
           2       0.51      0.45      0.48       128
           3       0.00      0.00      0.00        11
           4       0.80      0.09      0.17        43
           5       0.82      1.00      0.90       403
           6       0.00      0.00      0.00        16
           7       0.00      0.00      0.00        53
           8       0.71      0.98      0.82       334
           9       0.00      0.00      0.00         3
          10       0.00      0.00      0.00        45
          11       0.00      0.00      0.00        28
          12       0.00      0.00      0.00         6
          13       0.00      0.00      0.00        11
          14       1.00      0.07      0.14        54
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.67    

## Logistic Regression

In [70]:
from sklearn.linear_model import LogisticRegression

def logistic_objective(trial):
    params = dict(
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        C=trial.suggest_float('C', 1e-5, 20),
        random_state=5,
        max_iter=200
    )    

    clf = MOC(LogisticRegression(**params))
    clf.fit(xtrain_basef, ytrain_b)
    trial.set_user_attr(key="model", value=clf)

    y_pred = clf.predict(xdev_basef)
    return quick_f1(ydev_ml, y_pred)

sampler = TPESampler(seed=22)
logistic_study = optuna.create_study(sampler=sampler, direction='maximize')
logistic_study.optimize(logistic_objective, n_trials=5, callbacks=[callback])


clf4 = logistic_study.user_attrs['best_model']

print(evaluate(clf4, xtest_basef, ytest_b))

print('train:', quick_f1(ytrain_ml, clf4.predict(xtrain_basef)))
print('dev:  ', quick_f1(ydev_ml  , clf4.predict(xdev_basef)))
print('test:', quick_f1(ytest_ml  , clf4.predict(xtest_basef)))

print(clf4.estimators_[0].get_params())
print(logistic_study.best_params)

[32m[I 2021-12-23 09:36:01,441][0m A new study created in memory with name: no-name-278b51b5-a4d0-42ef-a7c3-38dd87eec8ff[0m
[32m[I 2021-12-23 09:36:50,685][0m Trial 0 finished with value: 0.659 and parameters: {'class_weight': None, 'C': 8.41076650090714}. Best is trial 0 with value: 0.659.[0m
[32m[I 2021-12-23 09:37:39,030][0m Trial 1 finished with value: 0.6671 and parameters: {'class_weight': 'balanced', 'C': 6.777285823434402}. Best is trial 1 with value: 0.6671.[0m
[32m[I 2021-12-23 09:38:19,178][0m Trial 2 finished with value: 0.6579 and parameters: {'class_weight': None, 'C': 4.408098128709346}. Best is trial 1 with value: 0.6671.[0m
[32m[I 2021-12-23 09:39:15,237][0m Trial 3 finished with value: 0.666 and parameters: {'class_weight': 'balanced', 'C': 11.224078321223027}. Best is trial 1 with value: 0.6671.[0m
[32m[I 2021-12-23 09:39:59,739][0m Trial 4 finished with value: 0.6646 and parameters: {'class_weight': 'balanced', 'C': 3.7822352140976876}. Best is tria

              precision    recall  f1-score   support

           0       0.40      0.21      0.28        28
           1       0.55      0.59      0.57       175
           2       0.50      0.66      0.57       128
           3       0.00      0.00      0.00        11
           4       0.52      0.53      0.53        43
           5       0.89      0.93      0.91       403
           6       0.50      0.06      0.11        16
           7       0.27      0.06      0.09        53
           8       0.75      0.96      0.84       334
           9       0.00      0.00      0.00         3
          10       0.44      0.09      0.15        45
          11       0.33      0.07      0.12        28
          12       0.00      0.00      0.00         6
          13       0.00      0.00      0.00        11
          14       0.60      0.44      0.51        54
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.47    

## Random Forest

In [48]:
from sklearn.ensemble import RandomForestClassifier as RFC

def rfc_objective(trial):
    params = dict(
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
        max_depth=trial.suggest_categorical('max_depth', [None, 20, 40, 60, 80, 100]),
        max_features=trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
        n_estimators=trial.suggest_int('n_estimators', 100, 200, step=20),
        n_jobs=-1,
        random_state=5
    )

    clf = RFC(**params)
    clf.fit(xtrain_basef, ytrain_b)
    trial.set_user_attr(key="model", value=clf)
    
    y_pred = clf.predict(xdev_basef)
    return quick_f1(ydev_ml, y_pred)


sampler = TPESampler(seed=22)
rfc_study = optuna.create_study(sampler=sampler, direction='maximize')
rfc_study.optimize(rfc_objective, n_trials=20, callbacks=[callback])


clf5 = rfc_study.user_attrs['best_model']

print(evaluate(clf5, xtest_basef, ytest_b))

print('train:', quick_f1(ytrain_ml, clf5.predict(xtrain_basef)))
print('dev:  ', quick_f1(ydev_ml  , clf5.predict(xdev_basef)))
print('test:', quick_f1(ytest_ml  , clf5.predict(xtest_basef)))

print(clf5.get_params())
print(rfc_study.best_params)

[32m[I 2021-12-23 08:53:14,331][0m A new study created in memory with name: no-name-f86c6ef4-87da-4861-afe4-f9acb041ce0c[0m
[32m[I 2021-12-23 08:53:19,279][0m Trial 0 finished with value: 0.5402 and parameters: {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 160}. Best is trial 0 with value: 0.5402.[0m
[32m[I 2021-12-23 08:53:27,844][0m Trial 1 finished with value: 0.5834 and parameters: {'bootstrap': True, 'max_depth': 60, 'max_features': 'auto', 'n_estimators': 160}. Best is trial 1 with value: 0.5834.[0m
[32m[I 2021-12-23 08:53:31,965][0m Trial 2 finished with value: 0.5131 and parameters: {'bootstrap': False, 'max_depth': 60, 'max_features': 'log2', 'n_estimators': 180}. Best is trial 1 with value: 0.5834.[0m
[32m[I 2021-12-23 08:53:33,640][0m Trial 3 finished with value: 0.4861 and parameters: {'bootstrap': True, 'max_depth': 20, 'max_features': 'log2', 'n_estimators': 100}. Best is trial 1 with value: 0.5834.[0m
[32m[I 2021-12-23 08:5

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.42      0.06      0.11       175
           2       0.53      0.27      0.35       128
           3       0.00      0.00      0.00        11
           4       0.00      0.00      0.00        43
           5       0.81      1.00      0.89       403
           6       0.00      0.00      0.00        16
           7       0.00      0.00      0.00        53
           8       0.70      0.98      0.82       334
           9       0.00      0.00      0.00         3
          10       0.00      0.00      0.00        45
          11       0.00      0.00      0.00        28
          12       0.00      0.00      0.00         6
          13       0.00      0.00      0.00        11
          14       0.00      0.00      0.00        54
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.00    

## TwoStageSimple

In [63]:
clf6 = TwoStageSimple(MOC(LinearSVC(**clf2.estimators_[0].get_params())),
                      MOC(LinearSVC(**clf2.estimators_[0].get_params())))
clf6.fit(xtrain_basef, ytrain_a, ytrain_b)

print(evaluate(clf6, xtest_basef, ytest_b))

print('train:', quick_f1(ytrain_ml, clf6.predict(xtrain_basef)))
print('dev:  ', quick_f1(ydev_ml  , clf6.predict(xdev_basef)))
print('test:', quick_f1(ytest_ml  , clf6.predict(xtest_basef)))

              precision    recall  f1-score   support

           0       0.46      0.21      0.29        28
           1       0.52      0.65      0.57       175
           2       0.46      0.58      0.51       128
           3       0.00      0.00      0.00        11
           4       0.86      0.14      0.24        43
           5       0.85      0.94      0.89       403
           6       0.00      0.00      0.00        16
           7       0.00      0.00      0.00        53
           8       0.72      0.97      0.83       334
           9       0.00      0.00      0.00         3
          10       0.53      0.22      0.31        45
          11       0.19      0.18      0.19        28
          12       0.00      0.00      0.00         6
          13       0.00      0.00      0.00        11
          14       0.54      0.54      0.54        54
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.43    

In [73]:
clf7 = TwoStageSimple(MOC(SVC(**clf3.estimators_[0].get_params())),
                      MOC(SVC(**clf3.estimators_[0].get_params())))
clf7.fit(xtrain_basef, ytrain_a, ytrain_b)

print(evaluate(clf7, xtest_basef, ytest_b))

print('train:', quick_f1(ytrain_ml, clf7.predict(xtrain_basef)))
print('dev:  ', quick_f1(ydev_ml  , clf7.predict(xdev_basef)))
print('test: ', quick_f1(ytest_ml , clf7.predict(xtest_basef)))


Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=3000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=3000).  Consider pre-processing your data wit

              precision    recall  f1-score   support

           0       0.10      0.04      0.05        28
           1       0.39      0.48      0.43       175
           2       0.33      0.44      0.38       128
           3       0.00      0.00      0.00        11
           4       0.20      0.14      0.16        43
           5       0.82      0.95      0.88       403
           6       0.00      0.00      0.00        16
           7       0.33      0.04      0.07        53
           8       0.72      0.97      0.83       334
           9       0.00      0.00      0.00         3
          10       1.00      0.02      0.04        45
          11       0.00      0.00      0.00        28
          12       0.00      0.00      0.00         6
          13       0.00      0.00      0.00        11
          14       1.00      0.11      0.20        54
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.67    

In [74]:
clf8 = TwoStageSimple(MOC(LogisticRegression(**clf4.estimators_[0].get_params())),
                      MOC(LogisticRegression(**clf4.estimators_[0].get_params())))
clf8.fit(xtrain_basef, ytrain_a, ytrain_b)

print(evaluate(clf8, xtest_basef, ytest_b))

print('train:', quick_f1(ytrain_ml, clf8.predict(xtrain_basef)))
print('dev:  ', quick_f1(ydev_ml  , clf8.predict(xdev_basef)))
print('test:', quick_f1(ytest_ml  , clf8.predict(xtest_basef)))


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



              precision    recall  f1-score   support

           0       0.39      0.25      0.30        28
           1       0.52      0.58      0.55       175
           2       0.47      0.62      0.54       128
           3       0.00      0.00      0.00        11
           4       0.48      0.30      0.37        43
           5       0.87      0.94      0.90       403
           6       1.00      0.06      0.12        16
           7       0.20      0.06      0.09        53
           8       0.73      0.95      0.83       334
           9       0.00      0.00      0.00         3
          10       0.42      0.18      0.25        45
          11       0.22      0.14      0.17        28
          12       0.00      0.00      0.00         6
          13       0.00      0.00      0.00        11
          14       0.51      0.39      0.44        54
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.50    

In [67]:
clf9 = TwoStageSimple(RFC(**clf5.get_params()),
                      RFC(**clf5.get_params()))
clf9.fit(xtrain_basef, ytrain_a, ytrain_b)

print(evaluate(clf9, xtest_basef, ytest_b))

print('train:', quick_f1(ytrain_ml, clf9.predict(xtrain_basef)))
print('dev:  ', quick_f1(ydev_ml  , clf9.predict(xdev_basef)))
print('test:', quick_f1(ytest_ml  , clf9.predict(xtest_basef)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.56      0.14      0.23       175
           2       0.46      0.52      0.49       128
           3       0.00      0.00      0.00        11
           4       0.00      0.00      0.00        43
           5       0.81      1.00      0.89       403
           6       0.00      0.00      0.00        16
           7       0.00      0.00      0.00        53
           8       0.69      0.99      0.82       334
           9       0.00      0.00      0.00         3
          10       0.00      0.00      0.00        45
          11       0.00      0.00      0.00        28
          12       0.00      0.00      0.00         6
          13       0.00      0.00      0.00        11
          14       0.00      0.00      0.00        54
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.00    

## TwoStageAdvanced

In [84]:
clf10 = TwoStageAdvanced(MOC(LinearSVC(**clf2.estimators_[0].get_params())),
                         MOC(LinearSVC(**clf2.estimators_[0].get_params())))
clf10.fit(xtrain_a, xtrain_b, ytrain_a, ytrain_b)

yb_true  = mo2ml(ytest_b)

yb_pred  = mo2df(clf10.predict(xtest_a, xtest_b))
yb_pred  = mo2ml(yb_pred)

print(classification_report(yb_true, yb_pred, zero_division=0))

print('train:', quick_f1(ytrain_ml, clf10.predict(xtrain_a, xtrain_b)))
print('dev:  ', quick_f1(ydev_ml  , clf10.predict(xdev_a, xdev_b)))
print('test:', quick_f1(ytest_ml  , clf10.predict(xtest_a, xtest_b)))

              precision    recall  f1-score   support

           0       0.33      0.14      0.20        28
           1       0.52      0.39      0.44       175
           2       0.44      0.56      0.49       128
           3       0.00      0.00      0.00        11
           4       0.60      0.21      0.31        43
           5       0.87      0.95      0.91       403
           6       0.00      0.00      0.00        16
           7       0.00      0.00      0.00        53
           8       0.74      0.88      0.81       334
           9       0.00      0.00      0.00         3
          10       0.50      0.07      0.12        45
          11       0.19      0.18      0.18        28
          12       0.00      0.00      0.00         6
          13       0.00      0.00      0.00        11
          14       0.57      0.48      0.52        54
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.37    

# Export model

In [118]:
from sklearn.pipeline import make_pipeline
import joblib

pipe = make_pipeline(TextCleaner(), vectorizer, clf4)
joblib.dump(pipe, 'pipe.joblib')

['pipe.joblib']

# Draft

## Unicode normalization

In [None]:
import unicodedata

dựng_sẵn = '\u1EA0'
tổ_hợp   = '\u0041\u0323'

print(dựng_sẵn, tổ_hợp)
print(dựng_sẵn == tổ_hợp)

print('-'*10)

dựng_sẵn = unicodedata.normalize('NFC', dựng_sẵn)
tổ_hợp   = unicodedata.normalize('NFC', tổ_hợp)

print(dựng_sẵn, tổ_hợp)
print(dựng_sẵn == tổ_hợp)

Ạ Ạ
False
----------
Ạ Ạ
True


## Test TextCleaner class

In [None]:
texts = ['K khí trong lành. đồ ăn hong ngon, thức uống  K tồi; 🥙🌮',
         'khung cảnh xinh đẹp',
         'khuyến mãi cực sốc giả chỉ 1000 đồng',
         '200k quá mắc',
         'món ăn này mắc quá tới 200k lận. ngày 23/3/2000 😴',
         'mua 100.000vnd',
         'bán 1,000,000 d. 5 cái bành xèo tốn 500k',
         'bán 1.000.000 d. 5 cái bành xèo tốn 500k %^^4',
         'món ăn này có giá 10 lít',
         'món ăn này tận 100 nghìn đồng',
         'bán 1.000đ',
         'quán này có giá trung bình từ 100k-200k 😛',
         'quán này có giá trung bình từ 100-200k 😫',
         '#mắc #food',
         'bàn ghế sạch đẹp, thái độ nhân viên ok#restaurant 😍',
         '#tiktok ở nhà vẫn vui',
         '# birthday ngày mai có tiệc ^^',
         'aslkdhlakd#tiktok#learn asljdalskjd',
         '#tiktok   #learn',
         '#hastag alskjdlasjd #hastag asdsadas #hastag 😁',
         '#123&456',
         '#!?@!']

cleaner = TextCleaner()
for t in cleaner.fit_transform(texts):
    print(t.strip())

không khí trong lành đồ ăn hong ngon thức uống  không tồi
khung cảnh xinh đẹp
khuyến mãi cực sốc giả chỉ giátiền
giátiền quá mắc
món ăn này mắc quá tới giátiền lận ngày 2332000
mua giátiền
bán giátiền 5 cái bành xèo tốn giátiền
bán giátiền 5 cái bành xèo tốn giátiền 4
món ăn này có giá 10 lít
món ăn này tận giátiền
bán giátiền
quán này có giá trung bình từ giátiềngiátiền
quán này có giá trung bình từ 100giátiền
hashtag hashtag
bàn ghế sạch đẹp thái độ nhân viên okhashtag
hashtag ở nhà vẫn vui
birthday ngày mai có tiệc
aslkdhlakdhashtag asljdalskjd
hashtag   hashtag
hashtag alskjdlasjd hashtag asdsadas hashtag
hashtag
hashtag


## Precision-Recall-F1

In [None]:
tmp1 = ydev_ml
tmp2 = mo2ml(mo2df(clf0.predict(Xdev)))
print(precision_score(tmp1, tmp2, average='micro'))
print(recall_score(tmp1, tmp2, average='micro'))
print(f1_score(tmp1, tmp2, average='micro'))
print()
print(precision_score(tmp1, tmp2, average='macro', zero_division=0))
print(recall_score(tmp1, tmp2, average='macro', zero_division=0))
print(f1_score(tmp1, tmp2, average='macro', zero_division=0))

0.6818042344277385
0.645367412140575
0.6630856460757983

0.42682438155100083
0.18689938230658137
0.2106136408551627


In [None]:
def my_prec(target, predict, average='micro'):
    if average == 'micro':
        return (target & predict).values.sum() / predict.values.sum()
    return ((target & predict).sum() / predict.sum()).fillna(0).mean()

def my_reca(target, predict, average='micro'):
    if average == 'micro':
        return (target & predict).values.sum() / target.values.sum()
    return ((target & predict).sum() / target.sum()).fillna(0).mean()

def my_ftes(target, predict, average='micro'):
    if average == 'macro':
        p = ((target & predict).sum() / predict.sum()).fillna(0)
        r = ((target & predict).sum() / target.sum()).fillna(0)
        return (2*p*r / (p+r)).fillna(0).mean()
    else:
        p = my_prec(target, predict)
        r = my_reca(target, predict)
        return 2*p*r / (p+r)

print(my_prec(tmp1, tmp2))
print(my_reca(tmp1, tmp2))
print(my_ftes(tmp1, tmp2))
print()
print(my_prec(tmp1, tmp2, average='macro'))
print(my_reca(tmp1, tmp2, average='macro'))
print(my_ftes(tmp1, tmp2, average='macro'))

0.6818042344277385
0.645367412140575
0.6630856460757983

0.42682438155100083
0.18689938230658137
0.21061364085516276
