In [1]:
#imports
from datasets import load_dataset
from thai2transformers.metrics import classification_metrics
from pythainlp.ulmfit import process_thai
import pandas as pd

Corpus: wiki_lm_lstm
- Already up to date.


In [2]:
#parameters
class Args:
    """Notebook-wide settings, kept as plain class attributes."""

    # Dataset identifier handed to `load_dataset`; later cells also branch on it.
    dataset_name_or_path = "wisesight_sentiment"
    # Column that later cells read the input texts from.
    feature_col = "texts"
    # Column that later cells read the class labels from.
    label_col = "category"
    # Name of the model-selection metric.
    metric_for_best_model = "f1_micro"
    # Random seed forwarded to the classifiers during the hyperparameter search.
    seed = 1412


args = Args()

In [3]:
# Load the dataset selected in Args via `datasets.load_dataset`.
dataset = load_dataset(args.dataset_name_or_path)
# Bare expression so the notebook displays the available splits and their sizes.
dataset

Reusing dataset wisesight_sentiment (/Users/admin/.cache/huggingface/datasets/wisesight_sentiment/wisesight_sentiment/1.0.0/4bb1772cff1a0703d72fb9e84dff9348e80f6cdf80b0f6c0f59bcd85fc5a3537)


DatasetDict({
    train: Dataset({
        features: ['texts', 'category'],
        num_rows: 21628
    })
    validation: Dataset({
        features: ['texts', 'category'],
        num_rows: 2404
    })
    test: Dataset({
        features: ['texts', 'category'],
        num_rows: 2671
    })
})

In [4]:
# For wongnai_reviews only: hold out 10% of the train split (shuffled, fixed seed 2020)
# and use it as the validation split. Every other dataset passes through unchanged.
if args.dataset_name_or_path == 'wongnai_reviews':
    train_val_split = dataset['train'].train_test_split(
        test_size=0.1,
        shuffle=True,
        seed=2020,
    )
    dataset['train'], dataset['validation'] = (
        train_val_split['train'],
        train_val_split['test'],
    )
dataset

DatasetDict({
    train: Dataset({
        features: ['texts', 'category'],
        num_rows: 21628
    })
    validation: Dataset({
        features: ['texts', 'category'],
        num_rows: 2404
    })
    test: Dataset({
        features: ['texts', 'category'],
        num_rows: 2671
    })
})

In [5]:
#nbsvm class
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse

class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier: logistic regression on features scaled by a naive-Bayes log-count ratio.

    `fit` computes, per feature, the log of the smoothed mean feature value among
    positive rows (y == 1) divided by the same quantity among negative rows (y == 0),
    multiplies every row of `x` by that ratio vector and fits a liblinear
    `LogisticRegression` on the scaled features. Prediction applies the same scaling.

    Parameters
    ----------
    penalty : str
        Penalty forwarded to `LogisticRegression` ('l1' or 'l2').
    C : float
        Inverse regularisation strength forwarded to `LogisticRegression`.
    dual : bool
        Forwarded to `LogisticRegression(dual=...)`.
    seed : int
        Forwarded to `LogisticRegression(random_state=...)`.
    """

    def __init__(self, penalty='l2', C=1.0, dual=False, seed=1412):
        self.penalty = penalty
        self.C = C
        self.dual = dual
        self.seed = seed

    def predict(self, x):
        """Return the predicted label for each row of the sparse matrix `x`."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        """Return per-class probabilities for each row of the sparse matrix `x`."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        """Fit the log-count ratio and the logistic regression; returns `self`.

        `x` is a sparse feature matrix (it must support `.multiply`). `y` holds binary
        0/1 labels and may be a scipy sparse column, an ndarray, or any array-like
        such as a list or pandas Series.
        """
        # Flatten the labels to a 1-D ndarray. Only scipy sparse inputs need
        # densifying; the previous `type(y) != np.ndarray` test sent lists and
        # Series to `.toarray()` and crashed with AttributeError.
        y = y.toarray().ravel() if sparse.issparse(y) else np.asarray(y).ravel()
        # Check that X and y have correct shape
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            # Add-one smoothed mean feature value over the rows labelled y_i.
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        # One ratio per feature, stored as a 1 x n_features sparse row so it
        # broadcasts across rows in `multiply`.
        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(penalty = self.penalty,
                                       C=self.C,
                                       dual=self.dual,
                                       solver='liblinear',
                                       random_state=self.seed,).fit(x_nb, y)
        return self

In [6]:
# Every dataset except generated_reviews_enth uses the feature column as-is;
# for generated_reviews_enth each feature value is indexed by 'th' first.
if args.dataset_name_or_path != 'generated_reviews_enth':
    texts_train = dataset['train'][args.feature_col]
    texts_valid = dataset['validation'][args.feature_col]
    texts_test = dataset['test'][args.feature_col]
else:
    texts_train = [pair['th'] for pair in dataset['train'][args.feature_col]]
    texts_valid = [pair['th'] for pair in dataset['validation'][args.feature_col]]
    texts_test = [pair['th'] for pair in dataset['test'][args.feature_col]]

In [7]:
#x
from sklearn.feature_extraction.text import TfidfVectorizer

# Uni- and bi-gram TF-IDF features over `process_thai` tokens. The vocabulary and
# idf weights are fitted on the training texts only, then reused for validation/test.
# The three flags are real booleans: recent scikit-learn versions validate them
# and reject ints such as 1.
# NOTE(review): strip_accents='unicode' removes Unicode combining characters, which
# may include Thai tone marks - confirm this is intended before changing it.
tfidf = TfidfVectorizer(ngram_range=(1,2), tokenizer=process_thai,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True)

x_train = tfidf.fit_transform(texts_train)
x_valid = tfidf.transform(texts_valid)
x_test = tfidf.transform(texts_test)
x_train,x_valid,x_test

(<21628x38120 sparse matrix of type '<class 'numpy.float64'>'
 	with 684096 stored elements in Compressed Sparse Row format>,
 <2404x38120 sparse matrix of type '<class 'numpy.float64'>'
 	with 72836 stored elements in Compressed Sparse Row format>,
 <2671x38120 sparse matrix of type '<class 'numpy.float64'>'
 	with 80819 stored elements in Compressed Sparse Row format>)

In [14]:
#y
# Labels are taken as-is, except for generated_reviews_enth's 'review_star' column,
# where every value is shifted down by one.
if args.dataset_name_or_path != 'generated_reviews_enth' or args.label_col != 'review_star':
    labels_train = dataset['train'][args.label_col]
    labels_valid = dataset['validation'][args.label_col]
    labels_test = dataset['test'][args.label_col]
else:
    labels_train = [star - 1 for star in dataset['train'][args.label_col]]
    labels_valid = [star - 1 for star in dataset['validation'][args.label_col]]
    labels_test = [star - 1 for star in dataset['test'][args.label_col]]


from sklearn.preprocessing import OneHotEncoder
import numpy as np

# One indicator column per label seen in train; the encoder is fitted on train
# only and reused for validation and test.
enc = OneHotEncoder(handle_unknown='ignore')
y_train = enc.fit_transform(np.array(labels_train).reshape(-1, 1))
y_valid = enc.transform(np.array(labels_valid).reshape(-1, 1))
y_test = enc.transform(np.array(labels_test).reshape(-1, 1))
y_train,y_valid,y_test

(<21628x4 sparse matrix of type '<class 'numpy.float64'>'
 	with 21628 stored elements in Compressed Sparse Row format>,
 <2404x4 sparse matrix of type '<class 'numpy.float64'>'
 	with 2404 stored elements in Compressed Sparse Row format>,
 <2671x4 sparse matrix of type '<class 'numpy.float64'>'
 	with 2671 stored elements in Compressed Sparse Row format>)

In [15]:
#validation
from sklearn.metrics import f1_score, accuracy_score

def validation_f1(penalty, C, seed):
    """Train one-vs-rest NbSvmClassifier models and return micro-F1 on the validation split.

    One binary model is fitted per column of `y_train`; each model's positive-class
    probability on `x_valid` fills one column of a score matrix, and the predicted
    class is the column with the highest score.

    Reads the notebook globals `x_train`, `y_train`, `x_valid`, `y_valid`,
    `labels_valid` and `enc`. Column indices are compared directly against
    `labels_valid`, so the labels are expected to be the integers 0..n_classes-1.

    Parameters
    ----------
    penalty : str
        'l1' fits an l1 model with dual=False; any other value fits an l2 model
        with dual=True.
    C : float
        Inverse regularisation strength passed to each model.
    seed : int
        Random seed passed to each model.
    """
    use_l1 = penalty == 'l1'
    probs = np.zeros((x_valid.shape[0], y_valid.shape[1]))
    for i in range(len(enc.categories_[0])):
        model = NbSvmClassifier(penalty='l1' if use_l1 else 'l2',
                                C=C,
                                dual=not use_l1,
                                seed=seed).fit(x_train, y_train[:,i])
        probs[:,i] = model.predict_proba(x_valid)[:,1]

    # Pick the winning class once, after every column has been filled in
    # (previously recomputed inside the loop on partially filled scores).
    preds = probs.argmax(1)
    return f1_score(labels_valid, preds, average='micro')

In [16]:
# Grid search over penalty x C, scored by micro-F1 on the validation split.
hyperparams = []
for p in ['l1','l2']:
    for c in range(1,5):
        hyp = {'dataset':args.dataset_name_or_path,
               'penalty':p,
               'C':c,
               'f1_micro':validation_f1(p,c,seed=args.seed)}
        # Mirrors validation_f1: l2 models are fitted with dual=True, l1 with dual=False.
        hyp['dual'] = (p == 'l2')
        hyperparams.append(hyp)
hyperparams_df = (pd.DataFrame(hyperparams)
                  .sort_values('f1_micro', ascending=False)
                  .reset_index(drop=True))
# Keep only the constructor arguments (penalty, C, dual) of the top-scoring row.
# `columns=` replaces the positional axis argument, which current pandas no longer accepts.
best_hyperparams = hyperparams_df.drop(columns=['f1_micro','dataset']).iloc[0,:].to_dict()
hyperparams_df

Unnamed: 0,dataset,penalty,C,f1_micro,dual
0,wisesight_sentiment,l2,3,0.720466,True
1,wisesight_sentiment,l2,2,0.718386,True
2,wisesight_sentiment,l2,4,0.715474,True
3,wisesight_sentiment,l1,2,0.710067,False
4,wisesight_sentiment,l1,3,0.707571,False
5,wisesight_sentiment,l2,1,0.707571,True
6,wisesight_sentiment,l1,1,0.706323,False
7,wisesight_sentiment,l1,4,0.705075,False


In [17]:
#test
# Refit one binary model per class with the best validation hyperparameters and
# score the test split. The seed comes from args so the final models use the same
# seed as the hyperparameter search; a 'seed' key in best_hyperparams, if present,
# takes precedence.
final_params = {'seed': args.seed, **best_hyperparams}
probs = np.zeros((x_test.shape[0], y_test.shape[1]))
for i in range(len(enc.categories_[0])):
    model = NbSvmClassifier(**final_params).fit(x_train, y_train[:,i])
    probs[:,i] = model.predict_proba(x_test)[:,1]

# Minimal container exposing the two attributes handed to classification_metrics.
class Preds:
    label_ids = labels_test
    predictions = probs

pd.DataFrame.from_dict(classification_metrics(Preds),orient='index').transpose()

Unnamed: 0,accuracy,f1_micro,precision_micro,recall_micro,f1_macro,precision_macro,recall_macro,nb_samples
0,0.720329,0.720329,0.720329,0.720329,0.546664,0.661797,0.511304,2671.0


In [18]:
from collections import Counter
# Count how many test examples carry each label id.
Counter(labels_test)

Counter({2: 683, 1: 1453, 0: 478, 3: 57})