In [1]:
import pandas as pd
import numpy as np
from pythainlp import word_tokenize
from tqdm import tqdm_notebook
import re
import emoji

#viz
from plotnine import *
import matplotlib.pyplot as plt
import seaborn as sns

from ast import literal_eval
from collections import Counter

#fastai
import fastai
from fastai.text import *
from fastai.callbacks import CSVLogger

#pythainlp
from pythainlp.ulmfit import *


import numpy as np
from sklearn.model_selection import train_test_split

from fastai.text import *
from fastai.callbacks import CSVLogger, SaveModelCallback
from pythainlp.ulmfit import *


from sklearn.metrics import f1_score

import time



from sklearn.metrics import confusion_matrix

In [2]:
def replace_url(text):
    URL_PATTERN = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
    return re.sub(URL_PATTERN, 'xxurl', text)

def replace_rep(text):
    def _replace_rep(m):
        c,cc = m.groups()
        return f'{c}xxrep'
    re_rep = re.compile(r'(\S)(\1{2,})')
    return re_rep.sub(_replace_rep, text)

def ungroup_emoji(toks):
    res = []
    for tok in toks:
        if emoji.emoji_count(tok) == len(tok):
            for char in tok:
                res.append(char)
        else:
            res.append(tok)
    return res

def process_text(text):
    #pre rules
    res = text.lower().strip()
    res = replace_url(res)
    res = replace_rep(res)
    
    #tokenize
    res = [word for word in word_tokenize(res, engine='ulmfit') if word and not re.search(pattern=r"\s+", string=word)]
    
    #post rules
    res = ungroup_emoji(res)
    
    return res

In [40]:
def train_df(all_df, abuss_type):
    
    train_df, valid_df = train_test_split(all_df, test_size=0.15)
    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)


    y_train = train_df[abuss_type]
    y_valid = valid_df[abuss_type]

    model_path = 'model_ulm_test/'

    train_df, valid_df = train_test_split(all_df, test_size=0.15)

    tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)

    processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
              NumericalizeProcessor(vocab=None, max_vocab=50000, min_freq=2)]

    data_lm = (TextList.from_df(all_df, model_path, cols='comment_text', processor=processor)
      .random_split_by_pct(valid_pct = 0.01, seed = 1412)
      .label_for_lm()
      .databunch(bs=48))

    data_lm.sanity_check()

    len(data_lm.train_ds), len(data_lm.valid_ds)

    config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False, tie_weights=True, out_bias=True,
              output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)

    trn_args = dict(drop_mult=1., clip=0.12, alpha=2, beta=1)

    learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args)

    learn.load_pretrained(**_THWIKI_LSTM)

    #train frozen
    print('training frozen')
    learn.freeze_to(-1)#last layer
    learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))

    #train unfrozen
    print('training unfrozen')
    learn.unfreeze()
    learn.fit_one_cycle(5, 1e-3, moms=(0.8, 0.7))

    learn.model


    learn.save('wisesight_lm')
    learn.save_encoder('wisesight_enc')

    #classification data
    tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)
    processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
              NumericalizeProcessor(vocab=data_lm.vocab, max_vocab=50000, min_freq=20)]

    data_cls = (ItemLists(model_path,train=TextList.from_df(train_df, model_path, cols=['comment_text'], processor=processor),
                      valid=TextList.from_df(valid_df, model_path, cols=['comment_text'], processor=processor))
      .label_from_df(abuss_type)
      .add_test(TextList.from_df(test_df, model_path, cols=['comment_text'], processor=processor))
      .databunch(bs=50)
      )
    data_cls.sanity_check()
    print(len(data_cls.vocab.itos))

    #model
    config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False,
              output_p=0.4, hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5)
    trn_args = dict(bptt=70, drop_mult=0.7, alpha=2, beta=1, max_len=500)

    learn = text_classifier_learner(data_cls, AWD_LSTM, config=config, pretrained=False, **trn_args)
    #load pretrained finetuned model
    learn.load_encoder('wisesight_enc')

    #train unfrozen
    learn.freeze_to(-1)
    learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7))
    learn.freeze_to(-2)
    learn.fit_one_cycle(1, slice(1e-2 / (2.6 ** 4), 1e-2), moms=(0.8, 0.7))
    learn.freeze_to(-3)
    learn.fit_one_cycle(1, slice(5e-3 / (2.6 ** 4), 5e-3), moms=(0.8, 0.7))
    learn.unfreeze()
    learn.fit_one_cycle(10, slice(1e-3 / (2.6 ** 4), 1e-3), moms=(0.8, 0.7), callbacks=[SaveModelCallback(learn, every='improvement', monitor='accuracy', name='bestmodel')])
    
    return learn

def embedding(s, learn):
    sentence = learn.model[0](learn.data.one_item(s)[0])[0][-1][:, -1]
    return sentence.cpu().detach().numpy()

def embedding_df(df, learn):
    for i in range(400):
        df['sent_emb_' + str(i)] = None
    a = np.zeros((len(df),400))
    for i, v in enumerate(df.comment_text):
        a[i] = embedding(v, learn)
    df.iloc[:,df.columns.get_loc('sent_emb_0'):] = a
    return df

def predict_df(df, learn, abuss_type, show = False):
    model_path = 'model_ulm_test/'
    tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)
    processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
              NumericalizeProcessor(vocab=None, max_vocab=50000, min_freq=2)]
    learn.data.add_test(TextList.from_df(df, model_path, cols=['comment_text'], processor=processor))
    tmp = learn.get_preds(DatasetType.Test)
    if show:
        df['pred'] = tmp[0].numpy()[:, 1]
        df.pred = df.pred.apply(round)
        print(f"{abuss_type}: f1 = {f1_score(df[abuss_type], df.pred)}")
        print(pd.crosstab(df[abuss_type], df.pred, rownames=['Actual'], colnames=['Predicted'], margins=True))
    
    return tmp[0].numpy()[:, 1]

In [32]:
def pre_df(df3):
    df3.rude = df3.rude.apply(int)
    df3.figurative = df3.figurative.apply(int)
    df3.offensive = df3.offensive.apply(int)
    df3.dirty = df3.dirty.apply(int)
    return df3

In [33]:
df_1 = pd.read_csv('df_labeled_temp_1.csv')
df_1 = pre_df(df_1)
train_data, test_data = train_test_split(df_1, train_size = 0.7)

In [34]:
learn = train_df(train_data, 'rude')

  warn("`random_split_by_pct` is deprecated, please use `split_by_rand_pct`.")


training frozen


epoch,train_loss,valid_loss,accuracy,time
0,5.151257,4.435395,0.344048,00:13


training unfrozen


epoch,train_loss,valid_loss,accuracy,time
0,4.61726,4.234052,0.358631,00:14
1,4.444029,4.07524,0.370536,00:14
2,4.255073,3.981435,0.381548,00:14
3,4.113658,3.949065,0.3875,00:14
4,4.034777,3.943092,0.390476,00:14


10720


epoch,train_loss,valid_loss,accuracy,time
0,0.329849,0.281769,0.875033,00:18


epoch,train_loss,valid_loss,accuracy,time
0,0.285566,0.237692,0.900339,00:20


epoch,train_loss,valid_loss,accuracy,time
0,0.248531,0.18731,0.92408,00:20


epoch,train_loss,valid_loss,accuracy,time
0,0.227815,0.181737,0.925385,00:22
1,0.223111,0.174922,0.931385,00:20
2,0.193404,0.162962,0.934516,00:22
3,0.170631,0.154713,0.941821,00:20
4,0.169258,0.151448,0.942082,00:22
5,0.143443,0.152075,0.941299,00:20
6,0.15169,0.155464,0.943386,00:20
7,0.139667,0.151763,0.944169,00:21
8,0.137278,0.150742,0.945213,00:22
9,0.138256,0.150055,0.945213,00:21


Better model found at epoch 0 with accuracy value: 0.9253848195075989.
Better model found at epoch 1 with accuracy value: 0.931385338306427.
Better model found at epoch 2 with accuracy value: 0.9345160722732544.
Better model found at epoch 3 with accuracy value: 0.9418210387229919.
Better model found at epoch 4 with accuracy value: 0.9420819282531738.
Better model found at epoch 6 with accuracy value: 0.9433863759040833.
Better model found at epoch 7 with accuracy value: 0.9441690444946289.
Better model found at epoch 8 with accuracy value: 0.9452126026153564.


In [35]:
df_2 = embedding_df(df_1,learn)

In [36]:
df_2

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,comment_id,page_id,thread_level,parent_id,post_id,...,sent_emb_390,sent_emb_391,sent_emb_392,sent_emb_393,sent_emb_394,sent_emb_395,sent_emb_396,sent_emb_397,sent_emb_398,sent_emb_399
0,0,0,21,21,21,1911015469147330,634784363349240,2,793063094188032,793041250856883,...,0.096863,0.002663,-0.007263,-0.018467,0.010095,-0.014836,-0.008404,0.000109,0.017170,-0.000214
1,1,1,1030,1030,1030,771701819657493,634784363349240,2,771409233020085,770898433071165,...,0.096863,0.002663,-0.007263,-0.018467,0.010095,-0.014836,-0.008404,0.000109,0.017170,-0.000214
2,2,2,13358,13358,13358,1592983987529268,634784363349240,2,1592572740903726,1592396844254649,...,0.096863,0.002663,-0.007263,-0.018467,0.010095,-0.014836,-0.008404,0.000109,0.017170,-0.000214
3,3,3,15752,15752,15752,1572979059529761,634784363349240,2,1572910266203307,1572739449553722,...,0.096863,0.002663,-0.007263,-0.018467,0.010095,-0.014836,-0.008404,0.000109,0.017170,-0.000214
4,4,4,20734,20734,20734,865151033645904,634784363349240,2,865033150324359,864999696994371,...,0.096863,0.002663,-0.007263,-0.018467,0.010095,-0.014836,-0.008404,0.000109,0.017170,-0.000214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36497,139930,139930,3320679,3320679,3320679,1332936220215278,180898305419081,1,1332933253548908,1332933253548908,...,-0.018719,0.011463,-0.000982,-0.009881,0.001654,-0.003036,-0.003722,0.000882,0.030370,-0.000778
36498,139931,139931,3321731,3321731,3321731,966590620183175,180898305419081,1,966529650189272,966529650189272,...,-0.005204,0.009819,-0.008097,-0.030218,0.001816,-0.004924,-0.003321,0.001254,0.019805,-0.002331
36499,139932,139932,3324332,3324332,3324332,107347496908078,180898305419081,1,1174176909424544,1174176909424544,...,0.024235,-0.013216,0.004051,-0.034253,-0.007615,0.004179,-0.030486,-0.001520,-0.001727,0.001221
36500,139933,139933,3327047,3327047,3327047,1154405454735023,180898305419081,1,1154252921416943,1154252921416943,...,0.039196,0.014395,0.016445,-0.008457,-0.004077,0.001481,-0.016374,0.018551,0.048063,-0.003061


In [41]:
df_3 = predict_df(df_1, learn, 'rude', show = True)

rude: f1 = 0.9196695797916916
Predicted      0     1    All
Actual                       
0          27478   849  28327
1            493  7682   8175
All        27971  8531  36502


In [45]:
df_3

array([4.761477e-04, 4.761477e-04, 4.761477e-04, 4.761477e-04, ..., 8.527055e-01, 4.976363e-03, 8.830055e-01,
       8.406510e-04], dtype=float32)

In [208]:

df1 = pd.read_csv('df_labeled_temp_1.csv')

df3 = df1[['comment_text','rude', 'offensive', 'figurative', 'dirty']]

df3.dropna(inplace=True)

df3 = df3.sample(frac=1)

df3.rude = df3.rude.apply(int)
df3.figurative = df3.figurative.apply(int)
df3.offensive = df3.offensive.apply(int)
df3.dirty = df3.dirty.apply(int)

df3.reset_index(drop=True, inplace=True)

abuss_types = ['rude', 'offensive', 'figurative', 'dirty']

all_df, test_df = train_test_split(df3, train_size = 0.7, shuffle = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [9]:
learns = {}
for i in abuss_types:
    learns[i] = train_df(all_df, i)

  warn("`random_split_by_pct` is deprecated, please use `split_by_rand_pct`.")


training frozen


epoch,train_loss,valid_loss,accuracy,time
0,5.142308,4.508138,0.340476,00:13


training unfrozen


epoch,train_loss,valid_loss,accuracy,time
0,4.629007,4.311252,0.344643,00:14
1,4.443398,4.168736,0.35625,00:14
2,4.261222,4.097723,0.362202,00:14
3,4.110754,4.05796,0.369048,00:14
4,4.031862,4.054327,0.365774,00:14


10736


epoch,train_loss,valid_loss,accuracy,time
0,0.344612,0.300725,0.877902,00:18


epoch,train_loss,valid_loss,accuracy,time
0,0.283098,0.245839,0.902687,00:20


epoch,train_loss,valid_loss,accuracy,time
0,0.255472,0.201343,0.919645,00:20


epoch,train_loss,valid_loss,accuracy,time
0,0.231677,0.197514,0.923037,00:22
1,0.201311,0.186423,0.92982,00:20
2,0.188684,0.175114,0.935038,00:22
3,0.207269,0.164776,0.935821,00:20
4,0.171407,0.168889,0.938429,00:21
5,0.152816,0.169717,0.938429,00:19
6,0.149489,0.173173,0.940777,00:20
7,0.147367,0.168138,0.939473,00:21
8,0.130099,0.167462,0.940517,00:22
9,0.138029,0.170938,0.938951,00:20


Better model found at epoch 0 with accuracy value: 0.9230368137359619.
Better model found at epoch 1 with accuracy value: 0.9298200011253357.
Better model found at epoch 2 with accuracy value: 0.9350378513336182.
Better model found at epoch 3 with accuracy value: 0.9358205199241638.
Better model found at epoch 4 with accuracy value: 0.9384294152259827.
Better model found at epoch 6 with accuracy value: 0.9407774806022644.


rude: f1 = 0.8753943217665615


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
  warn("`random_split_by_pct` is deprecated, please use `split_by_rand_pct`.")


training frozen


epoch,train_loss,valid_loss,accuracy,time
0,5.142798,4.511869,0.33869,00:13


training unfrozen


epoch,train_loss,valid_loss,accuracy,time
0,4.61972,4.305456,0.346131,00:14
1,4.440465,4.173342,0.355952,00:14
2,4.257493,4.103572,0.361607,00:14
3,4.111989,4.068312,0.363988,00:14
4,4.027159,4.061355,0.3625,00:14


10736


epoch,train_loss,valid_loss,accuracy,time
0,0.104866,0.11486,0.960083,00:18


epoch,train_loss,valid_loss,accuracy,time
0,0.112957,0.099568,0.967128,00:20


epoch,train_loss,valid_loss,accuracy,time
0,0.124994,0.094616,0.971824,00:20


epoch,train_loss,valid_loss,accuracy,time
0,0.093242,0.09598,0.973128,00:22
1,0.087832,0.099116,0.972345,00:21
2,0.085716,0.094918,0.972606,00:24
3,0.076632,0.092634,0.974433,00:24
4,0.077845,0.099607,0.97365,00:29
5,0.059487,0.099749,0.975737,00:23
6,0.04536,0.107485,0.974693,00:20
7,0.049777,0.105038,0.975215,00:21
8,0.049694,0.106291,0.974954,00:22
9,0.042292,0.104267,0.974954,00:20


Better model found at epoch 0 with accuracy value: 0.973128080368042.
Better model found at epoch 3 with accuracy value: 0.9744325876235962.
Better model found at epoch 5 with accuracy value: 0.9757370352745056.


offensive: f1 = 0.6164383561643836


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
  warn("`random_split_by_pct` is deprecated, please use `split_by_rand_pct`.")


training frozen


epoch,train_loss,valid_loss,accuracy,time
0,5.143902,4.505757,0.335714,00:13


training unfrozen


epoch,train_loss,valid_loss,accuracy,time
0,4.614824,4.308103,0.347024,00:14
1,4.444665,4.173145,0.35625,00:14
2,4.258959,4.101288,0.361905,00:14
3,4.114602,4.063487,0.363393,00:14
4,4.029469,4.05686,0.368452,00:14


10736


epoch,train_loss,valid_loss,accuracy,time
0,0.139629,0.139929,0.954866,00:18


epoch,train_loss,valid_loss,accuracy,time
0,0.139268,0.123608,0.961649,00:20


epoch,train_loss,valid_loss,accuracy,time
0,0.126058,0.112622,0.966867,00:20


epoch,train_loss,valid_loss,accuracy,time
0,0.114697,0.101237,0.969476,00:22
1,0.102772,0.095724,0.971041,00:20
2,0.085544,0.101908,0.972606,00:22
3,0.100864,0.087979,0.975215,00:20
4,0.077152,0.08751,0.977302,00:22
5,0.068109,0.092933,0.977563,00:19
6,0.072432,0.094072,0.977041,00:20
7,0.066342,0.093517,0.976781,00:21
8,0.060342,0.094987,0.979129,00:22
9,0.059711,0.0936,0.979129,00:20


Better model found at epoch 0 with accuracy value: 0.9694756269454956.
Better model found at epoch 1 with accuracy value: 0.9710409641265869.
Better model found at epoch 2 with accuracy value: 0.9726063013076782.
Better model found at epoch 3 with accuracy value: 0.9752152562141418.
Better model found at epoch 4 with accuracy value: 0.9773023724555969.
Better model found at epoch 5 with accuracy value: 0.9775632619857788.
Better model found at epoch 8 with accuracy value: 0.9791285991668701.


figurative: f1 = 0.7199074074074073


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
  warn("`random_split_by_pct` is deprecated, please use `split_by_rand_pct`.")


training frozen


epoch,train_loss,valid_loss,accuracy,time
0,5.142696,4.511,0.335119,00:13


training unfrozen


epoch,train_loss,valid_loss,accuracy,time
0,4.623894,4.30259,0.345238,00:14
1,4.43942,4.170606,0.354167,00:14
2,4.261703,4.098464,0.360417,00:14
3,4.115185,4.0629,0.365476,00:14
4,4.028567,4.052834,0.36875,00:14


10736


epoch,train_loss,valid_loss,accuracy,time
0,0.077571,0.0782,0.981216,00:18


epoch,train_loss,valid_loss,accuracy,time
0,0.077288,0.075945,0.977563,00:20


epoch,train_loss,valid_loss,accuracy,time
0,0.061331,0.080462,0.97365,00:20


epoch,train_loss,valid_loss,accuracy,time
0,0.056575,0.078466,0.974172,00:22
1,0.058166,0.070727,0.978085,00:20
2,0.056636,0.075884,0.975476,00:21
3,0.048403,0.071689,0.975476,00:20
4,0.036508,0.080651,0.974172,00:22
5,0.027291,0.080901,0.979389,00:19
6,0.029609,0.091744,0.975215,00:20
7,0.027158,0.088585,0.978085,00:21
8,0.025732,0.087571,0.977824,00:22
9,0.024314,0.085796,0.97652,00:20


Better model found at epoch 0 with accuracy value: 0.9741716384887695.
Better model found at epoch 1 with accuracy value: 0.9780850410461426.
Better model found at epoch 5 with accuracy value: 0.979389488697052.


dirty: f1 = 0.3228346456692914


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [None]:
# F1 and confusion matrix
for abuss_type in abuss_types:
    tmp = learns[abuss_type].get_preds(DatasetType.Test)
    test_df['pred'] = tmp[0].numpy()[:, 1]
    test_df.pred = test_df.pred.apply(round)
    print(f"{abuss_type}: f1 = {f1_score(test_df[abuss_type], test_df.pred)}")
    print(pd.crosstab(test_df[abuss_type], test_df.pred, rownames=['Actual'], colnames=['Predicted'], margins=True))

In [41]:
learn_tmp = learns['rude']

In [287]:
test_lm = TextList.from_df(test_df, model_path, cols='comment_text', processor=processor)

In [337]:
a = learn_tmp.model[0](learn_tmp.data.one_item("ไอหน้าโง่")[0])[0][-1][:, -1]

a.cpu().detach().numpy()

In [374]:
def embedding_df(df, abuss_type):
    a = np.zeros((len(df),400))
    for i, v in enumerate(df.comment_text):
        a[i] = embedding(v, abuss_type)
    df.iloc[:,df.columns.get_loc('sent_emb_0'):] = a
def embedding(s, abuss_type):
    sentence = learns[abuss_type].model[0](learns[abuss_type].data.one_item(s)[0])[0][-1][:, -1]
    return sentence.cpu().detach().numpy()



In [513]:
df_1 = test_df.copy()

In [514]:
for i in range(400):
    df_1['sent_emb_' + str(i)] = None

In [515]:
df_2 = df_1.iloc[:1000]

In [516]:
start = time.time_ns()
a = np.zeros((len(df_2),400))
for i, v in enumerate(df_2.comment_text):
    a[i] = embedding(v, 'rude')
df_2.iloc[:,df_2.columns.get_loc('sent_emb_0'):] = a
t = time.time_ns() - start

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [517]:
t

6192723440

In [518]:
df_2

Unnamed: 0,comment_text,rude,offensive,figurative,dirty,sent_emb_0,sent_emb_1,sent_emb_2,sent_emb_3,sent_emb_4,...,sent_emb_390,sent_emb_391,sent_emb_392,sent_emb_393,sent_emb_394,sent_emb_395,sent_emb_396,sent_emb_397,sent_emb_398,sent_emb_399
4657,ไม่เรียกการ์เด็นยาร์ทไปเลยละสัสทไอ้ควาย,1,0,1,0,0.008896,0.034835,0.091158,-0.001808,-0.002071,...,-0.023483,0.003060,0.011713,-0.012353,-0.016319,0.033031,-0.001937,0.087386,0.031454,0.009348
3989,อยากลองโดนหมาเย็ดบ้างจัง,0,0,0,0,-0.002927,-0.008596,-0.024920,0.014813,-0.009668,...,-0.009464,0.054030,-0.001190,0.006093,0.000166,-0.016309,-0.005888,0.022392,0.008179,0.007481
2061,ช่วยเเปลไทยเป็นไทยที มันหมายความว่าไง,0,0,0,0,-0.016566,-0.023509,0.011421,0.029301,-0.022250,...,0.028772,0.066345,-0.016508,0.004403,-0.007536,-0.039450,-0.021828,0.003672,0.021948,0.004249
14789,Nattii Memory,0,0,0,0,-0.015504,-0.009954,-0.008057,0.005284,-0.016166,...,0.022626,-0.013803,0.002958,-0.011930,-0.011034,0.000881,0.003108,-0.009486,0.009031,0.010626
30359,อีดอกหมั่นไส้แอดมิน,1,0,0,0,-0.002542,-0.024288,0.010666,-0.041654,-0.052078,...,0.021117,0.013166,0.007771,-0.014121,0.034591,0.001021,-0.022170,0.025837,0.036038,0.007216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1549,โจรที่ถูกต้องตามกฎหมาย....,0,0,0,0,-0.004151,-0.000587,0.016885,0.018948,-0.012675,...,0.046218,-0.010283,-0.018737,-0.003777,-0.017242,-0.024416,-0.024064,0.021981,-0.008406,0.020191
13648,เฮ้ออออ,0,0,0,0,-0.003531,-0.000743,0.011149,0.018602,-0.006672,...,0.014051,-0.004096,-0.000855,-0.009593,-0.001904,-0.003632,0.001257,-0.002902,-0.038019,0.010529
26632,จุ๊บจิ๊บของนัทNutthaphol Thanakijsoonthorn,0,0,0,0,-0.007395,-0.016608,-0.016472,-0.001412,-0.021891,...,0.019551,-0.013215,0.011810,-0.010079,-0.014069,0.005176,-0.004763,-0.001546,-0.001750,0.011939
15313,อีกะเทย,1,1,0,0,0.030308,0.128397,0.298503,0.037120,0.031057,...,-0.013818,0.001193,0.023193,0.000860,-0.042324,0.098481,0.010621,0.132203,-0.036891,-0.010427


In [519]:
f"for 1000000 rec = {((t/1e9) * 1e3)/ 60} mins"

'for 1000000 rec = 103.21205733333333 mins'

In [490]:
a = np.zeros((1000,400))

In [493]:
df_2.iat[0, 5]

array([[ 0.008896,  0.034835,  0.091158, -0.001808, ..., -0.001937,  0.087386,  0.031454,  0.009348]], dtype=float32)

In [494]:
a[0] = df_2.iat[0, 5]

In [496]:
a

array([[ 0.008896,  0.034835,  0.091158, -0.001808, ..., -0.001937,  0.087386,  0.031454,  0.009348],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       ...,
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ]])

In [489]:
a

array([4.643993e-310, 4.643991e-310, 4.643993e-310, 4.643991e-310, ..., 0.000000e+000, 0.000000e+000, 0.000000e+000,
       0.000000e+000])

In [482]:
df_2.iloc[:,6:] = a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [483]:
df_2

Unnamed: 0,comment_text,rude,offensive,figurative,dirty,sent_emb_all,sent_emb_0,sent_emb_1,sent_emb_2,sent_emb_3,...,sent_emb_390,sent_emb_391,sent_emb_392,sent_emb_393,sent_emb_394,sent_emb_395,sent_emb_396,sent_emb_397,sent_emb_398,sent_emb_399
4657,ไม่เรียกการ์เด็นยาร์ทไปเลยละสัสทไอ้ควาย,1,0,1,0,"[[0.008896432, 0.034834858, 0.091157936, -0.00...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3989,อยากลองโดนหมาเย็ดบ้างจัง,0,0,0,0,"[[-0.002927174, -0.008595865, -0.024919897, 0....",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2061,ช่วยเเปลไทยเป็นไทยที มันหมายความว่าไง,0,0,0,0,"[[-0.016566131, -0.023509227, 0.011420618, 0.0...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14789,Nattii Memory,0,0,0,0,"[[-0.015504351, -0.009953583, -0.008057029, 0....",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30359,อีดอกหมั่นไส้แอดมิน,1,0,0,0,"[[-0.0025417486, -0.024287757, 0.010665811, -0...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1549,โจรที่ถูกต้องตามกฎหมาย....,0,0,0,0,"[[-0.004151395, -0.0005865164, 0.016884968, 0....",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13648,เฮ้ออออ,0,0,0,0,"[[-0.0035308173, -0.0007431547, 0.011149167, 0...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26632,จุ๊บจิ๊บของนัทNutthaphol Thanakijsoonthorn,0,0,0,0,"[[-0.0073947655, -0.01660846, -0.01647233, -0....",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15313,อีกะเทย,1,1,0,0,"[[0.030308114, 0.12839684, 0.29850256, 0.03712...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [461]:
t / 1e9

112.772541

In [438]:
df_2.at[df_2.index[0], 'sent_emb_all']

array([[ 0.008896,  0.034835,  0.091158, -0.001808, ..., -0.001937,  0.087386,  0.031454,  0.009348]], dtype=float32)

In [449]:
df_2.columns.get_loc('sent_emb_all')

5

In [451]:
df_2.iat[0, 5]

array([[ 0.008896,  0.034835,  0.091158, -0.001808, ..., -0.001937,  0.087386,  0.031454,  0.009348]], dtype=float32)

In [452]:
for i in range(len(df_2)):
    df_2.iloc[i,6:] = df_2.iat[i, 5]

In [453]:
df_2

Unnamed: 0,comment_text,rude,offensive,figurative,dirty,sent_emb_all,sent_emb_0,sent_emb_1,sent_emb_2,sent_emb_3,...,sent_emb_390,sent_emb_391,sent_emb_392,sent_emb_393,sent_emb_394,sent_emb_395,sent_emb_396,sent_emb_397,sent_emb_398,sent_emb_399
4657,ไม่เรียกการ์เด็นยาร์ทไปเลยละสัสทไอ้ควาย,1,0,1,0,"[[0.008896432, 0.034834858, 0.091157936, -0.00...",[0.008896431885659695],[0.034834858030080795],[0.0911579355597496],[-0.0018082483438774943],...,[-0.023482704535126686],[0.00305978674441576],[0.0117125753313303],[-0.012352812103927135],[-0.01631900854408741],[0.03303085267543793],[-0.0019370714435353875],[0.08738567680120468],[0.03145422413945198],[0.00934758223593235]
3989,อยากลองโดนหมาเย็ดบ้างจัง,0,0,0,0,"[[-0.002927174, -0.008595865, -0.024919897, 0....",[-0.0029271740932017565],[-0.008595865219831467],[-0.024919897317886353],[0.014813334681093693],...,[-0.009463699534535408],[0.054029546678066254],[-0.0011904219863936305],[0.006092878058552742],[0.00016551690350752324],[-0.016308855265378952],[-0.0058881076984107494],[0.02239229716360569],[0.008179351687431335],[0.0074812620878219604]
2061,ช่วยเเปลไทยเป็นไทยที มันหมายความว่าไง,0,0,0,0,"[[-0.016566131, -0.023509227, 0.011420618, 0.0...",[-0.01656613126397133],[-0.023509226739406586],[0.011420617811381817],[0.02930143103003502],...,[0.02877231501042843],[0.06634528189897537],[-0.016507582738995552],[0.00440271245315671],[-0.007535697892308235],[-0.039450112730264664],[-0.021828308701515198],[0.0036722985096275806],[0.021948155015707016],[0.004249427001923323]
14789,Nattii Memory,0,0,0,0,"[[-0.015504351, -0.009953583, -0.008057029, 0....",[-0.015504350885748863],[-0.009953582659363747],[-0.008057028986513615],[0.00528378551825881],...,[0.022625554352998734],[-0.013803248293697834],[0.002957971068099141],[-0.01193039771169424],[-0.011033503338694572],[0.0008806283003650606],[0.0031080686021596193],[-0.009485781192779541],[0.009031379595398903],[0.010626038536429405]
30359,อีดอกหมั่นไส้แอดมิน,1,0,0,0,"[[-0.0025417486, -0.024287757, 0.010665811, -0...",[-0.0025417485740035772],[-0.02428775653243065],[0.01066581066697836],[-0.041654329746961594],...,[0.021116936579346657],[0.013166447170078754],[0.007770536933094263],[-0.014120913110673428],[0.034590739756822586],[0.0010206043953076005],[-0.022170258685946465],[0.025836827233433723],[0.036037661135196686],[0.007216327358037233]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1549,โจรที่ถูกต้องตามกฎหมาย....,0,0,0,0,"[[-0.004151395, -0.0005865164, 0.016884968, 0....",[-0.004151395056396723],[-0.0005865163984708488],[0.01688496768474579],[0.018948189914226532],...,[0.046218011528253555],[-0.010283076204359531],[-0.018736736848950386],[-0.003777463687583804],[-0.017242029309272766],[-0.024415727704763412],[-0.024063866585493088],[0.02198072150349617],[-0.008406125009059906],[0.020191393792629242]
13648,เฮ้ออออ,0,0,0,0,"[[-0.0035308173, -0.0007431547, 0.011149167, 0...",[-0.003530817339196801],[-0.0007431547273881733],[0.011149167083203793],[0.018602274358272552],...,[0.014050746336579323],[-0.0040964060463011265],[-0.0008547122706659138],[-0.009592968970537186],[-0.0019040307961404324],[-0.0036317293997853994],[0.001257276046089828],[-0.0029015913605690002],[-0.03801877424120903],[0.010529018938541412]
26632,จุ๊บจิ๊บของนัทNutthaphol Thanakijsoonthorn,0,0,0,0,"[[-0.0073947655, -0.01660846, -0.01647233, -0....",[-0.007394765503704548],[-0.016608459874987602],[-0.016472330316901207],[-0.0014124728040769696],...,[0.019550777971744537],[-0.013215483166277409],[0.011809746734797955],[-0.010079422034323215],[-0.014069108292460442],[0.005176058504730463],[-0.004762914497405291],[-0.0015457301633432508],[-0.0017501935362815857],[0.011939071118831635]
15313,อีกะเทย,1,1,0,0,"[[0.030308114, 0.12839684, 0.29850256, 0.03712...",[0.030308114364743233],[0.12839683890342712],[0.2985025644302368],[0.03712006285786629],...,[-0.01381818950176239],[0.0011926997685804963],[0.023192621767520905],[0.0008601141162216663],[-0.04232392832636833],[0.0984814390540123],[0.01062057726085186],[0.13220319151878357],[-0.03689086064696312],[-0.010427355766296387]


In [412]:
df_1.iloc[:, 6:] = df_1.comment_text.apply(lambda s: embedding(s, 'rude'))

ValueError: Must have equal len keys and value when setting with an iterable

In [385]:
df_tmp = df_1.iloc[:5]

In [390]:
df_tmp.iloc[0,6:] = df_tmp.sent_emb_all[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [396]:
df_tmp.at[df_tmp.index[0], 'sent_emb_all']

array([[ 0.008896,  0.034835,  0.091158, -0.001808, ..., -0.001937,  0.087386,  0.031454,  0.009348]], dtype=float32)

In [397]:
df_tmp.iloc[0, 6:] = df_tmp.at[df_tmp.index[0], 'sent_emb_all']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [403]:
df_tmp.sent_emb_0[4657][0]

0.008896431885659695

In [351]:
b = embedding("หน้าหมา")

In [353]:
print(a)
print(b)

[[-0.001561 -0.006691 -0.038699  0.022102 ... -0.00047   0.011359  0.087136 -0.003541]]
[[-0.002986  0.05515  -0.001198 -0.0009   ...  0.001058  0.003904  0.002604  0.002235]]


In [289]:
learn_tmp.data.one_item(test_lm[0])

(tensor([[   2,    9,   15,  313,  184,    0,  453, 5202,   11,   12,   83,    0,
            59,   76]], device='cuda:0'), tensor([0], device='cuda:0'))

In [245]:
type(data_lm[0])

str

In [187]:
awd_lstm = learn_tmp.model[0] # เลือก layer แรก ที่ทำการ encoder

In [246]:
data_lm[0]

'ไม่เรียกการ์เด็นยาร์ทไปเลยละสัสทไอ้ควาย'

In [247]:
xb, yb = learn_tmp.data.one_item(data_lm[0])

In [250]:
xb

tensor([[   2,    9,   15,  313,  184,    0,  453, 5202,   11,   12,   83,    0,
           59,   76]], device='cuda:0')

In [251]:
xb, yb = learn_tmp.data.one_item(data_lm[0]) # เอา data_lm ตัวแรกออกมา

In [222]:
xb.shape

torch.Size([1, 17])

In [284]:
start = time.time_ns()
awd_lstm.reset()
awd_lstm.eval()
with torch.no_grad():
    out = awd_lstm.eval()(xb)
    encoded_rep = out[0][-1]  # last layer?
    sentence_rep = encoded_rep[:, -1, :]  # last token?
end = time.time_ns()
print((end - start) / 1e9)

0.007349208


1000000000.0

In [268]:
start

<function time.time_ns>

In [211]:
sentence_rep

array([[-0.001266, -0.013044, -0.005869,  0.016881, ..., -0.021768, -0.039975,  0.003019,  0.027363]], dtype=float32)

In [219]:
awd_lstm

MultiBatchEncoder(
  (module): AWD_LSTM(
    (encoder): Embedding(10736, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(10736, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1550, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(1550, 1550, batch_first=True)
      )
      (2): WeightDropout(
        (module): LSTM(1550, 1550, batch_first=True)
      )
      (3): WeightDropout(
        (module): LSTM(1550, 400, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
      (3): RNNDropout()
    )
  )
)

In [214]:
out[0][-1].shape

torch.Size([1, 17, 400])

In [202]:
sentence_rep = sentence_rep.cpu().detach().numpy()

In [210]:
sentence_rep.shape

array([[-0.001266, -0.013044, -0.005869,  0.016881, ..., -0.021768, -0.039975,  0.003019,  0.027363]], dtype=float32)