In [1]:
from sklearn.ensemble import RandomForestClassifier
import os, json, re, sys
import numpy as np
from tqdm import tqdm
from tokenize_tag import tokenize
import parmap
import pickle
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
def clean_sentence(sent):
    sent = sent.strip()
    sent = re.sub('{laughing}|{clearing}|{singing}|{applauding}', '', sent)
    sent = re.sub('[(][(].*?[)][)]|-.*?-', '', sent)
    return sent

In [3]:
get_main_topic = lambda x: x.split(' > ')[0]

def load_data():
    file_list = [f_name for f_name in os.listdir('data') if f_name[-5:] == '.json']
    
    total_data = []
    for f_name in tqdm(file_list):
        with open('data/%s' %f_name, 'r') as f:
            data = json.loads(f.read())['document'][0]
            metadata = data['metadata']
            utterance = data['utterance']
            
            topic = get_main_topic(metadata['topic'])
            if topic[:4] == 'NWRW':
                continue

            last_speaker = None
            seg1 = seg2 = ''
            for u in utterance:
                if last_speaker is None:
                    last_speaker = u['speaker_id']
                    seg2 = u['form']
                elif last_speaker == u['speaker_id']:
                    seg2 += ' ' + u['original_form']
                else:
                    if seg1 and seg2:
                        total_data.append([f_name, topic, clean_sentence(seg1), clean_sentence(seg2)])
                    last_speaker = u['speaker_id']
                    seg1 = seg2
                    seg2 = u['original_form']
            if seg1 and seg2:
                total_data.append([f_name, topic, clean_sentence(seg1), clean_sentence(seg2)])
                
    return np.array(total_data)

In [4]:
def row_to_list(row):
    return str(row[2]), str(row[3]), label_w2i[row[1]]

In [None]:
raw_dataset = load_data()
file_with_label = np.unique(raw_dataset[:, :2], axis=0)
_labels, _counts = np.unique(file_with_label[:, 1], return_counts=True)
label_counts = dict(zip(_labels, _counts))

In [None]:
np.random.seed(0)
np.random.shuffle(file_with_label)

valid_list = {label: [] for label in label_counts}
test_list = {label: [] for label in label_counts}

for file in file_with_label:
    if len(valid_list[file[1]]) < label_counts[file[1]] * 0.1:
        valid_list[file[1]].append(file[0])
    elif len(test_list[file[1]]) < label_counts[file[1]] * 0.1:
        test_list[file[1]].append(file[0])
        
_valid_list = []
_test_list = []
for label in label_counts:
    _valid_list.extend(valid_list[label])
    _test_list.extend(test_list[label])
    
valid_list = _valid_list
test_list = _test_list

In [None]:
raw_train_dataset, raw_valid_dataset, raw_test_dataset = [], [], []
for row in raw_dataset:
    if row[0] in valid_list:
        raw_valid_dataset.append(row)
    elif row[0] in test_list:
        raw_test_dataset.append(row)
    else:
        raw_train_dataset.append(row)
raw_train_dataset, raw_valid_dataset, raw_test_dataset = np.array(raw_train_dataset), np.array(raw_valid_dataset), np.array(raw_test_dataset)

In [5]:
npz_file = np.load('raw_train_valid_test.npz')
raw_train_dataset, raw_valid_dataset, raw_test_dataset = npz_file['arr_0'], npz_file['arr_1'], npz_file['arr_2']

In [6]:
label_i2w = np.unique(raw_train_dataset[:, 1]).tolist()
label_w2i = {w: i for i, w in enumerate(label_i2w)}
label_i2w = {i: w for i, w in enumerate(label_i2w)}

In [6]:
label_w2i = pickle.load(open('ver_2.w2i', 'rb'))
label_i2w = {label_w2i[l]: l for l in label_w2i}

In [7]:
train_x, train_y = [str(r[2]) + ' ' + str(r[3]) for r in raw_train_dataset], np.array([label_w2i[r[1]] for r in raw_train_dataset])
valid_x, valid_y = [str(r[2]) + ' ' + str(r[3]) for r in raw_valid_dataset], np.array([label_w2i[r[1]] for r in raw_valid_dataset])
test_x, test_y = [str(r[2]) + ' ' + str(r[3]) for r in raw_test_dataset], np.array([label_w2i[r[1]] for r in raw_test_dataset])

In [8]:
if __name__ == '__main__':
    total_size = len(train_x)
    data = [train_x[int(total_size/os.cpu_count()*i):int(total_size/os.cpu_count()*(i+1))] for i in range(os.cpu_count())]
    token_list = parmap.map(tokenize, data, pm_pbar=True, pm_processes=os.cpu_count())
    
    tokens = []
    for t in token_list:
        tokens.extend(t)
        
    i2w = []
    for s in tqdm(tokens):
        for t in s:
            if t not in i2w:
                i2w.append(t)
    i2w = list(set(i2w))
    i2w.insert(0, '[UNK]')
    seq_len = len(i2w)
    w2i = {t: i for i, t in enumerate(i2w)}
    
    train_x = np.zeros(shape=(len(tokens), seq_len))
    for i, s in enumerate(tokens):
        for t in s:
            train_x[i][w2i[t]] += 1
            
            
    total_size = len(valid_x)
    data = [valid_x[int(total_size/os.cpu_count()*i):int(total_size/os.cpu_count()*(i+1))] for i in range(os.cpu_count())]
    token_list = parmap.map(tokenize, data, pm_pbar=True, pm_processes=os.cpu_count())
    
    tokens = []
    for t in token_list:
        tokens.extend(t)
        
    valid_x = np.zeros(shape=(len(tokens), seq_len))
    for i, s in enumerate(tokens):
        for t in s:
            valid_x[i][w2i[t] if t in w2i else 0] += 1
            
            
    total_size = len(test_x)
    data = [test_x[int(total_size/os.cpu_count()*i):int(total_size/os.cpu_count()*(i+1))] for i in range(os.cpu_count())]
    token_list = parmap.map(tokenize, data, pm_pbar=True, pm_processes=os.cpu_count())
    
    tokens = []
    for t in token_list:
        tokens.extend(t)
        
    test_x = np.zeros(shape=(len(tokens), seq_len))
    for i, s in enumerate(tokens):
        for t in s:
            test_x[i][w2i[t] if t in w2i else 0] += 1

100%|██████████| 20/20 [00:32<00:00,  1.62s/it]
100%|██████████| 18629/18629 [01:23<00:00, 222.00it/s]
100%|██████████| 20/20 [00:19<00:00,  1.02it/s]
100%|██████████| 20/20 [00:16<00:00,  1.25it/s]


In [9]:
np.savez('bow_data.npz', train_x, train_y, valid_x, valid_y, test_x, test_y)

In [10]:
pickle.dump((i2w, w2i), open('bow_dict.pkl', 'wb'))
pickle.dump((label_i2w, label_w2i), open('bow_label.pkl', 'wb'))

In [2]:
i2w, w2i = pickle.load(open('bow_dict.pkl', 'rb'))
label_i2w, label_w2i = pickle.load(open('bow_label.pkl', 'rb'))

In [3]:
bow_data = np.load('bow_data.npz')
train_x, train_y, valid_x, valid_y, test_x, test_y = bow_data['arr_0'], bow_data['arr_1'], bow_data['arr_2'], bow_data['arr_3'], bow_data['arr_4'], bow_data['arr_5']

In [11]:
n_estimators = [100, 125, 150, 175, 
                200, 225, 250, 275, 300]
max_depth = [100, 125, 150, 175, 200]
result = []
for n in n_estimators:
    for d in tqdm(max_depth, file=sys.stdout):
        rf = RandomForestClassifier(
            n_estimators=n, max_depth=d,
            random_state=123456, n_jobs=-1
        )
        rf.fit(train_x, train_y)
        valid_pred = rf.predict(valid_x)
        result.append((
            n, d,
            (valid_pred == valid_y).sum() / len(valid_y), 
            precision_score(valid_y, valid_pred, average='macro'), 
            recall_score(valid_y, valid_pred, average='macro'), 
            f1_score(valid_y, valid_pred, average='macro')
        ))
        tqdm.write(str(['%.04f' %r for r in result[-1]]))

['100.0000', '100.0000', '0.7785', '0.7948', '0.7797', '0.7742']
['100.0000', '125.0000', '0.7846', '0.7984', '0.7880', '0.7823']
['100.0000', '150.0000', '0.7862', '0.8033', '0.7894', '0.7853']
['100.0000', '175.0000', '0.7862', '0.8027', '0.7889', '0.7839']
['100.0000', '200.0000', '0.7834', '0.8005', '0.7868', '0.7819']
100%|██████████| 5/5 [01:12<00:00, 14.55s/it]
['125.0000', '100.0000', '0.7797', '0.7967', '0.7811', '0.7751']
['125.0000', '125.0000', '0.7850', '0.8023', '0.7901', '0.7846']
['125.0000', '150.0000', '0.7797', '0.8033', '0.7845', '0.7807']
['125.0000', '175.0000', '0.7870', '0.8038', '0.7883', '0.7840']
['125.0000', '200.0000', '0.7878', '0.8066', '0.7900', '0.7856']
100%|██████████| 5/5 [01:29<00:00, 17.81s/it]
['150.0000', '100.0000', '0.7858', '0.8034', '0.7883', '0.7825']
['150.0000', '125.0000', '0.7891', '0.8058', '0.7927', '0.7870']
['150.0000', '150.0000', '0.7834', '0.8047', '0.7867', '0.7826']
['150.0000', '175.0000', '0.7874', '0.8050', '0.7894', '0.7846'

KeyboardInterrupt: 

In [12]:
rf = RandomForestClassifier(
    n_estimators=225, max_depth=200,
    random_state=123456, n_jobs=-1
)
rf.fit(train_x, train_y)

RandomForestClassifier(max_depth=200, n_estimators=225, n_jobs=-1,
                       random_state=123456)

In [13]:
pickle.dump(rf, open('rf.pkl', 'wb'))

In [14]:
test_pred = rf.predict(test_x)
test_true = test_y

test_acc = (test_pred == test_true).sum() / len(test_true)
test_precision = [precision_score(test_true, test_pred, labels=[i], average='macro') for i in label_i2w]
test_recall = [recall_score(test_true, test_pred, labels=[i], average='macro') for i in label_i2w]
test_f1 = [f1_score(test_true, test_pred, labels=[i], average='macro') for i in label_i2w]

print('| Label | Precision | Recall | F1 |')
print('|-------|-----------|--------|----|')
for i, (precision, recall, f1) in enumerate(zip(test_precision, test_recall, test_f1)):
    print('| %s | %.4f | %.4f | %.4f |' %(label_i2w[i], precision, recall, f1))
print('| %s | %.4f | %.4f | %.4f |' %(
    'Total', 
    precision_score(test_true, test_pred, average='macro'), 
    recall_score(test_true, test_pred, average='macro'), 
    f1_score(test_true, test_pred, average='macro'), 
))

print('Test Accuracy : %.4f' %test_acc)

| Label | Precision | Recall | F1 |
|-------|-----------|--------|----|
| 가족 | 0.9464 | 0.5000 | 0.6543 |
| 건강/다이어트 | 0.7949 | 0.8455 | 0.8194 |
| 계절/날씨 | 0.8054 | 0.9023 | 0.8511 |
| 꿈(목표) | 0.7647 | 0.6915 | 0.7263 |
| 먹거리 | 0.9049 | 0.9189 | 0.9119 |
| 반려동물 | 0.9015 | 1.0000 | 0.9482 |
| 방송/연예 | 0.8882 | 0.5479 | 0.6777 |
| 선물 | 0.9906 | 0.8898 | 0.9375 |
| 성격 | 0.9483 | 0.8800 | 0.9129 |
| 스포츠/레저 | 0.9504 | 0.7012 | 0.8070 |
| 아르바이트 | 0.7808 | 0.9828 | 0.8702 |
| 여행지(국내/해외) | 0.7477 | 0.9639 | 0.8421 |
| 연애/결혼 | 0.8061 | 0.9568 | 0.8750 |
| 영화 | 0.6473 | 0.9944 | 0.7841 |
| 회사/학교 | 0.7602 | 0.6915 | 0.7242 |
| Total | 0.8425 | 0.8311 | 0.8228 |
Test Accuracy : 0.8239


In [16]:
from xgboost import XGBClassifier

In [18]:
max_depth = [4, 6, 8, 10]
result = []
# for n in n_estimators:
for d in tqdm(max_depth, file=sys.stdout):
    xgb = XGBClassifier(
        objective='multi:softprob', 
        max_depth=d,
        seed=123456,
        random_state=123456,
        mthread=os.cpu_count(),
        use_label_encoder=False
    )
    xgb.fit(train_x, train_y)
    valid_pred = xgb.predict(valid_x)
    result.append((
        d,
        (valid_pred == valid_y).sum() / len(test_true), 
        precision_score(valid_y, valid_pred, average='macro'), 
        recall_score(valid_y, valid_pred, average='macro'), 
        f1_score(valid_y, valid_pred, average='macro')
    ))
    tqdm.write(str(['%.04f' %r for r in result[-1]]))

Parameters: { "mthread" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


['4.0000', '0.8762', '0.8135', '0.8197', '0.8125']
Parameters: { "mthread" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


['6.0000', '0.8713', '0.8125', '0.8188', '0.8114']
Parameters: { "mthread" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


['8.0000', '0.8682', '0.8078', '0.8146', '0

In [19]:
xgb = XGBClassifier(
    objective='multi:softprob', 
    max_depth=4,
    seed=123456,
    random_state=123456,
    mthread=os.cpu_count(),
    use_label_encoder=False
)
xgb.fit(train_x, train_y)

Parameters: { "mthread" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              mthread=20, n_estimators=100, n_jobs=20, num_parallel_tree=1,
              objective='multi:softprob', random_state=123456, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, seed=123456, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [20]:
pickle.dump(xgb, open('xgb.pkl', 'wb'))

In [21]:
test_pred = xgb.predict(test_x)
test_true = test_y

test_acc = (test_pred == test_true).sum() / len(test_true)
test_precision = [precision_score(test_true, test_pred, labels=[i], average='macro') for i in label_i2w]
test_recall = [recall_score(test_true, test_pred, labels=[i], average='macro') for i in label_i2w]
test_f1 = [f1_score(test_true, test_pred, labels=[i], average='macro') for i in label_i2w]

print('| Label | Precision | Recall | F1 |')
print('|-------|-----------|--------|----|')
for i, (precision, recall, f1) in enumerate(zip(test_precision, test_recall, test_f1)):
    print('| %s | %.4f | %.4f | %.4f |' %(label_i2w[i], precision, recall, f1))
print('| %s | %.4f | %.4f | %.4f |' %(
    'Total', 
    precision_score(test_true, test_pred, average='macro'), 
    recall_score(test_true, test_pred, average='macro'), 
    f1_score(test_true, test_pred, average='macro'), 
))

print('Test Accuracy : %.4f' %test_acc)

| Label | Precision | Recall | F1 |
|-------|-----------|--------|----|
| 가족 | 0.7778 | 0.5943 | 0.6738 |
| 건강/다이어트 | 0.7521 | 0.8273 | 0.7879 |
| 계절/날씨 | 0.9008 | 0.8872 | 0.8939 |
| 꿈(목표) | 0.6500 | 0.6915 | 0.6701 |
| 먹거리 | 0.9206 | 0.8958 | 0.9080 |
| 반려동물 | 0.8947 | 1.0000 | 0.9444 |
| 방송/연예 | 0.8876 | 0.5747 | 0.6977 |
| 선물 | 0.9633 | 0.8898 | 0.9251 |
| 성격 | 0.9310 | 0.8640 | 0.8963 |
| 스포츠/레저 | 0.8824 | 0.8232 | 0.8517 |
| 아르바이트 | 0.8467 | 1.0000 | 0.9170 |
| 여행지(국내/해외) | 0.7407 | 0.9639 | 0.8377 |
| 연애/결혼 | 0.7661 | 0.9424 | 0.8452 |
| 영화 | 0.7860 | 0.9441 | 0.8579 |
| 회사/학교 | 0.7688 | 0.7074 | 0.7368 |
| Total | 0.8312 | 0.8404 | 0.8296 |
Test Accuracy : 0.8322


In [25]:
length_count = [[0, 0] for _ in label_i2w]

for i in tqdm(range(len(train_x))):
    length_count[train_y[i]][0] += sum(train_x[i])
    length_count[train_y[i]][1] += 1
    
for i in range(len(valid_x)):
    length_count[valid_y[i]][0] += sum(valid_x[i])
    length_count[valid_y[i]][1] += 1
    
for i in range(len(test_x)):
    length_count[test_y[i]][0] += sum(test_x[i])
    length_count[test_y[i]][1] += 1

100%|██████████| 18629/18629 [00:28<00:00, 664.95it/s]


In [28]:
for i, (l, c) in enumerate(length_count):
#     print('%.3f' %(l//c), label_i2w[i], sep='\t')
    print(l//c, label_i2w[i], sep='\t')

552.0	가족
478.0	건강/다이어트
339.0	계절/날씨
527.0	꿈(목표)
334.0	먹거리
479.0	반려동물
330.0	방송/연예
481.0	선물
468.0	성격
398.0	스포츠/레저
471.0	아르바이트
336.0	여행지(국내/해외)
467.0	연애/결혼
336.0	영화
362.0	회사/학교
