Commit
Merge pull request #1 from upura/livedoor-experiments
Experiments on Livedoor News Corpus
upura committed Apr 8, 2021
2 parents d740541 + 1f54f42 commit fd02b64
Showing 9 changed files with 186 additions and 141 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -7,6 +7,7 @@ data/bert
*.pkl
*.log
wandb
lightning_logs

# Created by https://www.gitignore.io/api/macos,python
# Edit at https://www.gitignore.io/?templates=macos,python
13 changes: 8 additions & 5 deletions examples/text_classification/README.md
@@ -11,8 +11,11 @@ https://github.com/microsoft/nlp-recipes/blob/master/examples/text_classificatio

## Summary

|Notebook|Environment|Description|
|---|---|---|
|[TF-IDF & Logistic Regression](tfidf_logistic_regression.py)|Local| [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) with TF-IDF vectors |
|[TF-IDF & LightGBM](tfidf_lgbm.py)|Local| [LightGBM](https://github.com/microsoft/LightGBM) with TF-IDF vectors |
|[BERT](run_bert.py)|Local| [Transformers BERT](https://github.com/huggingface/transformers) |
|Notebook|Environment|Description|ACC|
|---|---|---|---|
|[TF-IDF & Logistic Regression](tfidf_logistic_regression.py)|Local| [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) with TF-IDF vectors | 0.9308 |
|[TF-IDF & LightGBM](tfidf_lgbm.py)|Local| [LightGBM](https://github.com/microsoft/LightGBM) with TF-IDF vectors | 0.9512 |
|[BERT](run_bert.py) 'cl-tohoku/bert-base-japanese-v2' |Local| [Transformers BERT](https://github.com/huggingface/transformers) | 0.9362 |
|[BERT](run_bert.py) 'cl-tohoku/bert-base-japanese-char-v2' |Local| [Transformers BERT](https://github.com/huggingface/transformers) | 0.9274 |

Accuracy scores (ACC) are calculated by running the code on fold 0 only, with the dataset divided into train/val/test splits at a ratio of 0.6/0.2/0.2.
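
For reference, a minimal sketch of this split protocol (illustrative only, using placeholder data; the actual scripts load the corpus via `load_pandas_df` and stratify on its label column): 20% is held out as the test set, and fold 0 of a 4-fold stratified split of the remaining 80% serves as validation, giving roughly 0.6/0.2/0.2 overall.

```python
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold

# Placeholder data standing in for the Livedoor corpus (9 categories).
rng = np.random.RandomState(0)
X = np.arange(1000).reshape(-1, 1)
y = rng.randint(0, 9, size=1000)

# 80% train+val, 20% test, stratified by label.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0)

# 4-fold stratified CV on the remaining 80%; each validation fold is 20% of the full data.
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
for fold_id, (train_index, valid_index) in enumerate(cv.split(X_trainval, y_trainval)):
    if fold_id == 0:  # the reported ACC uses fold 0 only
        X_tr, X_val = X_trainval[train_index], X_trainval[valid_index]
        y_tr, y_val = y_trainval[train_index], y_trainval[valid_index]
        # train on (X_tr, y_tr), tune on (X_val, y_val), report ACC on (X_test, y_test)
```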
108 changes: 45 additions & 63 deletions examples/text_classification/run_bert.py
@@ -1,21 +1,23 @@
import argparse
import sys

import neologdn
import numpy as np

import pytorch_lightning as pl
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, StratifiedKFold
import torch
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer
from transformers import BertTokenizer

from utils_nlp.common.pytorch_utils import seed_everything
sys.path.append('.')
from utils_nlp.dataset.livedoor import load_pandas_df
from utils_nlp.eval.classification import eval_classification
from utils_nlp.models.nn.datasets import LivedoorDataset
from utils_nlp.models.nn.runner import CustomRunner
from utils_nlp.models.nn.models import BERTClass
from utils_nlp.models.nn.models import PLBertClassifier


def preprocess_data(df):
@@ -32,76 +34,56 @@ def preprocess_data(df):

if __name__ == '__main__':

RUN_NAME = 'bert'
MAX_LEN = 20
seed_everything()
parser = argparse.ArgumentParser()
parser.add_argument('--model_name')
args = parser.parse_args()

MODEL_NAME = args.model_name
MAX_LEN = 300
pl.seed_everything(777)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

df = load_pandas_df(nrows=1000, shuffle=True)
df = load_pandas_df(shuffle=True)
X_train, X_test, y_train, y_test = preprocess_data(df)

tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

test_dataset = LivedoorDataset(X_test, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32, num_workers=4)

y_preds = []
NUM_CLASS = 9
oof_train = np.zeros((len(X_train), NUM_CLASS))
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)

for fold_id, (train_index, valid_index) in enumerate(tqdm(cv.split(X_train, X_train['label']))):

X_tr = X_train.loc[train_index, :].reset_index(drop=True)
X_val = X_train.loc[valid_index, :].reset_index(drop=True)
y_tr = y_train[train_index]
y_val = y_train[valid_index]

train_dataset = LivedoorDataset(X_tr, tokenizer, MAX_LEN)
valid_dataset = LivedoorDataset(X_val, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=32)

loaders = {'train': train_loader, 'valid': valid_loader}
runner = CustomRunner(device=device)

model = BERTClass(NUM_CLASS)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30, eta_min=1e-6)

logdir = f'data/{RUN_NAME}/logdir_{RUN_NAME}/fold{fold_id}'
runner.train(
model=model,
criterion=criterion,
optimizer=optimizer,
scheduler=scheduler,
loaders=loaders,
logdir=logdir,
num_epochs=3,
verbose=True,
)

pred = np.concatenate(list(map(lambda x: x.cpu().numpy(),
runner.predict_loader(
loader=valid_loader,
resume=f'{logdir}/checkpoints/best.pth',
model=model,),)))

oof_train[valid_index] = pred
score = log_loss(y_val, oof_train[valid_index])
print('score', score)

y_pred = np.concatenate(list(map(lambda x: x.cpu().numpy(),
runner.predict_loader(
loader=test_loader,
resume=f'{logdir}/checkpoints/best.pth',
model=model,),)))
y_preds.append(y_pred)

y_preds = np.mean(y_preds, axis=0)
if fold_id == 0:
X_tr = X_train.loc[train_index, :].reset_index(drop=True)
X_val = X_train.loc[valid_index, :].reset_index(drop=True)
y_tr = y_train[train_index]
y_val = y_train[valid_index]

train_dataset = LivedoorDataset(X_tr, tokenizer, MAX_LEN)
valid_dataset = LivedoorDataset(X_val, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16, num_workers=4)
valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=32, num_workers=4)

model = PLBertClassifier(model_name=MODEL_NAME,
num_classes=NUM_CLASS)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
trainer = pl.Trainer(gpus=1, max_epochs=7)
trainer.fit(model, train_loader, valid_loader)
trainer.test(test_dataloaders=test_loader)

y_preds = np.load('data/bert/preds.npy')
print(f'test, log_loss: {log_loss(y_test, y_preds)}')
result_dict = eval_classification(y_test, y_preds.argmax(axis=1))
print(result_dict)
"""
{'accuracy': 0.9362,
'precision': [0.8939, 0.9101, 0.9588, 0.9293, 0.9451, 0.9241, 0.9822, 0.9882, 0.8935],
'recall': [0.9195, 0.931, 0.9422, 0.902, 0.9885, 0.8639, 0.954, 0.9333, 0.9805],
'f1': [0.9065, 0.9205, 0.9504, 0.9154, 0.9663, 0.893, 0.9679, 0.96, 0.935]}
"""
53 changes: 28 additions & 25 deletions examples/text_classification/tfidf_lgbm.py
@@ -1,3 +1,5 @@
import sys

from konoha import WordTokenizer
import lightgbm as lgb
from loguru import logger
@@ -10,6 +12,7 @@
from sklearn.model_selection import train_test_split, StratifiedKFold
from tqdm import tqdm

sys.path.append('.')
from utils_nlp.common.data import Data
from utils_nlp.dataset.livedoor import load_pandas_df
from utils_nlp.eval.classification import eval_classification
@@ -52,7 +55,7 @@ def preprocess_data(df):

if __name__ == '__main__':

df = load_pandas_df(nrows=1000, shuffle=True)
df = load_pandas_df(shuffle=True)
X_train, X_test, y_train, y_test = preprocess_data(df)

RUN_NAME = 'lgbm'
@@ -64,7 +67,7 @@ def preprocess_data(df):
y_preds = []
NUM_CLASS = 9
oof_train = np.zeros((len(X_train), NUM_CLASS))
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)

params = {
'objective': 'multiclass',
@@ -84,40 +87,40 @@
}

for fold_id, (train_index, valid_index) in enumerate(tqdm(cv.split(X_train, y_train))):
if fold_id == 0:
X_tr = X_train.loc[train_index, :]
X_val = X_train.loc[valid_index, :]
y_tr = y_train[train_index]
y_val = y_train[valid_index]

X_tr = X_train.loc[train_index, :]
X_val = X_train.loc[valid_index, :]
y_tr = y_train[train_index]
y_val = y_train[valid_index]

lgb_train = lgb.Dataset(X_tr, y_tr)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)
lgb_train = lgb.Dataset(X_tr, y_tr)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

model = lgb.train(params,
lgb_train,
valid_sets=[lgb_train, lgb_eval],
verbose_eval=10,
num_boost_round=1000,
early_stopping_rounds=10)
model = lgb.train(params,
lgb_train,
valid_sets=[lgb_train, lgb_eval],
verbose_eval=10,
num_boost_round=1000,
early_stopping_rounds=10)

Data.dump(model, f'data/{RUN_NAME}/model_{fold_id}.pkl')
Data.dump(model, f'data/{RUN_NAME}/model_{fold_id}.pkl')

oof_train[valid_index] = model.predict(X_val)
score = log_loss(y_val, oof_train[valid_index])
logger.info(f'fold {fold_id}, log_loss: {score}')
oof_train[valid_index] = model.predict(X_val)
score = log_loss(y_val, oof_train[valid_index])
logger.info(f'fold {fold_id}, log_loss: {score}')

y_pred = model.predict(X_test)
y_preds.append(y_pred)
y_pred = model.predict(X_test)
y_preds.append(y_pred)

y_preds = np.mean(y_preds, axis=0)
logger.info(f'test, log_loss: {log_loss(y_test, y_preds)}')
result_dict = eval_classification(y_test, y_preds.argmax(axis=1))
logger.info(str(result_dict))
"""
{'accuracy': 0.885,
'precision': [0.8889, 0.88, 0.8, 0.9333, 0.8261, 0.8, 1.0, 0.9583, 0.9],
'recall': [0.8889, 0.88, 0.9091, 0.8235, 0.95, 0.6957, 1.0, 0.8846, 0.9474],
'f1': [0.8889, 0.88, 0.8511, 0.875, 0.8837, 0.7442, 1.0, 0.92, 0.9231]}
{'accuracy': 0.9512,
'precision': [0.9253, 0.9714, 0.9713, 0.9348, 0.9286, 0.8786, 1.0, 0.9831, 0.9608],
'recall': [0.9253, 0.977, 0.9769, 0.8431, 0.9713, 0.8994, 1.0, 0.9667, 0.9545],
'f1': [0.9253, 0.9742, 0.9741, 0.8866, 0.9494, 0.8889, 1.0, 0.9748, 0.9577]}
"""

Data.dump(oof_train, f'data/{RUN_NAME}/oof_train.pkl')
41 changes: 22 additions & 19 deletions examples/text_classification/tfidf_logistic_regression.py
@@ -1,3 +1,5 @@
import sys

from konoha import WordTokenizer
from loguru import logger
import neologdn
@@ -10,6 +12,7 @@
from sklearn.model_selection import train_test_split, StratifiedKFold
from tqdm import tqdm

sys.path.append('.')
from utils_nlp.common.data import Data
from utils_nlp.dataset.livedoor import load_pandas_df
from utils_nlp.eval.classification import eval_classification
@@ -52,7 +55,7 @@ def preprocess_data(df):

if __name__ == '__main__':

df = load_pandas_df(nrows=1000, shuffle=True)
df = load_pandas_df(shuffle=True)
X_train, X_test, y_train, y_test = preprocess_data(df)

RUN_NAME = 'logistic_regression'
@@ -64,35 +67,35 @@ def preprocess_data(df):
y_preds = []
NUM_CLASS = 9
oof_train = np.zeros((len(X_train), NUM_CLASS))
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)

for fold_id, (train_index, valid_index) in enumerate(tqdm(cv.split(X_train, y_train))):
if fold_id == 0:
X_tr = X_train.loc[train_index, :]
X_val = X_train.loc[valid_index, :]
y_tr = y_train[train_index]
y_val = y_train[valid_index]

X_tr = X_train.loc[train_index, :]
X_val = X_train.loc[valid_index, :]
y_tr = y_train[train_index]
y_val = y_train[valid_index]

model = LogisticRegression(penalty='l2', solver='sag', random_state=0)
model.fit(X_tr, y_tr)
Data.dump(model, f'data/{RUN_NAME}/model_{fold_id}.pkl')
model = LogisticRegression(penalty='l2', solver='sag', random_state=0)
model.fit(X_tr, y_tr)
Data.dump(model, f'data/{RUN_NAME}/model_{fold_id}.pkl')

oof_train[valid_index] = model.predict_proba(X_val)
score = log_loss(y_val, oof_train[valid_index])
logger.info(f'fold {fold_id}, log_loss: {score}')
oof_train[valid_index] = model.predict_proba(X_val)
score = log_loss(y_val, oof_train[valid_index])
logger.info(f'fold {fold_id}, log_loss: {score}')

y_pred = model.predict_proba(X_test)
y_preds.append(y_pred)
y_pred = model.predict_proba(X_test)
y_preds.append(y_pred)

y_preds = np.mean(y_preds, axis=0)
logger.info(f'test, log_loss: {log_loss(y_test, y_preds)}')
result_dict = eval_classification(y_test, y_preds.argmax(axis=1))
logger.info(str(result_dict))
"""
{'accuracy': 0.9,
'precision': [0.8182, 0.8929, 1.0, 1.0, 0.9, 0.8261, 0.9545, 0.8929, 0.9412],
'recall': [1.0, 1.0, 0.8636, 0.5882, 0.9, 0.8261, 1.0, 0.9615, 0.8421],
'f1': [0.9, 0.9434, 0.9268, 0.7407, 0.9, 0.8261, 0.9767, 0.9259, 0.8889]}
{'accuracy': 0.9308,
'precision': [0.8771, 0.96, 0.9639, 0.9412, 0.9198, 0.8678, 0.9771, 0.9309, 0.9517],
'recall': [0.9023, 0.9655, 0.9249, 0.7843, 0.9885, 0.8935, 0.9828, 0.9722, 0.8961],
'f1': [0.8895, 0.9628, 0.944, 0.8556, 0.9529, 0.8805, 0.9799, 0.9511, 0.9231]}
"""

Data.dump(oof_train, f'data/{RUN_NAME}/oof_train.pkl')
6 changes: 2 additions & 4 deletions requirements.txt
@@ -1,13 +1,11 @@
allennlp==v1.0.0
catalyst==20.6.0
chakin==0.0.8
ja-sentence-segmenter==0.0.2
japanize-matplotlib==1.1.2
konoha[all]==4.6.1
loguru==0.5.1
mecab-python3==0.996.5
mecab-python3==1.0.3
nagisa==0.2.7
neologdn==0.4
oseti==0.2
pykakasi==2.0.1
tensorflow-text==2.3.0
pytorch-lightning==1.2.7
14 changes: 0 additions & 14 deletions utils_nlp/common/pytorch_utils.py

This file was deleted.

7 changes: 4 additions & 3 deletions utils_nlp/models/nn/datasets.py
@@ -19,16 +19,17 @@ def __getitem__(self, index):
None,
add_special_tokens=True,
max_length=self.max_len,
pad_to_max_length=True,
truncation=True,
padding='max_length',
return_token_type_ids=True
)
ids = inputs['input_ids']
mask = inputs['attention_mask']
attention_mask = inputs['attention_mask']
token_type_ids = inputs["token_type_ids"]

return {
'ids': torch.tensor(ids, dtype=torch.long),
'mask': torch.tensor(mask, dtype=torch.long),
'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
'targets': torch.tensor(self.targets[index], dtype=torch.long)
}
