In [None]:
# Install the third-party packages this notebook imports: transformers, nlp and pulp.
# NOTE(review): versions are not pinned — the code below relies on the legacy `nlp`
# package API (e.g. `rename_column_`), so pinning is advisable; confirm working versions.
!pip install transformers nlp pulp

In [None]:
# Change into the project folder so the relative ./data, ./checkpoint and ./output
# paths used by later cells resolve.
# NOTE(review): hard-coded Google Drive path — presumably requires Drive to be mounted
# beforehand; confirm for your environment.
%cd "/content/drive/My Drive/Colab Notebooks/Competition/ProbSpace/Spam mail"

In [None]:
import collections
import os
import random
import re
import time

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import plotly.express as px
import pulp
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm.notebook import tqdm

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    AdamW,
    get_cosine_schedule_with_warmup
)
import nlp

import warnings
warnings.filterwarnings('ignore')

In [None]:
SEED = 44


def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy and PyTorch. When CUDA is available, the CUDA generators are seeded
    too and cuDNN is switched to deterministic, non-benchmark mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Nothing GPU-specific to configure on a CPU-only runtime.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

In [None]:
# Report which GPU the runtime is using; prints nothing on a CPU-only runtime.
if torch.cuda.is_available():
    current_device = torch.cuda.current_device()
    print("Device:", torch.cuda.get_device_name(current_device))

In [None]:
# --- Notebook configuration -------------------------------------------------
# Device that datasets and models are placed on (GPU when available, else CPU).
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Input CSVs, relative to the working directory.
TRAIN_FILE = "./data/train_data.csv"
TEST_FILE = "./data/test_data.csv"
# Directory prefix for per-fold checkpoints and the CV result text file.
MODEL_DIR = "./checkpoint/"
# Hugging Face model identifier used for the tokenizer and the encoder.
MODEL_NAME = 'google/electra-base-discriminator'
# Batch sizes: training loader vs. validation/test loaders.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 128
# Number of output classes of the classifier head.
NUM_CLASS = 2
# Epochs trained per fold.
NUM_EPOCH = 10
# Number of stratified cross-validation folds (and of ensembled models at test time).
NUM_SPLIT = 5
# Required number of test predictions per class, enforced by `optimize`.
# The two counts sum to 24838, the number of submission rows.
# NOTE(review): presumably the known/estimated test label distribution — confirm the source.
TEST_FREQS = [7838, 17000]

In [None]:
def preprocess(text, strip=False):
    """Remove HTML markup from ``text`` and return only its text content.

    The original body passed ``strip=strip`` without ever defining ``strip``,
    so every call raised ``NameError``. ``strip`` is now an optional argument.

    Args:
        text: Raw string that may contain HTML.
        strip: Forwarded to ``BeautifulSoup.get_text``. Defaults to ``False``
            (the library default) so existing whitespace, including the
            "\\r\\n" separator the tokenizer splits on, is left untouched.

    Returns:
        The text extracted by the "html.parser" parser.
    """
    return BeautifulSoup(text, "html.parser").get_text(strip=strip)

In [None]:
def make_dataset(df, tokenizer, device):
    """Turn a DataFrame into a tokenized, torch-formatted ``nlp.Dataset``.

    Args:
        df: DataFrame with a "contents" text column and a "y" label column.
        tokenizer: Callable applied to each row's "contents" value; its
            result is merged into the row by ``Dataset.map``.
        device: Device passed to ``set_format`` for the returned tensors.

    Returns:
        The dataset with "y" renamed to "labels" and formatted to yield
        torch tensors for the input ids, token type ids, attention mask and
        labels.
    """
    tensor_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']

    encoded = nlp.Dataset.from_pandas(df).map(
        lambda row: tokenizer(row["contents"])
    )
    # In-place rename: the training loop pops the target under the key 'labels'.
    encoded.rename_column_('y', 'labels')
    encoded.set_format(type='torch', columns=tensor_columns, device=device)
    return encoded

In [None]:
class Tokenizer:
    """Sentence-pair tokenizer with head+tail truncation to a fixed length.

    The text is split at the first "\\r\\n": the part before it is encoded as
    the first segment and the remainder (separator included) as the second.
    Encodings longer than ``max_length`` keep their first quarter and their
    last three quarters; shorter ones are padded to ``max_length``.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        additional_tokens: Optional extra tokens to add to the vocabulary.
        max_length: Length of every returned sequence; must be even.
    """

    def __init__(self, model_name, additional_tokens=None, max_length=512):
        self.bert_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        # Only touch the vocabulary when there is something to add.
        if additional_tokens:
            self.bert_tokenizer.add_tokens(additional_tokens)
        self.max_length = max_length
        # Validation kept from the original so existing callers see the same check.
        assert self.max_length % 2 == 0

    def __call__(self, text):
        """Encode ``text`` and return the tokenizer output, each field of length ``max_length``.

        Args:
            text: Raw text; expected to contain "\\r\\n" between the two segments.

        Returns:
            The mapping produced by the underlying tokenizer, with every
            field truncated (head + tail) when it exceeds ``max_length``.
        """
        sep_index = text.find("\r\n")
        if sep_index == -1:
            # No separator: encode as a single segment. Slicing with -1 would
            # silently split off the final character as a bogus second segment.
            first, second = text, None
        else:
            first, second = text[:sep_index], text[sep_index:]

        encoded = self.bert_tokenizer(first, second,
                                      padding='max_length', max_length=self.max_length)

        if len(encoded["input_ids"]) > self.max_length:
            # Keep the first quarter and fill the rest from the end, so the
            # result is exactly max_length long for any max_length (the former
            # `3 * (max_length // 4)` tail was short unless max_length % 4 == 0).
            head = self.max_length // 4
            tail = self.max_length - head
            for key in list(encoded.keys()):
                values = encoded[key]
                encoded[key] = values[:head] + values[-tail:]
        return encoded

In [None]:
class Classifier(nn.Module):
    """Transformer encoder with a learned layer mix and a multi-sample dropout head.

    The position-0 vector of every hidden state returned by the encoder is
    combined with softmax-normalised learnable weights; the mixed vector is
    classified by one linear layer, averaged over several dropout samples.

    Args:
        model_name: Identifier passed to ``AutoConfig`` / ``AutoModel``.
        num_classes: Number of output logits.
    """

    def __init__(self, model_name, num_classes=2):
        super().__init__()

        # The encoder must return all hidden states, as a plain tuple.
        config = AutoConfig.from_pretrained(model_name, output_hidden_states=True, return_dict=False)
        self.bert = AutoModel.from_pretrained(model_name, config=config)
        self.dropout = nn.Dropout(0.2)
        self.high_dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

        # One mixing weight per hidden state (num_hidden_layers + 1). All start at
        # -3 except the last, which starts at 0, so the softmax initially favours
        # the last hidden state.
        n_weights = config.num_hidden_layers + 1
        weights_init = torch.zeros(n_weights).float()
        weights_init.data[:-1] = -3
        self.layer_weights = torch.nn.Parameter(weights_init)

        nn.init.normal_(self.linear.weight,
                        mean=0.0,
                        std=config.initializer_range)
        nn.init.zeros_(self.linear.bias)

    def forward(self, **inputs):
        """Return class logits for a batch.

        Args:
            **inputs: Keyword arguments forwarded unchanged to the encoder.

        Returns:
            Logits averaged over 5 independent ``high_dropout`` samples.
        """
        # The encoder is expected to return exactly (last_hidden_state, hidden_states).
        _, hidden_layers = self.bert(**inputs)

        # Stack the position-0 vector of each hidden state along a new last axis.
        output = torch.stack(
            [self.dropout(layer[:, 0, :]) for layer in hidden_layers], dim=2
        )
        # Weighted sum over the layer axis.
        output = (torch.softmax(self.layer_weights, dim=0) * output).sum(-1)

        # Multi-sample dropout: average the head over several dropout masks.
        output = torch.mean(
            torch.stack(
                [self.linear(self.high_dropout(output)) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )
        return output

    def get_grouped_params(self, lr=5e-5, bert_lr=2e-5, bert_lr_decay=0.95):
        """Build optimizer parameter groups with layer-wise learning-rate decay.

        The top encoder layer trains at ``bert_lr``; each layer below it is
        scaled by a further factor of ``bert_lr_decay``, and the embeddings
        share the rate of layer 0. Parameters outside the encoder train at
        ``lr``. The layer count is read from the encoder config instead of
        being fixed at 12, so deeper or shallower encoders get the intended
        schedule; for 12-layer encoders the result is unchanged.

        Args:
            lr: Learning rate for parameters outside the encoder.
            bert_lr: Learning rate of the top encoder layer.
            bert_lr_decay: Multiplicative decay applied per layer going down.

        Returns:
            A list of parameter-group dicts with 'params', 'lr' and
            'weight_decay'. Each encoder component contributes two groups
            (weight decay 0.01, then 0.0 for bias / LayerNorm parameters),
            followed by the same pair for the remaining parameters.
        """
        no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
        num_layers = self.bert.config.num_hidden_layers
        top = num_layers - 1

        # (name fragment, learning rate) per encoder component. The trailing dot
        # in 'layer.{i}.' prevents 'layer.1.' from matching 'layer.11.'.
        bert_layers_lr = [('embeddings', bert_lr * pow(bert_lr_decay, top))] + \
                         [(f'layer.{i}.', bert_lr * pow(bert_lr_decay, top - i))
                          for i in range(num_layers)]
        bert_layers_name = [fragment for fragment, _ in bert_layers_lr]

        named_params = list(self.named_parameters())

        def skips_decay(name):
            return any(nd in name for nd in no_decay)

        def in_encoder(name):
            return any(fragment in name for fragment in bert_layers_name)

        bert_params = []
        for fragment, layer_lr in bert_layers_lr:
            bert_params.append(
                {
                    'params': [p for n, p in named_params
                               if not skips_decay(n) and fragment in n],
                    'lr': layer_lr, 'weight_decay': 0.01
                }
            )
            bert_params.append(
                {
                    'params': [p for n, p in named_params
                               if skips_decay(n) and fragment in n],
                    'lr': layer_lr, 'weight_decay': 0.0
                }
            )

        other_params = [
            {'params': [p for n, p in named_params
                        if not skips_decay(n) and not in_encoder(n)],
             'weight_decay': 0.01, 'lr': lr},
            {'params': [p for n, p in named_params
                        if skips_decay(n) and not in_encoder(n)],
             'weight_decay': 0.0, 'lr': lr},
        ]

        return bert_params + other_params

In [None]:
class ClassBalancedLoss(nn.CrossEntropyLoss):
    """Cross-entropy loss with per-class weights ``(1 - beta) / (1 - beta ** freq)``.

    Args:
        beta: Hyper-parameter strictly between 0 and 1.
        freq_per_class: Iterable with the number of samples of each class.
        device: Device for the weight tensor. ``None`` (default) selects CUDA
            when available and CPU otherwise; the former hard-coded "cuda"
            default raised on CPU-only machines.

    Raises:
        AssertionError: If ``beta`` is not in the open interval (0, 1).
    """

    def __init__(self, beta, freq_per_class, device=None):
        super().__init__()

        assert beta > 0 and beta < 1
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        weight = [(1-beta)/(1-beta**freq) for freq in freq_per_class]
        # Stored in the `weight` slot that nn.CrossEntropyLoss uses for class weights.
        self.weight = torch.tensor(weight, device=device, requires_grad=False)

In [None]:
class Trainer:
    """Train/validate loop that checkpoints the best model by F1 or loss.

    Args:
        model: Module called as ``model(**inputs)`` and returning logits.
        train_dataloader: Iterable of dict batches containing a 'labels' key.
        valid_dataloader: Same format, used for validation.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Optional LR scheduler stepped once per training batch.
            When omitted, a constant-rate ``StepLR`` placeholder is used.
        num_epoch: Number of epochs run by ``train``.
        model_name: Checkpoint path without the ".pth" extension.
    """

    def __init__(self,
                 model,
                 train_dataloader,
                 valid_dataloader,
                 criterion,
                 optimizer,
                 scheduler=None,
                 num_epoch=10,
                 model_name='./models/best_model'):

        self.model = model
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.criterion = criterion
        self.optimizer = optimizer
        if scheduler is not None:
            self.scheduler = scheduler
        else:
            # gamma=1.0 keeps the learning rate constant, so the training loop
            # can call scheduler.step() unconditionally.
            self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                       step_size=1e+10,
                                                       gamma=1.0)
        self.num_epoch = num_epoch
        self.model_name = model_name

    def _run_epoch(self, dataloader, epoch, training):
        """Run one pass over ``dataloader`` and return ``(mean_loss, f1)``.

        Shared by ``_train_step`` and ``_eval_step``. The F1 score is
        recomputed after every batch on all labels/predictions seen so far so
        the progress bar shows a running value; the final value covers the
        whole pass.
        """
        if training:
            self.model.train()
        else:
            self.model.eval()
        phase = "Train" if training else "Valid"

        total_loss = 0
        all_labels = np.array([])
        all_preds = np.array([])
        f1 = 0.0

        progress = tqdm(dataloader, total=len(dataloader))

        # Gradients are tracked only while training.
        with torch.set_grad_enabled(training):
            for i, batch in enumerate(progress):
                progress.set_description(f"<{phase}> Epoch{epoch+1}")

                # The remaining keys of the batch are the model inputs.
                labels = batch.pop('labels')
                inputs = batch

                if training:
                    self.optimizer.zero_grad()

                output = self.model(**inputs)
                loss = self.criterion(output, labels)
                pred = torch.argmax(output, dim=1)

                if training:
                    loss.backward()
                    self.optimizer.step()
                    # The scheduler advances per batch, not per epoch.
                    self.scheduler.step()

                total_loss += loss.item()
                all_labels = np.r_[all_labels, labels.to('cpu').detach().numpy()]
                all_preds = np.r_[all_preds, pred.to('cpu').detach().numpy()]
                f1 = f1_score(all_labels, all_preds)

                progress.set_postfix(loss=total_loss/(i+1), f1=f1)

        mean_loss = total_loss / len(dataloader)
        return mean_loss, f1

    def _train_step(self, epoch):
        """Train for one epoch; returns ``(train_loss, train_f1)``."""
        return self._run_epoch(self.train_dataloader, epoch, training=True)

    def _eval_step(self, epoch):
        """Evaluate on the validation set; returns ``(valid_loss, valid_f1)``."""
        return self._run_epoch(self.valid_dataloader, epoch, training=False)

    def train(self, metric='f1'):
        """Run all epochs, saving the model whenever the validation metric improves.

        Args:
            metric: 'f1' (higher is better) or 'loss' (lower is better).

        Returns:
            The best validation metric observed (0 for 'f1' / ``np.inf`` for
            'loss' when no epoch improved on the starting value).

        Raises:
            RuntimeError: If ``metric`` is neither 'f1' nor 'loss'.
        """
        if metric == 'f1':
            best_metric = 0
        elif metric == 'loss':
            best_metric = np.inf
        else:
            raise RuntimeError(f"unsupported metric {metric!r}; expected 'f1' or 'loss'")

        checkpoint_path = f"{self.model_name}.pth"
        checkpoint_dir = os.path.dirname(checkpoint_path)

        for epoch in range(self.num_epoch):
            train_loss, train_f1 = self._train_step(epoch)
            valid_loss, valid_f1 = self._eval_step(epoch)
            print(f'Loss: {valid_loss}  f1: {valid_f1}', end='  ')

            if metric == 'f1':
                current = valid_f1
                improved = current > best_metric
            else:
                current = valid_loss
                improved = current < best_metric

            if improved:
                best_metric = current
                print('model saving!', end='')
                # torch.save does not create missing directories.
                if checkpoint_dir:
                    os.makedirs(checkpoint_dir, exist_ok=True)
                torch.save(self.model.state_dict(), checkpoint_path)
            print('\n\n')

        return best_metric

In [None]:
# Build the tokenizer wrapper (default max_length) and load the labelled training data.
tokenizer = Tokenizer(MODEL_NAME)
# Alternative kept for reference: extend the vocabulary with extra tokens.
# additional_tokens = [' enron ', ' ect ', ' hou ']
# tokenizer = Tokenizer(MODEL_NAME, additional_tokens)
train_valid_df = pd.read_csv(TRAIN_FILE)

In [None]:
# Stratified K-fold training: one freshly initialised model per fold, with the best
# validation F1 of each fold collected in `f1_scores`.
f1_scores = []
skfolds = StratifiedKFold(n_splits=NUM_SPLIT, shuffle=True, random_state=SEED)
for fold, (train_index, valid_index) in enumerate(skfolds.split(train_valid_df, train_valid_df['y'])):
    print(f"fold {fold}", "="*80)

    train_df = train_valid_df.iloc[train_index].reset_index(drop=True)
    valid_df = train_valid_df.iloc[valid_index].reset_index(drop=True)

    # Tokenisation is redone for every fold.
    train_dataset = make_dataset(train_df, tokenizer, DEVICE)
    valid_dataset = make_dataset(valid_df, tokenizer, DEVICE)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False
    )

    model = Classifier(MODEL_NAME, num_classes=NUM_CLASS)
    # Keep the embedding matrix in sync with the tokenizer vocabulary size.
    model.bert.resize_token_embeddings(len(tokenizer.bert_tokenizer))
    model = model.to(DEVICE)
    grouped_params = model.get_grouped_params(lr=5e-5, bert_lr=2e-5, bert_lr_decay=0.95)

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(grouped_params)
    # NOTE(review): the schedule is sized for 15 epochs of steps while only NUM_EPOCH (10)
    # are trained, so the cosine decay never reaches its end — presumably intentional; confirm.
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps = 15*len(train_dataloader))
    
    # Checkpoint path prefix for this fold; Trainer appends ".pth".
    model_name = MODEL_DIR + MODEL_NAME.replace('/', '_') + '_' + str(fold)
    trainer = Trainer(model,
                      train_dataloader,
                      valid_dataloader,
                      criterion,
                      optimizer,
                      scheduler=scheduler,
                      num_epoch=NUM_EPOCH,
                      model_name=model_name)
    
    f1 = trainer.train(metric='f1')
    f1_scores.append(f1)
    print(f"<fold={fold}> best score: {f1}\n")

In [None]:
# CV score = unweighted mean of the per-fold best validation F1 scores.
cv = sum(f1_scores) / len(f1_scores)
for i, f1 in enumerate(f1_scores):
    print(f"fold{i}: {f1}")
print(f"CV:    {cv}")

In [None]:
# Persist the per-fold scores and the CV mean next to the checkpoints.
lines = ""
for i, f1 in enumerate(f1_scores):
    line = f"fold={i}: {f1}\n"
    lines += line
lines += f"CV    : {cv}"
# The file name is derived from the model id, with '/' replaced by '_'.
with open(f"{MODEL_DIR}{MODEL_NAME.replace('/', '_')}_result.txt", mode='w') as f:
    f.write(lines)

In [None]:
# Rebuild one classifier per fold and load its best checkpoint for ensembling.
models = []
for fold in tqdm(range(NUM_SPLIT)):
    # Same construction as in the training cell so the state dict matches.
    model = Classifier(MODEL_NAME, num_classes=NUM_CLASS)
    model.bert.resize_token_embeddings(len(tokenizer.bert_tokenizer))
    checkpoint_path = f'{MODEL_DIR}{MODEL_NAME.replace("/", "_")}_{fold}.pth'
    # map_location lets checkpoints saved on a GPU load on a CPU-only runtime
    # (and vice versa) instead of failing while deserialising.
    model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
    model.to(DEVICE)
    model.eval()
    models.append(model)

In [None]:
# Load the test set and wrap it in the same dataset/dataloader pipeline as training.
test_df = pd.read_csv(TEST_FILE)
# Dummy label: make_dataset requires a 'y' column; the value is discarded at inference.
test_df['y'] = -1
test_dataset = make_dataset(test_df, tokenizer, DEVICE)
# shuffle=False keeps predictions aligned with the row order of test_df.
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

In [None]:
# 制約付き対数尤度最大化問題を解く
def optimize(prob, class_counts=None):
    """Assign labels by maximising log-likelihood under fixed per-class counts.

    Solves a binary program: each sample gets exactly one label, class ``k``
    is assigned exactly ``class_counts[k]`` times, and the sum of the log
    probabilities of the chosen labels is maximised.

    Args:
        prob: Array of shape (N, K) with class probabilities per sample.
        class_counts: Required number of predictions per class. Defaults to
            the module-level ``TEST_FREQS``.

    Returns:
        Integer array of length N with the chosen class index per sample.

    Raises:
        AssertionError: If the solver does not report an optimal solution
            (e.g. the counts do not sum to N).
    """
    if class_counts is None:
        class_counts = TEST_FREQS

    # Small epsilon avoids log(0) for probabilities that are exactly zero.
    logp = np.log(prob + 1e-16)
    N = prob.shape[0]
    K = prob.shape[1]

    m = pulp.LpProblem('Problem', pulp.LpMaximize)  # maximisation problem

    # Decision variables (= submitted labels): x[(i, j)] == 1 iff sample i gets class j.
    x = pulp.LpVariable.dicts('x', [(i, j) for i in range(N) for j in range(K)], 0, 1, pulp.LpBinary)

    # Objective: log-likelihood of the selected labels.
    log_likelihood = pulp.lpSum([x[(i, j)] * logp[i, j] for i in range(N) for j in range(K)])
    m += log_likelihood

    # Each sample is assigned exactly one class.
    for i in range(N):
        m += pulp.lpSum([x[(i, k)] for k in range(K)]) == 1

    # Each class is assigned exactly its required number of samples.
    for k in range(K):
        m += pulp.lpSum([x[(i, k)] for i in range(N)]) == class_counts[k]

    m.solve()

    # Status 1 means "Optimal"; anything else means no usable solution was found.
    assert m.status == 1, f"solver status: {pulp.LpStatus[m.status]}"

    # Solver values are floats and may come back as e.g. 0.9999999; round before the
    # integer cast so such a value is not truncated to 0.
    x_ast = np.array([[int(round(x[(i, j)].value())) for j in range(K)] for i in range(N)])
    return x_ast.argmax(axis=1)  # one-hot -> class index

In [None]:
def postprocess(final_output, test_df):
    """Force the prediction for mails with no subject and no body to class 1.

    Rows of ``test_df`` whose 'contents' is exactly 'Subject: \\r\\n' get the
    probability row ``[0.0, 1.0]``. ``final_output`` is modified in place and
    also returned.

    Args:
        final_output: Array of shape (N, 2) with class probabilities.
        test_df: DataFrame with N rows and a 'contents' column.

    Returns:
        The same ``final_output`` array, after the override.

    Raises:
        AssertionError: If the two inputs differ in number of rows.
    """
    n_predictions, n_rows = final_output.shape[0], test_df.shape[0]
    assert n_predictions == n_rows

    empty_mask = (test_df['contents'] == 'Subject: \r\n').to_numpy()
    final_output[empty_mask] = np.array([0.0, 1.0])
    return final_output

In [None]:
# Ensemble inference: average the logits of all fold models, convert them to
# probabilities, then post-process and solve for the final labels.
with torch.no_grad():
    progress = tqdm(test_dataloader, total=len(test_dataloader))
    # Empty accumulator; reshaped to two columns on every append below.
    final_output = np.empty([0,0])

    for batch in progress:
        progress.set_description("<Test>")

        # Drop the dummy label column; the rest of the batch is the model input.
        _ = batch.pop('labels')
        inputs = batch

        outputs = []
        for model in models:
            output = model(**inputs)
            outputs.append(output)

        # Mean of the logits over models, then softmax over the class axis.
        outputs = sum(outputs) / len(outputs)
        outputs = torch.softmax(outputs, dim=1).cpu().detach().numpy()
        # outputs = np.argmax(outputs, axis=1)

        # np.append flattens its inputs, so restore the (rows, 2) layout each time.
        final_output = np.append(final_output, outputs).reshape((-1,2))

    # Override rows for empty mails, then pick labels under the class-count constraints.
    final_output = postprocess(final_output, test_df)
    final_output = optimize(final_output)

In [None]:
# Build the submission table: 1-based ids paired with the predicted labels.
submit = pd.DataFrame(columns=['id', 'y'])
# Derive the id range from the predictions instead of a hard-coded row count,
# so the cell keeps working if the test set size changes.
submit['id'] = range(1, len(final_output)+1)
submit['y'] = final_output
try:
    submit.to_csv(f"./output/submission_cv{cv:.10f}_optimized.csv", index=False)
except NameError:
    # `cv` only exists when the training cells ran in this kernel session.
    submit.to_csv("./output/submission_optimized.csv", index=False)
submit

In [None]:
# Pie chart of the predicted label distribution in the submission.
fig = px.pie(submit, names='y')
fig.show()