In [None]:
!pip install transformers nlp
!pip install pulp

In [None]:
%cd "/content/drive/My Drive/Colab Notebooks/Competition/ProbSpace/Spam mail"

In [None]:
import collections
import os
import random
import re
import time

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import plotly.express as px
import pulp
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm.notebook import tqdm

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    AdamW,
    get_cosine_schedule_with_warmup
)
import nlp

import warnings
warnings.filterwarnings('ignore')

In [None]:
SEED = 42


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch, and exports the seed
    through the ``PYTHONHASHSEED`` environment variable. When CUDA is
    available the CUDA generators are seeded as well and cuDNN is switched to
    deterministic, non-benchmarking mode.

    Args:
        seed: Seed value applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing GPU-specific to configure on a CPU-only runtime.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

In [None]:
# Report the GPU in use; prints nothing on a CPU-only runtime.
if torch.cuda.is_available():
    current_device = torch.cuda.current_device()
    device_name = torch.cuda.get_device_name(current_device)
    print("Device:", device_name)

In [None]:
# ---- Runtime and data locations ----
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
TRAIN_FILE = "./data/train_data.csv"
TEST_FILE = "./data/test_data.csv"

# ---- Ensemble members ----
# Every backbone appears once per training variant; MODEL_NAMES[i] is the
# pretrained identifier for the checkpoint directory MODEL_PATHS[i].
_BACKBONES = ('bert-base-uncased', 'roberta-base', 'google/electra-base-discriminator')
_VARIANTS = ('baseline', 'custom-tokenizer')
MODEL_PATHS = [f"./checkpoint/{backbone.replace('/', '_')}-{variant}/"
               for variant in _VARIANTS
               for backbone in _BACKBONES]
MODEL_NAMES = [backbone for _ in _VARIANTS for backbone in _BACKBONES]

# ---- Hyper-parameters ----
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 128
NUM_CLASSES = 2
EPOCHS = 10
NUM_SPLITS = 5

# Number of test rows that must be assigned to class 0 and class 1 by the
# constrained label assignment (`hack`).
TEST_FREQS = [7838, 17000]

In [None]:
def make_dataset(df, tokenizer, device):
    """Tokenize the ``contents`` column of ``df`` into a torch-formatted dataset.

    Args:
        df: DataFrame with a ``contents`` text column.
        tokenizer: Callable applied to each example's ``contents`` string.
        device: Device passed to ``set_format`` for the produced tensors.

    Returns:
        The mapped ``nlp.Dataset`` exposing ``input_ids`` and
        ``attention_mask`` (plus ``token_type_ids`` when the tokenizer
        produced that feature) as torch tensors.
    """
    encoded = nlp.Dataset.from_pandas(df).map(
        lambda example: tokenizer(example["contents"])
    )

    # Not every tokenizer emits segment ids, so expose them only if present.
    tensor_columns = ['input_ids', 'attention_mask']
    if 'token_type_ids' in encoded.features:
        tensor_columns.insert(1, 'token_type_ids')

    encoded.set_format(type='torch', columns=tensor_columns, device=device)
    return encoded

In [None]:
class Tokenizer:
    """Encode one e-mail as a (first line, remainder) sentence pair.

    The text is split at the first ``"\\r\\n"``; everything before it is the
    first segment and everything from the separator onwards is the second.
    Encodings are padded to ``max_length``; encodings that come out longer are
    shortened by keeping the first quarter and the trailing remainder, so the
    result is always exactly ``max_length`` items long.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        additional_tokens: Optional tokens added to the vocabulary.
        max_length: Target length of every encoding (must be even).
    """

    def __init__(self, model_name, additional_tokens=None, max_length=512):
        self.bert_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        # Only touch the vocabulary when extra tokens were actually supplied.
        if additional_tokens:
            self.bert_tokenizer.add_tokens(additional_tokens)
        self.max_length = max_length
        assert self.max_length % 2 == 0

    def __call__(self, text):
        """Return the padded / head+tail-truncated encoding of ``text``."""
        sep_index = text.find("\r\n")
        if sep_index == -1:
            # No separator: the whole text is the first segment. Without this
            # guard the -1 index would split off the final character instead.
            sep_index = len(text)

        encoded = self.bert_tokenizer(text[:sep_index], text[sep_index:],
                                      padding='max_length', max_length=self.max_length)

        if len(encoded["input_ids"]) > self.max_length:
            # The tail takes whatever the head leaves so the two parts always
            # add up to max_length, even when it is not a multiple of four.
            head = self.max_length // 4
            tail = self.max_length - head
            for key, values in list(encoded.items()):
                encoded[key] = values[:head] + values[-tail:]
        return encoded

In [None]:
class Classifier(nn.Module):
    """Transformer encoder with a learned layer mix and an averaged dropout head.

    One token vector (first or last position, chosen by the config's
    ``summary_type``) is taken from every hidden state returned by the
    encoder. The vectors are combined with softmax-normalised learnable
    weights and classified by a linear layer whose output is averaged over
    five dropout passes.

    Args:
        model_name: Identifier used to load the encoder configuration.
        num_classes: Size of the output layer.
    """

    def __init__(self, model_name, num_classes=2):
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        self.bert = AutoModel.from_config(config=self.config)
        self.dropout = nn.Dropout(0.2)

        # One mixing logit per hidden state (presumably the embedding output
        # plus one per layer - confirm against the encoder). All start at -3
        # except the last, which starts at 0 and so dominates the softmax.
        num_states = self.config.num_hidden_layers + 1
        mix_logits = torch.full((num_states,), -3.0, dtype=torch.float)
        mix_logits[-1] = 0.0
        self.layer_weights = torch.nn.Parameter(mix_logits)

        self.high_dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.config.hidden_size, num_classes)

        nn.init.normal_(self.linear.weight, mean=0.0, std=self.config.initializer_range)
        nn.init.zeros_(self.linear.bias)

    def forward(self, **inputs):
        """Return class logits for the encoder keyword inputs.

        Raises:
            Exception: If the config's ``summary_type`` is neither ``'first'``
                nor ``'last'``.
        """
        hidden_states = self.bert(**inputs).hidden_states

        # Configs without a summary_type are treated like 'first'.
        summary_type = getattr(self.config, 'summary_type', 'first')
        if summary_type == 'first':
            token_index = 0
        elif summary_type == 'last':
            token_index = -1
        else:
            raise Exception('invalid summary_type')

        per_state = [self.dropout(state[:, token_index, :]) for state in hidden_states]
        stacked = torch.stack(per_state, dim=2)
        mixed = (stacked * torch.softmax(self.layer_weights, dim=0)).sum(-1)

        # Average the head's output over five independent dropout masks.
        logit_samples = []
        for _ in range(5):
            logit_samples.append(self.linear(self.high_dropout(mixed)))
        return torch.stack(logit_samples, dim=0).mean(dim=0)

In [None]:
def get_models(model_name, model_path, tokenizer, num_splits=5):
    """Load the per-fold fine-tuned classifiers of one ensemble member.

    Args:
        model_name: Pretrained identifier used to build each ``Classifier``;
            with ``/`` replaced by ``_`` it is also the checkpoint file prefix.
        model_path: Directory holding ``<prefix>_<fold>.pth`` checkpoints. If
            it contains ``"custom-tokenizer"`` the encoder's token embeddings
            are resized to the tokenizer's vocabulary before loading.
        tokenizer: Wrapper exposing ``bert_tokenizer``; only consulted for
            custom-tokenizer checkpoints.
        num_splits: Number of fold checkpoints to load.

    Returns:
        List of ``num_splits`` models on ``DEVICE`` in evaluation mode.
    """
    checkpoint_prefix = model_name.replace('/', '_')
    models = []
    for fold in tqdm(range(num_splits)):
        model = Classifier(model_name, NUM_CLASSES)
        if "custom-tokenizer" in model_path:
            model.bert.resize_token_embeddings(len(tokenizer.bert_tokenizer))

        checkpoint_file = os.path.join(model_path, f"{checkpoint_prefix}_{fold}.pth")
        # Deserialize onto the CPU first so checkpoints saved from a GPU also
        # load on a CPU-only runtime; the model is moved to DEVICE afterwards.
        state_dict = torch.load(checkpoint_file, map_location="cpu")
        model.load_state_dict(state_dict)

        model.to(DEVICE)
        model.eval()
        models.append(model)
    return models

In [None]:
# Solve a constrained log-likelihood maximisation problem.
def hack(prob, freqs=None):
    """Assign hard labels that maximise log-likelihood under fixed class counts.

    Builds a binary program with one indicator per (row, class): every row
    receives exactly one class and class ``k`` is assigned to exactly
    ``freqs[k]`` rows, maximising the summed log-probability of the chosen
    classes.

    Args:
        prob: Array of shape (N, K) with per-class probabilities.
        freqs: Required number of rows per class. Defaults to the
            module-level ``TEST_FREQS``.

    Returns:
        Integer array of length N holding the assigned class index per row.

    Raises:
        AssertionError: If the class counts do not add up to N or the solver
            does not report an optimal solution.
    """
    if freqs is None:
        freqs = TEST_FREQS

    logp = np.log(prob + 1e-16)  # epsilon guards against log(0)
    N, K = prob.shape

    # Each row gets exactly one label, so the per-class counts must total N;
    # fail fast instead of handing the solver an infeasible problem.
    assert sum(freqs[:K]) == N, "class frequencies must sum to the number of rows"

    m = pulp.LpProblem('Problem', pulp.LpMaximize)  # maximisation problem

    # Decision variables (= the submitted labels, one-hot per row).
    x = pulp.LpVariable.dicts('x', [(i, j) for i in range(N) for j in range(K)], 0, 1, pulp.LpBinary)

    # Objective: log-likelihood of the selected labels.
    log_likelihood = pulp.lpSum([x[(i, j)] * logp[i, j] for i in range(N) for j in range(K)])
    m += log_likelihood

    # Each row is assigned exactly one class (i.e. SOS1).
    for i in range(N):
        m += pulp.lpSum([x[(i, k)] for k in range(K)]) == 1

    # Each class is assigned to exactly the requested number of rows.
    for k in range(K):
        m += pulp.lpSum([x[(i, k)] for i in range(N)]) == freqs[k]

    m.solve()

    assert m.status == 1  # optimal; fails when no feasible solution was found

    # Round before casting: a solver may report a binary as e.g. 0.9999999,
    # which plain int() would truncate to 0 and corrupt the one-hot row.
    x_ast = np.array([[int(round(x[(i, j)].value())) for j in range(K)] for i in range(N)])
    return x_ast.argmax(axis=1)  # one-hot -> class index

In [None]:
def postprocess(final_output, test_df):
    """Force rows whose e-mail is only an empty subject line to class 1.

    Rows of ``test_df`` whose ``contents`` equals ``'Subject: \\r\\n'`` have
    their probabilities overwritten with ``[0.0, 1.0]``.

    Args:
        final_output: Array with one row of class probabilities per test row;
            modified in place.
        test_df: Test DataFrame with a ``contents`` column, row-aligned with
            ``final_output``.

    Returns:
        The same ``final_output`` array after the overwrite.

    Raises:
        AssertionError: If the two inputs have different row counts.
    """
    n_predictions, n_rows = final_output.shape[0], test_df.shape[0]
    assert n_predictions == n_rows

    empty_rows = test_df['contents'].eq('Subject: \r\n')
    final_output[empty_rows] = np.array([0.0, 1.0])
    return final_output

In [None]:
test_df = pd.read_csv(TEST_FILE)

# Vocabulary extensions used by the "custom-tokenizer" checkpoints.
additional_tokens = [' enron ', ' ect ', ' hou ']

# Collect each batch's flattened logits and concatenate once at the end;
# appending to a growing array would recopy everything on every batch.
# The empty float64 seed keeps the result float64 and valid with no batches.
output_chunks = [np.empty([0])]

with torch.no_grad():
    for model_name, model_path in tqdm(zip(MODEL_NAMES, MODEL_PATHS), total=len(MODEL_PATHS)):
        if "custom-tokenizer" in model_path:
            tokenizer = Tokenizer(model_name, additional_tokens)
        else:
            tokenizer = Tokenizer(model_name)
        models = get_models(model_name, model_path, tokenizer)
        dataset = make_dataset(test_df, tokenizer, DEVICE)
        test_dataloader = torch.utils.data.DataLoader(
            dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

        # Every fold model scores the full test set, batch by batch, in order.
        for model in tqdm(models, total=len(models)):
            for inputs in test_dataloader:
                output = model(**inputs)
                output = output.cpu().detach().numpy()
                output_chunks.append(output.ravel())

# Flat array ordered as (model, fold, test row, class).
final_output = np.concatenate(output_chunks)

In [None]:
# Regroup the flat logits as (fold model, test row, class) and check that
# every ensemble member contributed all of its folds.
num_test_rows = len(test_dataloader.dataset)
logits_per_model = final_output.reshape(-1, num_test_rows, NUM_CLASSES)
assert logits_per_model.shape[0] == len(MODEL_NAMES) * NUM_SPLITS

# Average the logits across all fold models, then turn them into probabilities.
mean_logits = torch.from_numpy(np.mean(logits_per_model, axis=0).astype(np.float32))
probabilities = torch.softmax(mean_logits, dim=1).cpu().detach().numpy()

# Apply the empty-mail rule, then the count-constrained label assignment.
final_output = hack(postprocess(probabilities, test_df))

In [None]:
# Build the submission table: 1-based ids in test order plus the assigned label.
# The id range follows the number of predictions instead of a hard-coded count.
os.makedirs("./output", exist_ok=True)  # to_csv does not create directories
submit = pd.DataFrame({
    'id': range(1, len(final_output) + 1),
    'y': final_output,
})
submit.to_csv("./output/submission_ensemble.csv", index=False)
submit.head()

In [None]:
# Pie chart of the label distribution in the submission (column 'y').
fig = px.pie(submit, names='y')
fig.show()