In [None]:
from logging.config import dictConfig
import logging

# Configure the root logger: every record at DEBUG level or above is appended
# to debug.log, formatted as "[timestamp] message".
dictConfig({
    'version': 1,
    'formatters': {
        'default': {
            'format': '[%(asctime)s] %(message)s',
        }
    },
    'handlers': {
        'file': {
            'level': 'DEBUG',
            'class': 'logging.FileHandler',
            'filename': 'debug.log',
            'formatter': 'default',
            # The notebook logs non-ASCII (Korean) messages; pin the file
            # encoding so the log does not depend on the platform locale.
            'encoding': 'utf-8',
        },
    },
    'root': {
        'level': 'DEBUG',
        'handlers': ['file']
    }
})


def myfunc():
    """Emit a single DEBUG-level record through the module-level logging API."""
    start_message = "함수가 시작되었습니다."
    logging.debug(start_message)


# Call once so a record is emitted as soon as the cell runs.
myfunc()

In [None]:
# Install the packages this notebook imports into the current runtime.
# Only transformers is pinned (3.0.2); every other package installs whatever
# version pip resolves at run time, so results may differ between runs.
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3.0.2
!pip install torch

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from gluonnlp.data import SentencepieceTokenizer

In [None]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs on a runtime without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Mount Google Drive into the Colab runtime at /content/MyDrive
from google.colab import drive
drive.mount('/content/MyDrive')

In [None]:
# Load the preprocessed train and test CSV files from the mounted Drive folder
import pandas as pd
train_preprocessed = pd.read_csv("/content/MyDrive/MyDrive/test/train_preprocessed_ver2_no_lower_no_ques.csv")
test_preprocessed = pd.read_csv("/content/MyDrive/MyDrive/test/test_preprocessed_ver2_no_lower_no_ques.csv")

In [None]:
import random

random_seed = 0

# Seed Python's RNG, NumPy's global RNG and PyTorch (CPU, current GPU, all GPUs)
# with the same value.
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(random_seed)

# Request deterministic cuDNN behaviour and switch off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

Over Sampling

In [None]:
# Oversample rare classes: every row whose label occurs fewer than 20 times is
# added two more times, so those labels end up with three times as many rows.
# NOTE: this cell reassigns train_preprocessed, so re-running it without
# reloading the CSV oversamples the (still) rare labels again.
label_counts = train_preprocessed["label"].value_counts()
small_label = label_counts[label_counts < 20].index
mask = train_preprocessed['label'].isin(small_label)
smalls = train_preprocessed[mask].reset_index(drop=True)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index=True gives the result a fresh, unique 0..n-1 index; the
# original reset_index(drop=True) call computed that but discarded the result.
train_preprocessed = pd.concat(
    [train_preprocessed, smalls, smalls],  # original + two extra copies
    ignore_index=True,
)
train_preprocessed.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit an encoder on the training labels and store the resulting integer codes
# next to the original label strings.
label_column = train_preprocessed["label"]
le = LabelEncoder().fit(label_column)

label = le.transform(label_column)
train_preprocessed["encoded_label"] = label

In [None]:
import warnings
# Suppress every warning category from here on (this also hides deprecation notices)
warnings.filterwarnings('ignore')

In [None]:
# Display the distinct label values the encoder was fitted on
le.classes_

## 최종 5개 앙상블

In [None]:
# Saved prediction arrays, one .npy file per fold.
# Per the file names, folds 0-3 come from epoch 7 and fold 4 from epoch 6.
fold_prediction_paths = [
    "/content/MyDrive/MyDrive/test/final_fold0_100_predict_epoch7.npy",
    "/content/MyDrive/MyDrive/test/final_fold1_100_predict_epoch7.npy",
    "/content/MyDrive/MyDrive/test/final_kfold2_100만개_epoch7.npy",
    "/content/MyDrive/MyDrive/test/final_kfold3_100만개_epoch7.npy",
    "/content/MyDrive/MyDrive/test/final_kfold4_100만개_epoch6.npy",
]

(
    final_fold0_predict,
    final_fold1_predict,
    final_fold2_predict,
    final_fold3_predict,
    final_fold4_predict,
) = [np.load(path) for path in fold_prediction_paths]

In [None]:
def make_matrix(lst):
    """Return a list containing the first element of every item in ``lst``.

    The original note on this helper says the saved predictions are a
    triple-nested array and this step reduces them to a double-nested one.

    Args:
        lst: Iterable whose items support ``item[0]``.

    Returns:
        list: ``item[0]`` for each item, in the original order.
    """
    return [item[0] for item in lst]

import torch
from torch import nn
import torch.nn.functional as F

def get_softmax(matrix):
    """Apply softmax to every row of ``matrix`` independently.

    Args:
        matrix: Iterable of rows (a 2-D list per the original comment); each
            row must be convertible with ``torch.tensor``.

    Returns:
        list: One NumPy array per input row, holding that row's softmax values.
    """
    return [
        torch.nn.functional.softmax(torch.tensor(row), dim=0).numpy()
        for row in matrix
    ]

# For each fold: keep the first entry of every sample (make_matrix), then turn
# each resulting row into softmax probabilities (get_softmax).
# NOTE: the variables are overwritten in place, so re-running this cell without
# reloading the .npy files applies both steps a second time.
final_fold0_predict = get_softmax(make_matrix(final_fold0_predict))
final_fold1_predict = get_softmax(make_matrix(final_fold1_predict))
final_fold2_predict = get_softmax(make_matrix(final_fold2_predict))
final_fold3_predict = get_softmax(make_matrix(final_fold3_predict))
final_fold4_predict = get_softmax(make_matrix(final_fold4_predict))

### Softmax로 확률값으로 변환하기

In [None]:
# Ensemble: average the five folds' probabilities row by row, then take the
# index of the largest averaged probability as the predicted class.
fold_predictions = [
    final_fold0_predict,
    final_fold1_predict,
    final_fold2_predict,
    final_fold3_predict,
    final_fold4_predict,
]

# Take the row count from the data instead of hard-coding 100000: a fixed range
# raises IndexError on shorter inputs and silently drops rows on longer ones.
n_rows = len(fold_predictions[0])
if any(len(fold) != n_rows for fold in fold_predictions):
    raise ValueError("fold prediction arrays have different numbers of rows")

final_fold_sum_predict = [
    (p0 + p1 + p2 + p3 + p4) / 5
    for p0, p1, p2, p3, p4 in zip(*fold_predictions)
]

final_predict = np.array(final_fold_sum_predict).argmax(axis=1)

In [None]:
# Copy the test frame and attach the predicted classes, decoded back to their
# original label strings.
final_predict_df = test_preprocessed.assign(
    label=le.inverse_transform(final_predict)
)
final_predict_df.head()

In [None]:
# Build the ids "id_000001" ... "id_100000" (numbers zero-padded to six digits)
test_id = [f"id_{number:06d}" for number in range(1, 100001)]

In [None]:
# Split each predicted label string into its three submission fields.
# "AI_id" is not assigned here; the column selection that follows reads it from
# the frame copied out of test_preprocessed.
def _second_field(code):
    """Characters 1-2 for a 6-character label, otherwise character 1 only."""
    if len(code) == 6:
        return code[1:3]
    return code[1]


def _third_field(code):
    """Everything from character 3 for a 6-character label, otherwise the last two characters."""
    if len(code) == 6:
        return code[3:]
    return code[-2:]


predicted_labels = final_predict_df["label"]
final_predict_df["digit_1"] = predicted_labels.apply(lambda code: code[0])
final_predict_df["digit_2"] = predicted_labels.apply(_second_field)
final_predict_df["digit_3"] = predicted_labels.apply(_third_field)

In [None]:
# Keep only the id and the three split label fields, in this column order
submission_columns = ["AI_id","digit_1","digit_2","digit_3"]
final_predict_df = final_predict_df[submission_columns]
final_predict_df.head()

In [None]:
# Write the predictions to Drive as CSV without the index column; the
# "utf-8-sig" codec writes UTF-8 with a leading byte-order mark.
final_predict_df.to_csv("/content/MyDrive/MyDrive/test/predict_0412.csv",encoding="utf-8-sig",index=False)