In [None]:
!pip install transformers
!pip install keras
!pip install tensorflow
!pip install jupyter-resource-usage

In [1]:
# In[]
import gc
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import numpy as np
import random
import time
import datetime

import io
import pandas as pd
import torch
import torch.nn as nn
import torchvision.datasets as dsets

USE_CUDA = True
RANDOM_SEED=43 # 재현을 위해 랜덤시드 고정
TOKEN_MAX_LEN = 128*4
BATCH_SIZE = 12
STATUS_PRINT_INTERVAL=25

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)


if USE_CUDA and torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [4]:
#%%
data = pd.read_csv('data/labeled/samsung_2010_2021.csv', encoding='utf-8', dtype={'label':np.float32})
test_cnt = int(data.shape[0] * 0.25)

test = data[:test_cnt]
train = data[test_cnt:]

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
print('train&validation data processing')
# Train,Validation Data Preprocessing
input_ids = [tokenizer.encode(s,max_length=TOKEN_MAX_LEN,truncation=True) for s in train['text']]
input_ids = pad_sequences(input_ids, maxlen=TOKEN_MAX_LEN, dtype='long', truncating='post', padding='post')
attention_mask = []
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_mask.append(seq_mask)

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, train['label'].values, random_state=RANDOM_SEED, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_mask,input_ids,random_state=RANDOM_SEED,test_size=0.1)

train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

print('test data processing')
# Test Data Preprocessing
input_ids = [tokenizer.encode(sent,max_length=TOKEN_MAX_LEN,truncation=True) for sent in test['text']]
input_ids = pad_sequences(input_ids, maxlen=TOKEN_MAX_LEN, dtype="long", truncating="post", padding="post")
attention_masks = []
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(test['label'].values)
test_masks = torch.tensor(attention_masks)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=1).to(device)
print('Model Created')

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


def format_time(elapsed):
    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))

model.zero_grad()

for epoch_i in range(0, epochs):
    print('\n======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_loss = 0
    gc.collect()
    torch.cuda.empty_cache()
    model.train()
    for step, batch in enumerate(train_dataloader):
        pass
        if step and step % STATUS_PRINT_INTERVAL == 0:
            elapsed = format_time(time.time() - t0)
            print('{:>5,}/{:>5,}, Elapsed {:}'.format(step, len(train_dataloader), elapsed))

        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        pred = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)[0]
        loss = torch.mean(torch.abs(pred-b_labels))
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()
    avg_train_loss = total_loss / len(train_dataloader)

    print("\n  Average training loss: {0:.8f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    print("\nRunning Validation...")

    t0 = time.time()
    model.eval()
    eval_mae=0
    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            pred = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask)[0]
        loss = torch.mean(torch.abs(pred-b_labels))
        eval_mae += loss.item()
    print("  Validation MAE: {0:.8f}".format(eval_mae / len(validation_dataloader)))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
print("\nTraining complete!")

t0 = time.time()
model.eval()
eval_mae=0
for step, batch in enumerate(test_dataloader):
    if step and step % STATUS_PRINT_INTERVAL == 0:
        elapsed = format_time(time.time() - t0)
        print('{:>5,}/{:>5,}, Elapsed {:}'.format(step, len(test_dataloader), elapsed))
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
    with torch.no_grad():
        pred = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask)[0]
    loss = torch.mean(torch.abs(pred-b_labels))
    eval_mae += loss.item()
print("\nTest MAE: {0:.8f}".format(eval_mae / len(test_dataloader)))
print("Test took: {:}".format(format_time(time.time() - t0)))
# %%

train&validation data processing
test data processing


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Model Created

Training...
   25/  695, Elapsed 0:00:30
   50/  695, Elapsed 0:01:01
   75/  695, Elapsed 0:01:32
  100/  695, Elapsed 0:02:02
  125/  695, Elapsed 0:02:33
  150/  695, Elapsed 0:03:03
  175/  695, Elapsed 0:03:33
  200/  695, Elapsed 0:04:04
  225/  695, Elapsed 0:04:34
  250/  695, Elapsed 0:05:05
  275/  695, Elapsed 0:05:35
  300/  695, Elapsed 0:06:05
  325/  695, Elapsed 0:06:35
  350/  695, Elapsed 0:07:06
  375/  695, Elapsed 0:07:36
  400/  695, Elapsed 0:08:07
  425/  695, Elapsed 0:08:37
  450/  695, Elapsed 0:09:07
  475/  695, Elapsed 0:09:38
  500/  695, Elapsed 0:10:09
  525/  695, Elapsed 0:10:39
  550/  695, Elapsed 0:11:09
  575/  695, Elapsed 0:11:39
  600/  695, Elapsed 0:12:09
  625/  695, Elapsed 0:12:40
  650/  695, Elapsed 0:13:10
  675/  695, Elapsed 0:13:41

  Average training loss: 0.01001529
  Training epcoh took: 0:14:04

Running Validation...
  Validation MAE: 0.01536852
  Validation took: 0:00:34

Training...
   25/  695, Elapsed 0:00:31
 

In [48]:

model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=1).to(device)
print('TestSet AAD: {}'.format(np.mean(np.abs(np.median(test['label'])-test['label']))))
idx=12

with torch.no_grad():
    pred = model(test_inputs[idx:idx+1].to(device),token_type_ids=None,attention_mask=test_masks[idx:idx+1].to(device))[0]
    loss = torch.mean(torch.abs(pred-test_labels[idx:idx+1].to(device)))
    print('Label: {}'.format(test_labels[idx:idx+1]))
    print('Pred: {}'.format(pred))
    print('Loss: {}'.format(loss))
    print(tokenizer.decode(test_inputs[idx]))

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

TestSet AAD: 0.01192370243370533
Label: tensor([-0.0001])
Pred: tensor([[-0.0300]], device='cuda:0')
Loss: 0.029930904507637024
[CLS] 원본보기 ▲ 삼성전자 2010년형 노트PC CES서 공개삼성전자가 글로벌 노트PC 시장 공략을 본격화하고 있다. 삼성전자는 8일 미국 라스베이거스에서 열리고 있는 세계 최대 가전전시회 CES 2010에서 2010년형 프리미엄 노트북 신제품 6종을 공개한다고 밝혔다. 삼성전자는 우선 크리스털 원석과 같은 감성적인 스타일의 R780, R580, R480 제품을 선보였다. 이들 제품은 블랙 - 레드 그라데이션 컬러에 감각적인 [UNK] S [UNK] 자 모양의 배면 패턴을 적용, 빛의 각도에 따라 보다 깊이가 있는 S패턴을 연출한다. 아울러 은은하게 빛나는 발광다이오드 ( LED ) 터치패드 터치 라이팅 기술을 적용했으며 부드러운 터치감과 작업 효율이 높은 프리미엄 아일랜드 키보드 ( Island Keyboard ) 를 장착했다. 성능도 기존 플랫폼 대비 20 % 이상 속도가 빨라진 인텔의 2010년형 새 플랫폼인 [UNK] 칼펠라 ( Calpella ) [UNK] 프로세서를 탑재했으며 최대 4GB의 DDR3 1066MHZ 메모리를 내장해 한층 빠르고 강력해진 시스템 성능을 제공한다. 또한 엔디비아 최신 그래픽 ( GT 330M, 1GB gDDR3 ) 카드를 탑재해 3D게임의 다이나믹한 영상을 즐길 수 있다. 감각적인 디자인의 보급형 노트PC 라인업인 R730, R530, R430도 선보였다. 이들 제품은 크리스털 느낌의 다양한 패턴이 어우러진 세련된 디자인에 최신 기능을 두루 탑재한 것이 가장 큰 특징이다. 성면에서는 엔디비아 최신 그래픽 카드 ( 310M, 512MB gDDR3 ) 를 탑재했으며 HD LED 디스플레이를 적용해 밝고 선명한 HD 영상을 감상할 수 있다. 삼성전자 IT솔루션사업부장 남성우 부사장은 [UNK] 이번