In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the later
# os.chdir calls point at folders below this mount point.
from google.colab import drive

drive.mount("/content/drive")

In [2]:
import os

# Work from the KoGPT2 project folder on the mounted Drive, so the relative
# requirements.txt path used by the install cell resolves here.
# NOTE(review): presumably this is also what makes the `kogpt2` package importable — confirm.
os.chdir("/content/drive/My Drive/TMI/KoGPT2")

In [None]:
!pip install -r requirements.txt

In [None]:
import argparse
import re
import subprocess

import gluonnlp
import numpy as np
import pandas as pd
import torch
from gluonnlp.data import SentencepieceTokenizer
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

from kogpt2.model.sample import sample_sequence
from kogpt2.model.torch_gpt2 import GPT2Config, GPT2LMHeadModel
from kogpt2.utils import download, get_tokenizer, tokenizer

In [None]:
# Download descriptor for the pretrained PyTorch KoGPT-2 weights; main() passes
# the url / fname / chksum entries to download().
pytorch_kogpt2 = dict(
	url='https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
	fname='pytorch_kogpt2_676e9bcfa7.params',
	chksum='676e9bcfa7',
)

# Model hyper-parameters; main() turns this mapping into a GPT2Config via
# GPT2Config.from_dict().
kogpt2_config = dict(
	initializer_range=0.02,
	layer_norm_epsilon=1e-05,
	n_ctx=1024,
	n_embd=768,
	n_head=12,
	n_layer=12,
	n_positions=1024,
	vocab_size=50000,
	output_past=None,
)

In [5]:
class Read_Dataset(Dataset):
	"""Token-id dataset built from one text column of a CSV file.

	Every usable row is stored as ``[[bos_id, *token_ids, eos_id]]`` (a list
	holding a single id list), which is the layout the training loop indexes
	with ``data[0]``.
	"""

	def __init__(self, file_path, vocab, tokenizer, min_len=100, max_len=1024, column="답변"):
		"""Read ``file_path`` with pandas and tokenize the rows of ``column``.

		Args:
			file_path: Path of the CSV file to read.
			vocab: Vocabulary exposing ``bos_token`` / ``eos_token`` and mapping a
				token (or a list of tokens) to id(s) through ``vocab[...]``.
			tokenizer: Callable turning a string into a list of tokens.
			min_len: Sequences with fewer ids than this (BOS/EOS included) are dropped.
			max_len: Sequences with more ids than this (BOS/EOS included) are dropped.
			column: Name of the CSV column holding the text.
		"""
		self.file_path = file_path
		self.data = []
		self.vocab = vocab
		self.tokenizer = tokenizer

		df = pd.read_csv(self.file_path)

		print("tokenizer ending")
		for text in df[column]:
			# Empty CSV cells arrive as NaN floats; skip anything that is not text
			# instead of crashing on len() or aborting the whole load.
			if not isinstance(text, str) or len(text) < 3:
				continue

			# NOTE(review): the final character of every answer is dropped here; this
			# looks like a leftover newline strip from a line-based reader — confirm.
			tokenized_line = tokenizer(text[:-1])

			index_of_words = [vocab[vocab.bos_token]] + vocab[tokenized_line] + [vocab[vocab.eos_token]]

			if not min_len <= len(index_of_words) <= max_len:
				continue

			self.data.append([index_of_words])

		# Report (number of samples, 1) without handing the ragged nested list to
		# NumPy, which refuses inhomogeneous shapes in recent releases.
		print((len(self.data), 1))

	def __len__(self):
		"""Return the number of stored samples."""
		return len(self.data)

	def __getitem__(self, index):
		"""Return the sample at ``index`` as a one-element list wrapping the id list."""
		return self.data[index]

In [6]:
def auto_enter(text):
	"""Re-flow generated text into paragraphs separated by blank lines.

	A run of three spaces is treated as a line break; empty lines are
	discarded, leading whitespace is removed from every remaining line, and the
	lines are joined with a double newline.
	"""
	paragraphs = []
	for chunk in text.replace("   ", "\n").split("\n"):
		if chunk == '':
			continue
		paragraphs.append(chunk.lstrip())
	return "\n\n".join(paragraphs)


def main(epoch, save_path, load_path, samples, data_file_path, batch_size):
	"""Fine-tune the pretrained KoGPT-2 language model on a CSV of answers.

	Loss values are logged to TensorBoard every 100 steps, a text sample is
	generated every 1000 steps (and on a short final batch), and a checkpoint is
	written every 10000 steps (and on a short final batch).

	Args:
		epoch: Number of passes over the dataset.
		save_path: Prefix prepended to ``KoGPT2_checkpoint_<step>.tar`` when saving.
		load_path: Checkpoint to resume from, or ``None`` to start from the
			pretrained weights with the step counter at 0.
		samples: Directory that receives numbered sample files once the step
			counter exceeds 500000.
		data_file_path: CSV file handed to ``Read_Dataset``.
		batch_size: Batch size for the ``DataLoader``.
	"""
	# Fall back to CPU so the function stays usable on a runtime without a GPU.
	ctx = 'cuda' if torch.cuda.is_available() else 'cpu'
	cachedir = '~/kogpt2/'
	device = torch.device(ctx)

	summary = SummaryWriter()

	model_path = download(pytorch_kogpt2['url'],
						   pytorch_kogpt2['fname'],
						   pytorch_kogpt2['chksum'],
						   cachedir=cachedir)

	# `tokenizer` is the download descriptor imported from kogpt2.utils.
	vocab_path = download(tokenizer['url'],
						   tokenizer['fname'],
						   tokenizer['chksum'],
						   cachedir=cachedir)

	kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
	kogpt2model.load_state_dict(torch.load(model_path, map_location=device))
	kogpt2model.to(device)

	count = 0
	if load_path:
		try:
			checkpoint = torch.load(load_path, map_location=device)
			# Load into a fresh model and swap only on success, so a broken
			# checkpoint cannot replace the pretrained weights with random ones.
			resumed_model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
			resumed_model.load_state_dict(checkpoint['model_state_dict'])
		except Exception as err:
			# Best effort: keep training from the pretrained weights, but say why.
			print('could not load checkpoint {0!r}: {1!r}; starting from pretrained weights'.format(load_path, err))
		else:
			kogpt2model = resumed_model.to(device)
			if 'train_no' in checkpoint:
				# Checkpoints written by this function record the step counter.
				count = int(checkpoint['train_no'])
			else:
				# Otherwise recover it from the file name (KoGPT2_checkpoint_<step>.tar).
				numbers = re.findall(r"\d+", os.path.basename(load_path))
				count = int(numbers[-1]) if numbers else 0
			# NOTE(review): the saved optimizer_state_dict is not restored on resume.

	print(count)

	kogpt2model.train()
	vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(vocab_path,
														 mask_token=None,
														 sep_token=None,
														 cls_token=None,
														 unknown_token='<unk>',
														 padding_token='<pad>',
														 bos_token='<s>',
														 eos_token='</s>')

	tok_path = get_tokenizer()
	model, vocab = kogpt2model, vocab_b_obj
	tok = SentencepieceTokenizer(tok_path)

	dataset = Read_Dataset(data_file_path, vocab, tok)
	print("Read_Dataset ok")
	# NOTE(review): samples have different lengths, so batch_size > 1 presumably
	# fails in the default collate function — confirm before raising it.
	data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=(ctx == 'cuda'))

	learning_rate = 3e-5
	optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

	print('KoGPT-2 Transfer Learning Start')
	# (decayed sum of losses, decayed number of steps); their ratio is the running average.
	avg_loss = (0.0, 0.0)

	try:
		for epoch_idx in range(epoch):
			for data in data_loader:
				optimizer.zero_grad()
				# data[0] is a list with one tensor per token position; stack and
				# transpose so the batch dimension comes first.
				data = torch.stack(data[0])
				data = data.transpose(1, 0)
				data = data.to(device)
				# Sampling below moves the model to the CPU, so move it back each step.
				model = model.to(device)
				last_batch = len(data) < batch_size

				outputs = model(data, labels=data)
				loss = outputs[0]
				loss.backward()
				optimizer.step()

				# Track a plain float so the running average does not keep autograd history alive.
				loss_value = loss.item()
				avg_loss = (avg_loss[0] * 0.99 + loss_value, avg_loss[1] * 0.99 + 1.0)
				if count % 100 == 0:
					running_avg = avg_loss[0] / avg_loss[1]
					print('epoch no.{0} train no.{1}  loss = {2:.5f} avg_loss = {3:.5f}' . format(epoch_idx, count, loss_value, running_avg))
					summary.add_scalar('loss/avg_loss', running_avg, count)
					summary.add_scalar('loss/loss', loss_value, count)

				if (count > 0 and count % 1000 == 0) or last_batch:
					# Generate without dropout or gradient tracking, then resume training mode.
					model.eval()
					with torch.no_grad():
						sent = sample_sequence(model.to("cpu"), tok, vocab, sent="지원", text_size=100, temperature=0.7, top_p=0.8, top_k=40)
					model.train()
					sent = sent.replace("<unused0>", "\n")
					sent = auto_enter(sent)
					print(sent)

					summary.add_text('Text', sent, count)

					if count > 500000:
						# Sample files are named by increasing integers; ignore any other
						# files in the directory and start at 1 when none exist yet.
						existing = [int(name) for name in os.listdir(samples) if name.isdigit()]
						next_index = max(existing, default=0) + 1
						with open(os.path.join(samples, str(next_index)), 'w', encoding="utf-8") as f:
							f.write(sent)

				count += 1

				if (count > 0 and count % 10000 == 0) or last_batch:
					checkpoint_file = save_path + 'KoGPT2_checkpoint_' + str(count) + '.tar'
					try:
						torch.save({
							'epoch': epoch_idx,
							'train_no': count,
							'model_state_dict': model.state_dict(),
							'optimizer_state_dict': optimizer.state_dict(),
							'loss': loss.detach()
						}, checkpoint_file)
					except Exception as err:
						# A failed save must not abort a long training run, but it must be visible.
						print('could not save checkpoint {0!r}: {1!r}'.format(checkpoint_file, err))
	finally:
		# Flush pending TensorBoard events even when training is interrupted.
		summary.close()

In [7]:
# Switch to the parent TMI folder: the relative paths used from here on
# (jobkorea_all.csv, dataset.txt) are resolved against this working directory.
# NOTE(review): presumably the `preprocess` module imported next lives here too — confirm.
os.chdir("/content/drive/My Drive/TMI")

In [None]:
import pandas as pd
from preprocess import _preprocess_qna

# Read the raw CSV, run it through _preprocess_qna and keep the question /
# answer / review columns in `df`.
df = _preprocess_qna(pd.read_csv("jobkorea_all.csv"))[["질문", "답변", "총평"]]

# Only the answer column is exported; despite the .txt suffix the file is
# written by to_csv (header row included, index omitted).
df["답변"].to_csv("dataset.txt", index=False)

In [10]:
# Report whether CUDA is usable and how many devices are visible, one value per line.
print(torch.cuda.is_available(), torch.cuda.device_count(), sep="\n")

True
1


In [9]:
# Show the GPU model, driver / CUDA versions and current memory usage of the runtime.
!nvidia-smi

Mon Aug 24 15:12:59 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.57       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    58W / 149W |    432MiB / 11441MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Run one epoch of fine-tuning with single-sample batches.
# load_path=None means no checkpoint is resumed and the step counter starts at 0.
# Checkpoints are written with the save_path prefix; sample files go to `samples`
# only after the step counter exceeds 500000.
main(epoch=1,
     save_path="/content/drive/My Drive/TMI/KoGPT2/",
     load_path=None,
     samples="/content/drive/My Drive/TMI/KoGPT2/",
     data_file_path="dataset.txt",
     batch_size=1)

[██████████████████████████████████████████████████]
[██████████████████████████████████████████████████]
0
using cached model
tokenizer ending
(23335, 1)
Read_Dataset ok
KoGPT-2 Transfer Learning Start
epoch no.0 train no.0  loss = 5.63261 avg_loss = 5.63261
epoch no.0 train no.100  loss = 5.16856 avg_loss = 4.78414
epoch no.0 train no.200  loss = 4.04943 avg_loss = 4.67989
epoch no.0 train no.300  loss = 4.85964 avg_loss = 4.67364
epoch no.0 train no.400  loss = 4.88377 avg_loss = 4.64908
epoch no.0 train no.500  loss = 4.78443 avg_loss = 4.61110
epoch no.0 train no.600  loss = 4.92731 avg_loss = 4.62429
epoch no.0 train no.700  loss = 4.31770 avg_loss = 4.56981
epoch no.0 train no.800  loss = 4.55606 avg_loss = 4.56274
epoch no.0 train no.900  loss = 4.79640 avg_loss = 4.53951
epoch no.0 train no.1000  loss = 4.22772 avg_loss = 4.50387
101
to_tokens: ['▁저는', '자의', '▁지원', '학기', '▁이상의', '▁전공', '에', '▁지원', '▁다양한', '에', '▁대해', '▁전문성을', '한', '▁후', '▁지원', '개', '▁이상의', '▁직무', '에', '▁걸쳐', '