In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 31.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 80.6 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 59.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Importing stock libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import pipeline

In [4]:
# # Setting up the device for GPU usagea
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [5]:
device

'cuda'

In [6]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.news
        self.ctext = self.data.summary

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [7]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]
        
        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # xm.optimizer_step(optimizer)
        # xm.mark_step()

In [8]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [10]:
TRAIN_BATCH_SIZE = 4    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 4    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 3        # number of epochs to train (default: 10)
VAL_EPOCHS = 1 
LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
SEED = 42               # random seed (default: 42)
MAX_LEN = 512
SUMMARY_LEN = 50 

# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(SEED) # pytorch random seed
np.random.seed(SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
tokenizer = AutoTokenizer.from_pretrained("noahkim/KoT5_news_summarization")


# Importing and Pre-Processing the domain data
# Selecting the needed columns only. 
# Adding the summarzie text in front of the text. This is to format the dataset similar to how T5 model was trained for summarization task. 
df_train = pd.read_csv('/content/drive/MyDrive/kbalbert/data/final_train_clean.tsv',sep='\t')
df_test = pd.read_csv('/content/drive/MyDrive/kbalbert/data/final_test_clean.tsv',sep='\t')
df_train.summary = 'summarize: ' + df_train.summary
df_test.summary = 'summarize: ' + df_test.summary
print(df_train.head())


# Creation of Dataset and Dataloader
# Defining the trainsize. So 80% of the data will be used for training and the rest will be used for validation. 

print("TRAIN Dataset: {}".format(df_train.shape))
print("Test Dataset: {}".format(df_test.shape))


# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(df_train, tokenizer, MAX_LEN, SUMMARY_LEN)
test_set = CustomDataset(df_test, tokenizer, MAX_LEN, SUMMARY_LEN)

# Defining the parameters for creation of dataloaders
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(test_set, **val_params)



# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = AutoModelForSeq2SeqLM.from_pretrained("noahkim/KoT5_news_summarization")
model = model.to(device)

# Defining the optimizer that will be used to tune the weights of the network in the training session. 
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

# Training loop
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)


# Validation loop and saving the resulting file with predictions and acutals in a dataframe.
# Saving the dataframe as predictions.csv
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv('/content/drive/MyDrive/kbalbert/data/predictions2.csv')
    print('Output Files generated for review')

                                                news  \
0  1분기 취급고 최고 실적 달성 현대홈쇼핑 별도기준 1분기 실적은 취급고 8,789억...   
1  3분기 실적은 시장과 당사 추정치 상회 동사의 3분기 연결 실적은 매출총이익 278...   
2   엊저녁에도 먹은 게 없는데 아침에도 밥이 넘어가지 않는다. 숭늉 몇 모금 마신 게...   
3   한국인들의 테이블 매너가 하인격(下人格)임은 이미 정평이 나 있다. 식사 자세나 ...   
4  5월 실적 발표, 해외 사업 호실적 지속 오리온이 5월 월별 실적을 발표하였다. 1...   

                                             summary  length  length_summ  
0  summarize: 현대홈쇼핑 1분기 실적이 발표되었다. 현대홈쇼핑 1분기 실적은 ...     222           33  
1  summarize: 동사의 3분기 실적은 시장 기대치를 상회했습니다. 4분기에도 실...     221           21  
2  summarize: 배낭을 포터에게 넘겨 맨몸인데도 몇십 m 걷고 나서는 주저앉아 ...     187           15  
3  summarize: 한국인들은 식사의 시작과 끝도 모르는 무례함으로 식당을 무시하여...     203           15  
4  summarize: 오리온 5월 실적은 해외 사업의 호실적이 지속되었습니다. 이에 ...     213           20  
TRAIN Dataset: (30626, 4)
Test Dataset: (3818, 4)


Downloading:   0%|          | 0.00/813 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Initiating Fine-Tuning for the model on our dataset




Epoch: 0, Loss:  4.807136535644531
Epoch: 0, Loss:  1.719322681427002
Epoch: 0, Loss:  1.4292476177215576
Epoch: 0, Loss:  1.518933653831482
Epoch: 0, Loss:  2.404712677001953
Epoch: 0, Loss:  1.268314003944397
Epoch: 0, Loss:  1.7397750616073608
Epoch: 0, Loss:  2.123887300491333
Epoch: 0, Loss:  1.6115427017211914
Epoch: 0, Loss:  1.7014131546020508
Epoch: 0, Loss:  1.5895202159881592
Epoch: 0, Loss:  1.9859614372253418
Epoch: 0, Loss:  1.7019606828689575
Epoch: 0, Loss:  1.7827463150024414
Epoch: 0, Loss:  1.3692964315414429
Epoch: 0, Loss:  1.5062108039855957
Epoch: 1, Loss:  1.586114525794983
Epoch: 1, Loss:  1.3709710836410522
Epoch: 1, Loss:  1.582084059715271
Epoch: 1, Loss:  2.1458122730255127
Epoch: 1, Loss:  1.0963730812072754
Epoch: 1, Loss:  1.018862247467041
Epoch: 1, Loss:  1.1788185834884644
Epoch: 1, Loss:  1.1945492029190063
Epoch: 1, Loss:  1.6988463401794434
Epoch: 1, Loss:  0.6024692058563232
Epoch: 1, Loss:  1.517054796218872
Epoch: 1, Loss:  2.052272319793701
Epo

Rouge score

In [14]:
final = pd.read_csv('/content/drive/MyDrive/kbalbert/data/predictions2.csv', index_col=0)
final

Unnamed: 0,Generated Text,Actual Text
0,"호텔닷컴은 지난 5월 자녀가 있는 가정을 대상으로 설문조사를 진행한 결과, 국내에서...",호텔스닷컴이 그랜드 워커힐 서울과 협업해 꾸민 '호텔스닷컴 벨퍼그 랜드 룸' 세계적...
1,유기농 여성용품 브랜드 '라엘'이 국내 공식 쇼핑몰을 오픈했다. 18일 LA에 따르...,[이데일리 이성웅 기자] 유기농 여성용품 브랜드 라엘은 국내 공식 쇼핑몰을 열었다고...
2,"대한항공은 3Q 매출액 2조4,590억원(YoY +6.1%), 영업이익 2,503억...","3Q 영업이익 2,503억원, 메르스 효과에도 양호한 실적 지난 3분기 대한항공은 ..."
3,우양식품은 지난 6일 서울 여의도에서 열린 기업공개(IPO) 기자간담회에서 중장기 ...,이구열 우양 대표는 6일 서울 여의도에서 열린 기업공개(IPO) 기자간담회에서 중장...
4,자동차용 스틸 파이프와 변속기용 플레이트 생산 업체 동사는 자동차용 스틸 파이프와 ...,자동차용 스틸 파이프와 변속기용 플레이트 부문 일인자 동사는 1977년 설립되었으며...
...,...,...
3813,SK텔레콤이 박정호 사장의 총 보수가 38억8100만원이라고 14일 밝혔다. 같은 ...,SK텔레콤 박정호 사장이 3일 오전 서울 중구 SK타워에서 진행된 '5GX 서비스'...
3814,전세가율이 80%를 넘는 지역의 경우 새 아파트로 갈아타려는 수요자들의 관심이 쏠릴...,전세가율(매매가격 대비 전사가격 비율)이 80%가 넘는 지역 내 신규 물량에 내 집...
3815,HDC현대산업개발은 연결기준 2분기 영업이익이 1958억원으로 작년 동기 대비 96...,GS건설과 HDC현대산업개발이 올 2분기 실적에선 희비가 엇갈렸지만 수익성이나 재무...
3816,"LS전선아시아 상장 예비심사 통과, 조만간 KOSPI에 상장 예정 LS전선아시아가 ...","LS전선아시아 상장, 재무구조 개선, 전선 가치 재조명 계기 LS의 손자회사인 LS..."


In [17]:
# 데이터프레임 열 원소 길이 모두 보이게 하기
pd.set_option("max_colwidth", None)

In [29]:
final.loc[18]['Generated Text']

'1분기 실적, 시장 기대치 충족 신세계푸드는 1분기에 매출액 2조 6,059억원(+4.3% YoY), 영업이익'

In [13]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [15]:
# 리스트로 만들기
label = list(final['Actual Text'])
pred = list(final['Generated Text'])

In [16]:
from rouge import Rouge

rouge = Rouge()
rouge.get_scores(pred, label, avg=True)

{'rouge-1': {'r': 0.24683344063835813,
  'p': 0.2701026979496089,
  'f': 0.25590992065036894},
 'rouge-2': {'r': 0.139210296172966,
  'p': 0.1535117008240181,
  'f': 0.1449235528914146},
 'rouge-l': {'r': 0.238677660110994,
  'p': 0.2613465665313218,
  'f': 0.24753830360925433}}