<a href="https://colab.research.google.com/github/sunnie720/stockReport_summarization/blob/main/KoT5_%E1%84%8B%E1%85%A6%E1%84%91%E1%85%A9%E1%86%A82.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.3 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 67.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 61.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
# Importing stock libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import pipeline

In [None]:
# # Setting up the device for GPU usagea
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
device

'cuda'

In [None]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.news
        self.ctext = self.data.summary

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]
        
        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # xm.optimizer_step(optimizer)
        # xm.mark_step()

In [None]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
TRAIN_BATCH_SIZE = 4    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 4    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 2        # number of epochs to train (default: 10)
VAL_EPOCHS = 1 
LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
SEED = 42               # random seed (default: 42)
MAX_LEN = 512
SUMMARY_LEN = 80

# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(SEED) # pytorch random seed
np.random.seed(SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
tokenizer = AutoTokenizer.from_pretrained("noahkim/KoT5_news_summarization")


# Importing and Pre-Processing the domain data
# Selecting the needed columns only. 
# Adding the summarzie text in front of the text. This is to format the dataset similar to how T5 model was trained for summarization task. 
df_train = pd.read_csv('/content/drive/MyDrive/DA_KoT5/data/final_train_clean.tsv',sep='\t')
df_test = pd.read_csv('/content/drive/MyDrive/DA_KoT5/data/final_test_clean.tsv',sep='\t')
df_train.summary = 'summarize: ' + df_train.summary
df_test.summary = 'summarize: ' + df_test.summary
print(df_train.head())


# Creation of Dataset and Dataloader
# Defining the trainsize. So 80% of the data will be used for training and the rest will be used for validation. 

print("TRAIN Dataset: {}".format(df_train.shape))
print("Test Dataset: {}".format(df_test.shape))


# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(df_train, tokenizer, MAX_LEN, SUMMARY_LEN)
test_set = CustomDataset(df_test, tokenizer, MAX_LEN, SUMMARY_LEN)

# Defining the parameters for creation of dataloaders
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(test_set, **val_params)



# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = AutoModelForSeq2SeqLM.from_pretrained("noahkim/KoT5_news_summarization")
model = model.to(device)

# Defining the optimizer that will be used to tune the weights of the network in the training session. 
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

# Training loop
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)


# Validation loop and saving the resulting file with predictions and acutals in a dataframe.
# Saving the dataframe as predictions.csv
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv('/content/drive/MyDrive/DA_KoT5/data/predictions2.csv')
    print('Output Files generated for review')

Downloading:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

                                                news  \
0  1분기 취급고 최고 실적 달성 현대홈쇼핑 별도기준 1분기 실적은 취급고 8,789억...   
1  3분기 실적은 시장과 당사 추정치 상회 동사의 3분기 연결 실적은 매출총이익 278...   
2   엊저녁에도 먹은 게 없는데 아침에도 밥이 넘어가지 않는다. 숭늉 몇 모금 마신 게...   
3   한국인들의 테이블 매너가 하인격(下人格)임은 이미 정평이 나 있다. 식사 자세나 ...   
4  5월 실적 발표, 해외 사업 호실적 지속 오리온이 5월 월별 실적을 발표하였다. 1...   

                                             summary  length  length_summ  
0  summarize: 현대홈쇼핑 1분기 실적이 발표되었다. 현대홈쇼핑 1분기 실적은 ...     222           33  
1  summarize: 동사의 3분기 실적은 시장 기대치를 상회했습니다. 4분기에도 실...     221           21  
2  summarize: 배낭을 포터에게 넘겨 맨몸인데도 몇십 m 걷고 나서는 주저앉아 ...     187           15  
3  summarize: 한국인들은 식사의 시작과 끝도 모르는 무례함으로 식당을 무시하여...     203           15  
4  summarize: 오리온 5월 실적은 해외 사업의 호실적이 지속되었습니다. 이에 ...     213           20  
TRAIN Dataset: (30626, 4)
Test Dataset: (3818, 4)


Downloading:   0%|          | 0.00/813 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Initiating Fine-Tuning for the model on our dataset




Epoch: 0, Loss:  4.837522506713867
Epoch: 0, Loss:  1.4972236156463623
Epoch: 0, Loss:  1.6022861003875732
Epoch: 0, Loss:  1.6178325414657593
Epoch: 0, Loss:  2.4130265712738037
Epoch: 0, Loss:  1.3091421127319336
Epoch: 0, Loss:  1.5908522605895996
Epoch: 0, Loss:  2.1882500648498535
Epoch: 0, Loss:  1.6534329652786255
Epoch: 0, Loss:  1.5900858640670776
Epoch: 0, Loss:  1.279703974723816
Epoch: 0, Loss:  2.0344433784484863
Epoch: 0, Loss:  1.8160114288330078
Epoch: 0, Loss:  1.6321911811828613
Epoch: 0, Loss:  1.5274924039840698
Epoch: 0, Loss:  1.571789026260376
Epoch: 1, Loss:  1.4062248468399048
Epoch: 1, Loss:  1.3579702377319336
Epoch: 1, Loss:  1.5107606649398804
Epoch: 1, Loss:  1.8214466571807861
Epoch: 1, Loss:  1.0988507270812988
Epoch: 1, Loss:  1.1684317588806152
Epoch: 1, Loss:  1.4857028722763062
Epoch: 1, Loss:  1.2817463874816895
Epoch: 1, Loss:  1.6386401653289795
Epoch: 1, Loss:  0.5889105200767517
Epoch: 1, Loss:  1.406765103340149
Epoch: 1, Loss:  1.9399232864379

In [None]:
final = pd.read_csv('/content/drive/MyDrive/DA_KoT5/data/predictions2.csv', index_col=0)
print(final)

                                         Generated Text  \
0     호텔닷컴은 지난 5월 자녀가 있는 가정을 대상으로 설문조사를 진행한 결과, 국내서 ...   
1     유기농 여성용품 브랜드 '라엘'이 국내 공식 쇼핑몰을 오픈했다. 이번 오픈은 다양한...   
2     대한항공의 3분기 영업이익은 2,503억원으로 시장 기대치에 부합했을 전망이다. 메...   
3     음료베이스 및 퓨레, HMR 제품을 제조하는 식품 제조 전문기업 우양이 이번 코스닥...   
4     동사는 자동차용 스틸 파이프와 변속기용 플레이트를 생산하는 업체로 국내에서는 1위이...   
...                                                 ...   
3813  박정호 SK텔레콤 사장의 총 보수가 38억8100만원으로 같은 이통3사 ceo중 최...   
3814  KB 주택 가격 동향에 따르면 이달 전국 아파트 평균 전세가율은 70.6%로, 서울...   
3815  HDC현대산업개발은 2분기 영업이익이 1958억원으로 작년 동기 대비 96.4% 증...   
3816  LS전선아시아 상장 예비심사 통과, 조만간 KOSPI에 상장 LS전선아시아는 베트남...   
3817  소설 동의보감이 출간되자 담박에 베스트셀러가 되었고, 나라에서는 1991년을 허준의...   

                                            Actual Text  
0     호텔스닷컴이 그랜드 워커힐 서울과 협업해 꾸민 '호텔스닷컴 벨퍼그 랜드 룸' 세계적...  
1     [이데일리 이성웅 기자] 유기농 여성용품 브랜드 라엘은 국내 공식 쇼핑몰을 열었다고...  
2     3Q 영업이익 2,503억원, 메르스 효과에도 양호한 실적 지난 3분기 대한항공은 ...  
3     이구열 우양 대표는 6일 서울 여의도에서 열린 기업공개(IPO) 기자간담회에서 중장...  
4

In [None]:
final.iloc[2, 0]

'대한항공의 3분기 영업이익은 2,503억원으로 시장 기대치에 부합했을 전망이다. 메르스 효과로 인한 중국 노선 부진이 성수기 내내 이어졌지만 원화 가치 하락에 따른 외화환산 손실로 순적자를 기록할 '

In [None]:
# 리스트로 만들기
label = list(final['Actual Text'])
pred = list(final['Generated Text'])

In [None]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from rouge import Rouge

rouge = Rouge()
rouge.get_scores(pred, label, avg=True)

{'rouge-1': {'r': 0.27146351364692267,
  'p': 0.2873978217851045,
  'f': 0.27695345063002624},
 'rouge-2': {'r': 0.15486854620216978,
  'p': 0.16385161216848584,
  'f': 0.1581981412894046},
 'rouge-l': {'r': 0.2629356246787778,
  'p': 0.27852072903324004,
  'f': 0.26832668673858917}}