In [3]:
import os

In [1]:
pwd

'/Users/salonisahal/Desktop/Text-Summarizer-/research'

In [4]:
# Step up one directory level: the recorded outputs show the working directory
# going from ".../Text-Summarizer-/research" to ".../Text-Summarizer-".
# Presumably this is so the `src.` package imports and the relative config
# paths used below resolve from the project root -- TODO confirm.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel moves up one more level each time.
os.chdir("../")

In [5]:
pwd

'/Users/salonisahal/Desktop/Text-Summarizer-'

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Attributes:
        root_dir: Directory under which this stage saves its output.
        data_path: Location of the dataset that is loaded from disk.
        tokenizer_name: Name of the pretrained tokenizer to load, e.g.
            ``"google/pegasus-xsum"``. This is an identifier string rather
            than a filesystem path, hence ``str`` and not ``Path``.
    """

    root_dir: Path
    data_path: Path
    tokenizer_name: str

In [7]:
from src.textSummarizer.constants import *
from src.textSummarizer.utils.common import read_yaml,create_directory

In [8]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directory([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the loaded config.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the configuration.
        """
        section = self.config.data_transformation
        create_directory([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
        )

In [9]:
from transformers import AutoTokenizer
from datasets import load_from_disk



  from .autonotebook import tqdm as notebook_tqdm


[2025-04-25 12:10:36,881: INFO: utils:NumExpr defaulting to 8 threads.]
[2025-04-25 12:10:37,128: INFO: config:TensorFlow version 2.13.0 available.]


In [10]:
class DataTransformation:
    """Tokenizes a dialogue/summary dataset and saves the encoded result."""

    def __init__(self, config: DataTransformationConfig):
        """Store the stage settings and load the configured tokenizer.

        Args:
            config: Settings providing ``tokenizer_name``, ``data_path`` and
                ``root_dir`` for this stage.
        """
        self.config = config
        # Load the tokenizer named in the configuration rather than a
        # hard-coded checkpoint, so changing the config actually takes effect.
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def convert_examples_to_features(self, example_batch):
        """Encode one batch of examples into model inputs and labels.

        Args:
            example_batch: Mapping with ``'dialogue'`` and ``'summary'``
                entries holding the source texts and their target summaries.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` from the encoded
            dialogues and ``'labels'`` holding the encoded summaries' ids.
        """
        input_encodings = self.tokenizer(
            example_batch['dialogue'], max_length=1024, truncation=True
        )
        # `text_target=` replaces the deprecated `as_target_tokenizer()`
        # context manager for encoding the label side.
        target_encodings = self.tokenizer(
            text_target=example_batch['summary'], max_length=128, truncation=True
        )

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids'],
        }

    def convert(self):
        """Load the dataset, tokenize it, and save it under ``root_dir``.

        The result is written to ``<root_dir>/samsum_dataset``.
        """
        dataset_samsum = load_from_disk(self.config.data_path)
        # Calling map() without a function is an identity transform; pass the
        # feature converter (batched, since it encodes whole columns at once)
        # so the saved dataset actually contains the tokenized features.
        dataset_samsum_pt = dataset_samsum.map(
            self.convert_examples_to_features, batched=True
        )
        dataset_samsum_pt.save_to_disk(
            os.path.join(self.config.root_dir, "samsum_dataset")
        )


In [11]:
# Run the data-transformation stage end to end: read the configuration, build
# the transformer, show the loaded tokenizer, then tokenize and save the data.
# Any exception propagates as-is; the previous `except Exception as e: raise e`
# wrapper only re-raised and added nothing.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
print(data_transformation.tokenizer)
data_transformation.convert()
    

[2025-04-25 12:10:49,497: INFO: common:yaml file:config/config.yaml loaded Successfully]
[2025-04-25 12:10:49,499: INFO: common:yaml file:params.yaml loaded Successfully]
[2025-04-25 12:10:49,500: INFO: common:Directory artifacts created successfully.]
[2025-04-25 12:10:49,501: INFO: common:Directory artifacts/data_transformation created successfully.]
PegasusTokenizerFast(name_or_path='google/pegasus-xsum', vocab_size=96103, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '<mask_2>', 'additional_special_tokens': ['<mask_1>', '<unk_2>', '<unk_3>', '<unk_4>', '<unk_5>', '<unk_6>', '<unk_7>', '<unk_8>', '<unk_9>', '<unk_10>', '<unk_11>', '<unk_12>', '<unk_13>', '<unk_14>', '<unk_15>', '<unk_16>', '<unk_17>', '<unk_18>', '<unk_19>', '<unk_20>', '<unk_21>', '<unk_22>', '<unk_23>', '<unk_24>', '<unk_25>', '<unk_26>', '<unk_27>', '<unk_28>', '<unk_29>', '<unk_30>'

Map: 100%|██████████| 14732/14732 [00:00<00:00, 51643.21 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 44018.18 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 44516.62 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 1317691.05 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 332089.61 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 332939.42 examples/s]
