In [1]:
import os

In [2]:
%pwd

'd:\\A_Category\\iNeuron\\End-To-End-NLP-Project-News-Article-Sorting\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\A_Category\\iNeuron\\End-To-End-NLP-Project-News-Article-Sorting'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen = True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    tokenizer_name: Path

In [6]:
from ArticleSorting.constants import *
from ArticleSorting.utils.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    def __init__(self, 
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) ->DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path= config.data_path,
            tokenizer_name=config.tokenizer_name
        )

        return data_transformation_config


In [12]:
import os
from ArticleSorting.logging import logger

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset

import torch
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np


In [18]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
        self.df = pd.read_csv(self.config.data_path)


    def encode_categories(self):
        self.df["encoded_label"] = self.df["Category"].astype("category").cat.codes
        hg_data = Dataset.from_pandas(self.df)
        return hg_data
   
    def tokenize_dataset(self, data):
        return self.tokenizer(data["Text"],
                     max_length=512,
                     truncation=True,
                     padding="max_length")
    
    def convert(self):
        data = self.encode_categories()
        tokenized_data = data.map(self.tokenize_dataset)
        # Remove  columnms
        tokenized_data = tokenized_data.remove_columns(["ArticleId", "Text", "Category", "__index_level_0__"])
        # Rename label to labels because the model expects the name labels
        dataset = tokenized_data.rename_column("encoded_label", "labels")
        # Change the format to PyTorch tensors
        dataset.set_format("torch")
        print(dataset)

        dataset.save_to_disk(os.path.join(self.config.root_dir,"BBC dataset"))


In [19]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.convert()
  
except Exception as e:
    raise e

[2023-10-26 23:12:40,439:  INFO: common: yaml file:config\config.yaml loaded successfully]
[2023-10-26 23:12:40,443:  INFO: common: yaml file:params.yaml loaded successfully]
[2023-10-26 23:12:40,447:  INFO: common: created directory at : artifacts]
[2023-10-26 23:12:40,450:  INFO: common: created directory at : artifacts/data_transformation]


                                                                

ValueError: Column name __index_level_0__ not in the dataset. Current columns in the dataset: ['ArticleId', 'Text', 'Category', 'encoded_label', 'input_ids', 'token_type_ids', 'attention_mask']