In [1]:
import os

In [2]:
%pwd

'd:\\A_Category\\iNeuron\\End-To-End-NLP-Project-News-Article-Sorting\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\A_Category\\iNeuron\\End-To-End-NLP-Project-News-Article-Sorting'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen = True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    tokenizer_name: Path

In [6]:
from ArticleSorting.constants import *
from ArticleSorting.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(self, 
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) ->DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path= config.data_path,
            tokenizer_name=config.tokenizer_name
        )

        return data_transformation_config


In [8]:
import os
from ArticleSorting.logging import logger

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset

import torch
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
        

    def encode_categories(self):
        df = pd.read_csv(self.config.data_path)
        df['encoded_label'] = df['Category'].astype('category').cat.codes

        ## Spliting the Data
        # Training dataset
        train_data = df.sample(frac=0.8, random_state=42)
        # Testing dataset
        test_data = df.drop(train_data.index)

        # Convert pyhton dataframe to Hugging Face arrow dataset
        hg_train_data = Dataset.from_pandas(train_data)
        hg_test_data = Dataset.from_pandas(test_data)
        print(hg_train_data, hg_test_data)
        return hg_train_data, hg_test_data
   
    def tokenize_dataset(self, data):
        return self.tokenizer(data["Text"],
                     max_length=512,
                     truncation=True,
                     padding="max_length")
    
    def convert(self):
        hg_train_data, hg_test_data = self.encode_categories()

        # Tokenize the dataset
        dataset_train = hg_train_data.map(self.tokenize_dataset)
        dataset_test = hg_test_data.map(self.tokenize_dataset)
        
        # Remove the review and index columns because it will not be used in the model
        dataset_train = dataset_train.remove_columns(["ArticleId", "Text", "Category", "__index_level_0__"])
        dataset_test = dataset_test.remove_columns(["ArticleId", "Text", "Category", "__index_level_0__"])

        # Rename label to labels because the model expects the name labels
        dataset_train = dataset_train.rename_column("encoded_label", "labels")
        dataset_test = dataset_test.rename_column("encoded_label", "labels")

        # Change the format to PyTorch tensors
        dataset_train.set_format("torch")
        dataset_test.set_format("torch")

        # Take a look at the data
        print(dataset_train)
        print(dataset_test)

        dataset_train.save_to_disk(os.path.join(self.config.root_dir,"Train BBC dataset"))
        dataset_test.save_to_disk(os.path.join(self.config.root_dir,"Test BBC dataset"))


In [12]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.convert()
  
except Exception as e:
    raise e

[2023-10-28 20:54:16,764:  INFO: common: yaml file:config\config.yaml loaded successfully]
[2023-10-28 20:54:16,769:  INFO: common: yaml file:params.yaml loaded successfully]
[2023-10-28 20:54:16,773:  INFO: common: created directory at : artifacts]
[2023-10-28 20:54:16,775:  INFO: common: created directory at : artifacts/data_transformation]
Dataset({
    features: ['ArticleId', 'Text', 'Category', 'encoded_label', '__index_level_0__'],
    num_rows: 1192
}) Dataset({
    features: ['ArticleId', 'Text', 'Category', 'encoded_label', '__index_level_0__'],
    num_rows: 298
})


                                                                

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1192
})
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 298
})


                                                                                               