In [1]:
import os

In [2]:
# Show the kernel's current working directory before it is changed.
%pwd

'c:\\DataScience\\Projects\\Next_Word_Predictor\\research'

In [3]:
# Move the working directory up one level so the relative paths used by later
# cells resolve from the parent folder (the project root in the recorded run).
# NOTE: this is not idempotent - every re-run of this cell moves up one more
# level, so run it exactly once per kernel session.
os.chdir("../")

In [4]:
# Confirm that the working directory change took effect.
%pwd

'c:\\DataScience\\Projects\\Next_Word_Predictor'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of settings consumed by the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory that receives the artifacts written by this stage.
    root_dir: Path
    # Location of the text file that is read and tokenized.
    Data_path: Path
    # Forwarded as the ``padding`` argument of ``pad_sequences``.
    padding: str
    # Forwarded as the ``num_classes`` argument of ``to_categorical``.
    num_classes: int

In [6]:
from NWPproject.constants import *
from NWPproject.utils.common import read_yaml_file, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        self.config = read_yaml_file(config_filepath)
        self.params = read_yaml_file(params_filepath)

        # The artifacts root is created up front, before any stage is configured.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Reads the ``data_transformation`` section of the config file together
        with the ``pad_sequences`` and ``to_categorical`` sections of the
        params file, and creates the stage's output directory.

        Returns:
            DataTransformationConfig: Settings for the stage.
        """
        stage_cfg = self.config.data_transformation
        pad_params = self.params.pad_sequences
        categorical_params = self.params.to_categorical

        create_directories([stage_cfg.root_dir])

        return DataTransformationConfig(
            root_dir=stage_cfg.root_dir,
            Data_path=stage_cfg.Data_path,
            padding=pad_params.padding,
            num_classes=categorical_params.num_classes,
        )

In [8]:
import tensorflow as tf
import numpy as np
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [9]:
class DataTransformation:
    """Convert the raw text corpus into padded n-gram training arrays."""

    def __init__(self, config: DataTransformationConfig):
        """Store the stage settings.

        Args:
            config: Settings that provide ``Data_path``, ``root_dir``,
                ``padding`` and ``num_classes``.
        """
        self.config = config

    def data_transformation(self):
        """Tokenize the corpus, build (context, next-word) pairs and save them.

        Every line of the corpus is expanded into all of its prefixes of
        length >= 2. The last token of each prefix becomes the label and the
        preceding tokens become the (padded) input. The inputs and the one-hot
        labels are written to ``arrays.npz`` inside ``config.root_dir`` under
        the keys ``arr1`` and ``arr2``; the file is then read back and the
        shapes are printed as a sanity check.

        Returns:
            None

        Raises:
            ValueError: If no line of the corpus yields at least two tokens,
                so that no training pair can be built.
        """
        # Explicit encoding: the default is platform-dependent (e.g. cp1252 on
        # Windows), which makes the resulting vocabulary machine-specific.
        with open(self.config.Data_path, 'r', encoding='utf-8') as f:
            data = f.read()

        tokenizer = Tokenizer()
        tokenizer.fit_on_texts([data])

        # Tokenize all lines in a single call instead of once per line.
        tokenized_lines = tokenizer.texts_to_sequences(data.split("\n"))

        # Every prefix with at least two tokens is one training sample.
        input_seq = [
            tokens[:end]
            for tokens in tokenized_lines
            for end in range(2, len(tokens) + 1)
        ]

        if not input_seq:
            raise ValueError(
                f"No line in {self.config.Data_path} contains at least two "
                "tokens; cannot build any training sequence."
            )

        max_len = max(len(seq) for seq in input_seq)

        # Split off the label BEFORE padding. Slicing the last column of an
        # already padded matrix is only correct for padding='pre'; with
        # padding='post' that column holds the pad value 0 for every sequence
        # shorter than max_len. For 'pre' the arrays are identical to slicing
        # after padding.
        contexts = [seq[:-1] for seq in input_seq]
        X = pad_sequences(contexts, maxlen=max_len - 1, padding=self.config.padding)
        Y = np.array([seq[-1] for seq in input_seq], dtype=X.dtype)

        print(X.shape)
        print(Y.shape)

        # num_classes comes from params; it has to exceed the largest token id
        # present in Y for the one-hot encoding to succeed.
        y = to_categorical(Y, num_classes=self.config.num_classes)

        print(type(X))
        print(type(y))

        print(X.shape)
        print(y.shape)

        file_path = os.path.join(self.config.root_dir, "arrays.npz")
        np.savez(file_path, arr1=X, arr2=y)

        # Read the archive back as a sanity check. The context manager closes
        # the underlying file handle, which otherwise stays open and can block
        # overwriting the file on a later run.
        with np.load(file_path) as loaded_arrays:
            loaded_array1 = loaded_arrays['arr1']
            loaded_array2 = loaded_arrays['arr2']

        print("Loaded array1:", type(loaded_array1))
        print("Loaded array2:", type(loaded_array2))

        print(loaded_array1.shape)
        print(loaded_array2.shape)
        

In [10]:
# Run the data-transformation stage end to end. Any exception propagates
# unchanged; the previous `except Exception as e: raise e` wrapper handled
# nothing and only added an extra frame to the traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.data_transformation()

[2024-06-29 19:51:43,752: INFO:common: yaml file: config\config.yaml loaded successfully]
[2024-06-29 19:51:43,758: INFO:common: yaml file: params.yaml loaded successfully]
[2024-06-29 19:51:43,759: INFO:common: createD Directory at:artifacts]
[2024-06-29 19:51:43,760: INFO:common: createD Directory at:artifacts/data_transformation]
(379, 39)
(379,)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(379, 39)
(379, 221)
Loaded array1: <class 'numpy.ndarray'>
Loaded array2: <class 'numpy.ndarray'>
(379, 39)
(379, 221)
