In [1]:
import os

In [2]:
%pwd

'c:\\DataScience\\Projects\\Next_Word_Predictor\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\DataScience\\Projects\\Next_Word_Predictor'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    Data_path: Path
    padding: str
    tokenizer_name: str

In [6]:
from NWPproject.constants import *
from NWPproject.utils.common import read_yaml_file, create_directories, update_nested_yaml

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml_file(config_filepath)
        self.params = read_yaml_file(params_filepath)

        create_directories([self.config.artifacts_root])

    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        padding_params=self.params.pad_sequences

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            Data_path=config.Data_path,
            padding=padding_params.padding,
            tokenizer_name=config.tokenizer_name
        )

        return data_transformation_config

In [8]:
import tensorflow as tf
import numpy as np
import os
import joblib
from NWPproject import constants
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [9]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
    
    def data_transformation(self):
        with open(self.config.Data_path, 'r') as f:
            data = f.read()

        tokenizer=Tokenizer()

        tokenizer.fit_on_texts([data])

        joblib.dump(tokenizer, os.path.join(self.config.root_dir, self.config.tokenizer_name))

        my_dict = tokenizer.word_index

        last_key = list(my_dict.keys())[-1]
        num_classes = my_dict[last_key]

        input_seq=[]
        for sentence in data.split("\n"):
            tokenized_sentence=tokenizer.texts_to_sequences([sentence])[0]
            for i in range(1,len(tokenized_sentence)):
                n_gram=tokenized_sentence[:i+1]
                input_seq.append(n_gram)

        max_len=max([len(i) for i in input_seq])
        constants.MAX_LEN=max_len
        print(constants.MAX_LEN)
        # self.config.padding="post"

        update_nested_yaml('params.yaml', ['to_categorical', 'num_classes'], num_classes+1)
        update_nested_yaml('params.yaml', ['Dense', 'units_dense'], num_classes+1)
        update_nested_yaml('params.yaml', ['Embedding', 'input_dim'], num_classes+1)
        update_nested_yaml('params.yaml', ['Embedding', 'input_length'], (constants.MAX_LEN)-1)

        padded_input_seq=pad_sequences(input_seq,maxlen=constants.MAX_LEN,padding=self.config.padding)
        print("uhwuh",padded_input_seq)

        X=padded_input_seq[:,:-1]
        Y=padded_input_seq[:,-1]

        print(X.shape)
        print(Y.shape)

        y=to_categorical(Y,num_classes=num_classes+1)

        print(type(X))
        print(type(y))

        print(X.shape)
        print(y.shape)

        file_path=os.path.join(self.config.root_dir,"arrays.npz")
        np.savez(file_path, arr1=X, arr2=y, arr3=Y)

        loaded_arrays = np.load(file_path)

        loaded_array1 = loaded_arrays['arr1']
        loaded_array2 = loaded_arrays['arr2']

        print("Loaded array1:", type(loaded_array1))
        print("Loaded array2:", type(loaded_array2))

        print(loaded_array1.shape)
        print(loaded_array2.shape)
        

In [10]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.data_transformation()
except Exception as e:
    raise e

[2024-07-03 21:30:35,687: INFO:common: yaml file: config\config.yaml loaded successfully]
[2024-07-03 21:30:35,687: INFO:common: yaml file: params.yaml loaded successfully]
[2024-07-03 21:30:35,687: INFO:common: createD Directory at:artifacts]
[2024-07-03 21:30:35,687: INFO:common: createD Directory at:artifacts/data_transformation]
25
uhwuh [[  0   0   0 ...   0   1  18]
 [  0   0   0 ...   1  18  80]
 [  0   0   0 ...  18  80  35]
 ...
 [  0   0   0 ...  12  16 395]
 [  0   0   0 ...  16 395 396]
 [  0   0   0 ... 395 396 397]]
(672, 24)
(672,)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(672, 24)
(672, 398)
Loaded array1: <class 'numpy.ndarray'>
Loaded array2: <class 'numpy.ndarray'>
(672, 24)
(672, 398)
