## Data Transformation

In [1]:
import pandas as pd
import re
import os
import sys

In [2]:
%pwd

'c:\\Personal AI Projects\\FORAGE JOB SIMULATIONS\\British Airline Data Science Virtual Internship\\customer-reviews-analysis\\trials'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Personal AI Projects\\FORAGE JOB SIMULATIONS\\British Airline Data Science Virtual Internship\\customer-reviews-analysis'

#### src/reviewAnalyzer/entity/config_entity.py

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of settings consumed by the data-transformation stage."""

    # Directory that is created for this stage before any work is done.
    root_dir: Path
    # Location of the CSV file that the stage reads as its input.
    source_data_path: str
    # Location the transformed CSV is written to (and checked for on re-runs).
    local_data_path: Path
    # Name of the existing column whose text is searched to derive the new one.
    feature_for_new_feature: str
    # Pattern looked for inside ``feature_for_new_feature``.
    feature_split_string: str
    # Name of the column that stores the result of that search.
    new_feature: str

#### src/reviewAnalyzer/config/configuration.py

In [6]:
from src.reviewAnalyzer.constants import *
from src.reviewAnalyzer.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Read the project YAML configuration and hand out per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Load the YAML file and make sure the artifacts root directory exists.

        Args:
            config_filepath: Path of the YAML configuration file to read.
        """
        self.config = read_yaml(config_filepath)
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        The stage's ``root_dir`` is created as a side effect before the
        settings are returned.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the loaded configuration.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            source_data_path=section.source_data_path,
            local_data_path=section.local_data_path,
            feature_for_new_feature=section.feature_for_new_feature,
            feature_split_string=section.feature_split_string,
            new_feature=section.new_feature,
        )



#### src/reviewAnalyzer/components/data_transformation.py

In [8]:
from src.reviewAnalyzer import logger
from src.reviewAnalyzer.utils.common import get_size
from src.reviewAnalyzer.entity.config_entity import DataTransformationConfig
from pathlib import Path

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to C:\Users\Tito
[nltk_data]     Osadebey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Tito
[nltk_data]     Osadebey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Tito
[nltk_data]     Osadebey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
class DataTransformation:
    """Load the raw reviews CSV and provide the text-preprocessing steps.

    The per-document methods (``clean_data``, ``process_data``,
    ``lemmatiza``) are written to be passed to ``Series.apply``.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Store the configuration, prepare NLTK helpers and load the data.

        Args:
            config: Settings for this stage (paths and feature names).
        """
        self.config = config
        self.file = self.config.source_data_path
        self.lemmatizer = WordNetLemmatizer()
        # Built once here: reloading the stop-word list for every token
        # dominated the run time of process_data.
        self.stop_words = frozenset(stopwords.words('english'))
        self.df = self.load_data()

    def load_data(self):
        """Read the source CSV, using its first column as the index.

        Returns:
            The loaded ``pandas.DataFrame``.
        """
        print("Loading source data file...")
        df = pd.read_csv(self.file, index_col=0)
        logger.info(f"Data file ({self.file}) loaded")
        return df

    def create_feature(self):
        """Add a column flagging rows whose source text matches a pattern.

        The new column ``config.new_feature`` holds the result of
        ``str.contains(config.feature_split_string)`` applied to the
        ``config.feature_for_new_feature`` column. Note that
        ``str.contains`` interprets the pattern as a regular expression
        by default.

        Returns:
            ``self.df`` with the new column added in place.
        """
        source_column = self.config.feature_for_new_feature
        target_column = self.config.new_feature
        print(f"Creating feature ({self.config.new_feature}) from ({self.config.feature_for_new_feature})")
        self.df[target_column] = self.df[source_column].str.contains(self.config.feature_split_string)
        logger.info(f"Feature ({self.config.new_feature}) created")
        return self.df

    def clean_data(self, document):
        """Strip the prefix before the first ``|`` and normalise the text.

        Everything up to and including the first ``|`` is dropped; when the
        document contains no ``|`` the whole text is kept instead of
        failing. Non-letter characters become spaces, the text is
        lower-cased and runs of whitespace are collapsed to single spaces.

        Args:
            document: Raw review text.

        Returns:
            The cleaned, lower-case, single-space-separated text.
        """
        _, separator, remainder = document.partition('|')
        # No separator means there is no prefix to remove.
        text = remainder if separator else document
        text = re.sub(r'[^a-zA-Z]', ' ', str(text)).lower()
        return " ".join(text.split())

    def process_data(self, document):
        """Tokenize, POS-tag and remove stop words from a cleaned document.

        Args:
            document: Cleaned review text.

        Returns:
            A list of ``(word, wordnet_pos)`` tuples for every token that
            is not an English stop word. ``wordnet_pos`` is ``None`` when
            the tag's first letter is not one of J, V, N or R.
        """
        # First letter of the tag -> WordNet part-of-speech constant.
        tag_map = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'N': wordnet.NOUN, 'R': wordnet.ADV}
        tagged_tokens = pos_tag(word_tokenize(document))
        return [
            (word, tag_map.get(tag[0]))
            for word, tag in tagged_tokens
            if word.lower() not in self.stop_words
        ]

    def lemmatiza(self, document):
        """Lemmatize tagged tokens and join them into one string.

        Args:
            document: Iterable of ``(word, wordnet_pos)`` pairs as produced
                by ``process_data``.

        Returns:
            The lemmas joined by single spaces. The result keeps the
            historical two-space prefix (a lone ``" "`` for an empty
            input) so previously generated output stays comparable.
        """
        lemmas = [
            # Words without a WordNet tag are passed through unchanged.
            self.lemmatizer.lemmatize(word, pos=tag) if tag else word
            for word, tag in document
        ]
        return " " + "".join(" " + lemma for lemma in lemmas)

    def save_data(self, document: pd.DataFrame):
        """Write the transformed frame to ``config.local_data_path`` as CSV.

        Args:
            document: The DataFrame to persist.
        """
        document.to_csv(self.config.local_data_path)
        logger.info(f"Preprocessed file location: {self.config.local_data_path}")


#### src/reviewAnalyzer/pipeline/stage_02_data_transformation.py

In [10]:
class DataTransformationPipeline:
    """Run the data-transformation stage from configuration to saved CSV."""

    def __init__(self):
        pass

    def main(self):
        """Transform the reviews unless the output file already exists.

        When the configured output path is present, only its size is
        logged. Otherwise the stage creates the configured feature, cleans
        the ``reviews`` column, POS-tags and filters the tokens, lemmatizes
        them into a ``corpus`` column and saves the result.
        """
        manager = ConfigurationManager()
        stage_config = manager.get_data_transformation_config()
        output_path = stage_config.local_data_path

        # Nothing to do on a re-run: report the existing file and stop.
        if os.path.exists(output_path):
            logger.info(f"File already exists of size: {get_size(Path(output_path))}")
            return

        transformer = DataTransformation(config=stage_config)
        frame = transformer.create_feature()

        print("Cleaning data file...")
        frame['cleaned_reviews'] = frame['reviews'].apply(transformer.clean_data)
        logger.info("Data file cleaned")

        logger.info("Part of Speech Tagging...")
        for step in ("Tokenizing...", "Part of Speech Tagging...", "Removing stopwords..."):
            print(step)
        frame['pos_tagged'] = frame['cleaned_reviews'].apply(transformer.process_data)
        logger.info("Part of Speech Tagging done.")

        print("Word lemmatizing...")
        frame['corpus'] = frame['pos_tagged'].apply(transformer.lemmatiza)
        logger.info("Word lemmatizing done and corpus feature created.")

        transformer.save_data(frame)


### main.py

In [11]:
from src.reviewAnalyzer.exceptions import CustomException

In [12]:
# Human-readable label used in the start/finish log lines below.
STAGE_NAME = "Data Transformation Stage"

try:
    logger.info("*******************")
    logger.info(f">>>>>> {STAGE_NAME} started <<<<<<")
    data_transformation = DataTransformationPipeline()
    data_transformation.main()
    # The embedded "]" / "[" appear intended to pair with the square brackets
    # the log format puts around each message (see the captured output).
    logger.info(f">>>>>> {STAGE_NAME} completed <<<<<<]\n\n[x==========x")
except Exception as exc:
    # Record the traceback, then surface the failure as the project's
    # exception type.
    logger.exception(exc)
    raise CustomException(exc, sys)

[2024-07-09 20:36:42,255: INFO: 3954784042: *******************]
[2024-07-09 20:36:42,256: INFO: 3954784042: >>>>>> Data Transformation Stage started <<<<<<]
[2024-07-09 20:36:42,267: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-09 20:36:42,271: INFO: common: created directory at: artifacts]
[2024-07-09 20:36:42,273: INFO: common: created directory at: artifacts/data_transformation]
Loading source data file...
[2024-07-09 20:36:42,462: INFO: 462053332: Data file (artifacts/data_ingestion/data.csv) loaded]
Creating feature (verified) from (reviews)
[2024-07-09 20:36:42,481: INFO: 462053332: Feature (verified) created]
Cleaning data file...
[2024-07-09 20:36:42,629: INFO: 1634043365: Data file cleaned]
[2024-07-09 20:36:42,631: INFO: 1634043365: Part of Speech Tagging...]
Tokenizing...
Part of Speech Tagging...
Removing stopwords...
[2024-07-09 20:38:01,960: INFO: 1634043365: Part of Speech Tagging done.]
Word lemmatizing...
[2024-07-09 20:38:02,361: INFO: 16