Data Ingestion 

In [1]:
import os
os.chdir("../")

%pwd

'c:\\Projects\\DSML\\ReactorExplorer'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen = True)
class DataIngestionConfig:
    root_dir: Path
    source_url: str
    local_data_file: Path
    unzip_dir: Path


In [None]:
import sys
from reactorexplorer.constants import *
from reactorexplorer.utils.utils import read_yaml, create_directories
from reactorexplorer.exception.exception_handler import AppException

class ConfigurationManager:
    def __init__ (
            self,
            config_file_path = CONFIG_FILE_PATH,
            params_file_path = PARAMS_FILE_PATH,
            schema_file_path = SCHEMA_FILE_PATH):
        try:
            self.config = read_yaml(CONFIG_FILE_PATH)
            self.paramls = read_yaml(PARAMS_FILE_PATH)
            self.schema = read_yaml(SCHEMA_FILE_PATH)

            create_directories([self.config.artifacts_root])

        except Exception as e:
            raise AppException(e, sys) from e 
        
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        try:
            config = self.config.data_ingestion
            create_directories([config.root_dir])

            data_ingestion_config = DataIngestionConfig(
                root_dir = config.root_dir,
                source_url = config.source_url,
                local_data_file = config.local_data_file,
                unzip_dir = config.unzip_dir
            )

            return data_ingestion_config
        
        except Exception as e:
            raise AppException(e, sys) from e 

In [14]:
import os
import sys 
import urllib.request as request
import zipfile
from reactorexplorer.logger.log import logging
from reactorexplorer.exception.exception_handler import AppException
from reactorexplorer.utils.utils import get_size

class DataIngestion:
    def __init__(self, config = DataIngestionConfig):
        try:
            self.config = config
        except Exception as e:
            raise AppException(e, sys) from e
    
    def download_file(self):
        try:
            if not os.path.exists(self.config.local_data_file):
                filename, headers = request.urlretrieve(
                    url = self.config.source_url,
                    filename = self.config.local_data_file
                )

                logging.info(f"file downloaded: {self.config.local_data_file}")
            else:
                logging.info(f"file already exists")

        except Exception as e:
            raise AppException(e, sys) from e 

    def extract_zip_file(self):
        try:
            unzip_path = self.config.unzip_dir
            os.makedirs(unzip_path, exist_ok=True)
            with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
                zip_ref.extractall(unzip_path)
          
        except Exception as e:
            raise AppException(e, sys) from e 

In [18]:
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config = data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except Exception as e:
    raise AppException(e, sys) from e

Data Validation

In [19]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    root_dir: Path
    status_file: str
    unzip_data_dir: Path
    all_schema: str

In [32]:
from reactorexplorer.logger.log import logging
from reactorexplorer.exception.exception_handler import AppException
from reactorexplorer.utils.utils import read_yaml, create_directories

class ConfigurationManager1:
    def __init__ (
            self,
            config_file_path = CONFIG_FILE_PATH,
            params_file_path = PARAMS_FILE_PATH,
            schema_file_path = SCHEMA_FILE_PATH):
        try:
            self.config = read_yaml(CONFIG_FILE_PATH)
            self.paramls = read_yaml(PARAMS_FILE_PATH)
            self.schema = read_yaml(SCHEMA_FILE_PATH)

            create_directories([self.config.artifacts_root])

        except Exception as e:
            raise AppException(e, sys) from e 
        
    def get_data_validation_config(self) -> DataValidationConfig:
        try:
            config = self.config.data_validation
            schema = self.schema.COLUMNS

            create_directories([config.root_dir])

            data_validation_config = DataValidationConfig(
                root_dir = config.root_dir,
                status_file = config.status_file,
                unzip_data_dir = config.unzip_data_dir,
                all_schema = schema
            )

            return data_validation_config

        except Exception as e:
            raise AppException(e, sys) from e
        

In [40]:
import os
import sys
import pandas as pd

from reactorexplorer.logger.log import logging
from reactorexplorer.exception.exception_handler import AppException

class DataValidation:
    def __init__(self, config = DataValidationConfig):
        try:
            self.config = config
        except Exception as e:
            raise AppException(e, sys) from e

    def validate_all_columns(self) -> bool:
        try:

            data = pd.read_csv(self.config.unzip_data_dir)
            all_cols = list(data.columns)
            all_schema = self.config.all_schema.keys()

            for col in all_schema:
                if col not in all_cols:
                    logging.info(f"missing column: {col}")
                    return False
                
            logging.info(f"all columns present")
            return True
        
        except Exception as e:
           raise AppException(e, sys) from e
        
    def validate_status(self):
        try:

            is_valid = self.validate_all_columns()
            
            with open(self.config.status_file, "w") as f:
                f.write(f"validation status: {is_valid}\n")

        except Exception as e:
            raise AppException(e, sys) from e


In [39]:
try:
    config = ConfigurationManager1()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config = data_validation_config)
    data_validation.validate_status()


except Exception as e:
    raise AppException(e, sys) from e 

  data = pd.read_csv(self.config.unzip_data_dir)


Data Transformation

In [46]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    clean_data_dir: Path
    serialized_objects_dir: Path

In [54]:
from reactorexplorer.logger.log import logging
from reactorexplorer.exception.exception_handler import AppException
from reactorexplorer.utils.utils import read_yaml, create_directories

class ConfigurationManager2:
    def __init__ (
            self,
            config_file_path = CONFIG_FILE_PATH,
            params_file_path = PARAMS_FILE_PATH,
            schema_file_path = SCHEMA_FILE_PATH):
        try:
            self.config = read_yaml(CONFIG_FILE_PATH)
            self.paramls = read_yaml(PARAMS_FILE_PATH)
            self.schema = read_yaml(SCHEMA_FILE_PATH)

            create_directories([self.config.artifacts_root])

        except Exception as e:
            raise AppException(e, sys) from e 
        
    def get_data_transformation_config(self) -> DataTransformationConfig:
        try:
            config = self.config.data_transformation

            create_directories([config.root_dir])

            data_transformation_config = DataTransformationConfig(
                root_dir = config.root_dir,
                data_path = config.data_path,
                clean_data_dir = config.clean_data_dir,
                serialized_objects_dir = config.serialized_objects_dir
            )

            return data_transformation_config
        
        except Exception as e:
            raise AppException(e, sys) from e         

In [72]:
import os
import sys
import ast
import pandas as pd
import pickle

from reactorexplorer.logger.log import logging
from reactorexplorer.exception.exception_handler import AppException
from reactorexplorer.utils.utils import read_yaml, create_directories

class DataTransformation:
    def __init__(self, config = DataTransformationConfig):
        try:
            self.config = config 
          
        except Exception as e:
            raise AppException(e, sys) from e


    def transform_data(self):
        try:
            # Read the raw dataset
            raw_data = pd.read_csv(self.config.data_path, on_bad_lines='skip', encoding='latin-1')

            logging.info(f"shape of raw_data: {raw_data.shape}")

            # perform requisite data transformations
            raw_data = raw_data[['country','country_long','name','capacity_mw', 'latitude', 'longitude', 'primary_fuel']]

            raw_data = raw_data[raw_data['primary_fuel'].isin(['Nuclear', 'Oil', 'Hydro', 'Coal', 'Solar']) ]

            clean_data = raw_data.drop_duplicates(['name'])

            logging.info(f"shape of clean data: {clean_data.shape}")

            data_pivot = clean_data.pivot(columns = 'primary_fuel', index = 'name', values = 'capacity_mw')
            data_pivot.fillna(0, inplace= True)

            data_names = data_pivot.index

            # save the transformed dataset
            os.makedirs(self.config.clean_data_dir, exist_ok = True)
            clean_data.to_csv(os.path.join(self.config.clean_data_dir, 'clean_data.csv'), index = False)
            data_pivot.to_csv(os.path.join(self.config.clean_data_dir, 'data_pivot.csv'), index = False)
            logging.info(f"clean data saved to : {self.config.clean_data_dir}")

            # save the clean data as a serialized object
            os.makedirs(self.config.serialized_objects_dir, exist_ok=True)
            pickle.dump(clean_data, open(os.path.join(self.config.serialized_objects_dir, 'clean_data.pkl'), 'wb'))
            pickle.dump(data_pivot, open(os.path.join(self.config.serialized_objects_dir, 'data_pivot.pkl'), 'wb'))
            pickle.dump(data_pivot, open(os.path.join(self.config.serialized_objects_dir, 'data_names.pkl'), 'wb'))
            logging.info(f"saved clean data serialized object to {self.config.serialized_objects_dir}")

          
        except Exception as e:
            raise AppException(e, sys) from e

In [73]:
try:
    config = ConfigurationManager2()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config = data_transformation_config)
    data_transformation.transform_data()

except Exception as e:
    raise AppException(e, sys) from e

  raw_data = pd.read_csv(self.config.data_path, on_bad_lines='skip', encoding='latin-1')


In [81]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen = True)
class ModelTrainerConfig:
    root_dir: Path
    trained_model_name: str
    serialized_objects_dir: Path
    data_pivot_name: str
    data_names_name: str

In [84]:
from reactorexplorer.logger.log import logging
from reactorexplorer.exception.exception_handler import AppException
from reactorexplorer.utils.utils import read_yaml, create_directories

class ConfigurationManager3:
    def __init__ (
            self,
            config_file_path = CONFIG_FILE_PATH,
            params_file_path = PARAMS_FILE_PATH,
            schema_file_path = SCHEMA_FILE_PATH):
        try:
            self.config = read_yaml(CONFIG_FILE_PATH)
            self.paramls = read_yaml(PARAMS_FILE_PATH)
            self.schema = read_yaml(SCHEMA_FILE_PATH)

            create_directories([self.config.artifacts_root])

        except Exception as e:
            raise AppException(e, sys) from e 
        
    def get_model_trainer_config(self) -> ModelTrainerConfig:
        try:
            config = self.config.mode_trainer

            create_directories([config.root_dir])

            model_trainer_config = ModelTrainerConfig(
                root_dir = config.root_dir,
                trained_model_name = config.trained_model_name,
                serialized_objects_dir = config.serialized_objects_dir,
                data_pivot_name = config.data_pivot_name,
                data_names_name = config.data_names_name
            )

            return model_trainer_config
        
        except Exception as e:
            raise AppException(e, sys) from e         
        

In [85]:
import os
import sys
import pickle
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csc_matrix

from reactorexplorer.logger.log import logging
from reactorexplorer.exception.exception_handler import AppException
from reactorexplorer.utils.utils import read_yaml, create_directories

class ModelTrainer:
    def __init__(self, config = ModelTrainerConfig):
        try:
            self.config = config
        except Exception as e:
            raise AppException(e, sys) from e
        
    def train(self):
        try:
            # load pivot data
            data_pivot = pickle.load(open(os.path.join(self.config.serialized_objects_dir, self.config.data_pivot_name), "rb"))
            data_sparse = csc_matrix(data_pivot)

            model = NearestNeighbors(algorith='brute')
            model.fit(data_sparse)

            # save the model
            os.makedirs(self.config.root_dir, exist_ok=True)
            file_name = os.path.join(self.config.serialized_objects_dir, self.config.trained_model_name)
            pickle.dump(model, open(file_name, 'wb'))
            logging.info(f"saved model to {file_name}")
        except Exception as e:
            raise AppException(e, sys) from e 


In [None]:
try:

except Exception as e:
    raise AppException(e, sys) from e 