In [1]:
import os
os.chdir("/Users/vanshbansal/Desktop/FuelGrowth")
%pwd


'/Users/vanshbansal/Desktop/FuelGrowth'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class ResultTransformationConfig:
    influencer_avg_main_dir: Path
    influence_total_score_dir: Path
    urls_score_main_dir: Path
    

In [4]:
from src.constants import *
from src.utils.common import read_yaml , write_yaml , create_directories
class ConfigurationManager:
    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                ):
        
        self.config=read_yaml(config_filepath)

        


    def get_result_transformation_config(self)-> ResultTransformationConfig:
        
        transformation_config = self.config.result_transformation
        
        create_directories([transformation_config.influencer_avg_main_dir])
        create_directories([transformation_config.influencer_avg_main_dir])
        create_directories([transformation_config.influencer_avg_main_dir])

        result_transformation_config = ResultTransformationConfig(
            influencer_avg_main_dir = transformation_config.influencer_avg_main_dir,
            influence_total_score_dir = transformation_config.influence_total_score_dir,
            urls_score_main_dir = transformation_config.urls_score_main_dir
        )

        return result_transformation_config

        

In [None]:
import pandas as pd
import os
import cv2
import numpy as np
from urllib.request import urlretrieve
from imagehash import phash
from PIL import Image
from datetime import datetime
from src import logger
import shutil

# Directory to store temporary frames



class ResultTransformation:
    def __init__(self , config:ResultTransformationConfig):
        try:
            self.config = config
        except Exception as e:
            raise e
    
    def is_directory_empty(self , directory_path):
        """Check if a directory is empty."""
        if os.path.exists(directory_path) and os.path.isdir(directory_path):
            return len(os.listdir(directory_path)) == 0
        else:
            raise FileNotFoundError(f"The directory '{directory_path}' does not exist or is not a directory.")
            

    
    def transform_url_data(self , new_urls_path):
        main_url_dir = self.config.urls_score_main_dir
        
        # If no files exist in the dir , we will just copy the latest clean_url file.
        # This means we are running the pipeline for the first time.
        if self.is_directory_empty(main_url_dir):
            
            destination_file_path = os.path.join(main_url_dir, "clean_data.xlsx")
            logger.info(f"Directory is empty, saving new urls to path: {destination_file_path}")
            shutil.copy(new_urls_path, destination_file_path)
            logger.info(f"File saved at path: {destination_file_path}")

            return destination_file_path
        
        # Data/urls already exist at that path
        else:
            logger.info("Trying to merge existing and current urls data...")
            main_url_dir = self.config.urls_score_main_dir
            main_file_path = os.path.join(main_url_dir , "clean_data.xlsx")
            curr_file_path = new_urls_path
            
            df_main = pd.read_excel(main_file_path)
            df_curr = pd.read_excel(curr_file_path)

            # Concatenate the two DataFrames
            df_combined = pd.concat([df_main, df_curr])

            # Drop duplicates based on the 'URL' column
            df_combined = df_combined.drop_duplicates(subset='url', keep='first').reset_index(drop=True)

            # Save the resulting DataFrame if needed
            df_combined.to_excel(main_file_path)

            logger.info("existing and current urls data merged successfully !!!")

            return main_file_path
    

    def create_and_save_final_inf_data(self , main_total_score_path):

        avg_score_file_dir = self.config.influencer_avg_main_dir
        avg_score_file_path = os.path.join(avg_score_file_dir , "clean_data.xlsx")
        
        logger.info(f"Reading df at path: {main_total_score_path}")
        df = pd.read_excel(main_total_score_path)
        df['avg_score'] = df['total_score']/df['no_of_occurance']
        df = df.sort_values(by='avg_score', ascending=False)

        df.to_excel(avg_score_file_path)
        
        logger.info(f"Final avg data saved to path: {avg_score_file_path}")

        return main_total_score_path
        


    def transform_total_score_data(self , new_inf_total_score_path):
        main_total_score_dir = self.config.influence_total_score_dir
        logger.info(f"new total score data is at path: {new_inf_total_score_path}")


        # If no files exist in the dir , we will just copy the latest influencer total score file.
        # This means we are running the pipeline for the first time.
        if self.is_directory_empty(main_total_score_dir):
            
            destination_file_path = os.path.join(main_total_score_dir, "clean_data.xlsx")
            
            logger.info(f"Directory is empty, saving new total_score data to path: {destination_file_path}")
            shutil.copy(new_inf_total_score_path, destination_file_path)
            logger.info(f"File saved at path: {destination_file_path}")



            # Saving final results
            return self.create_and_save_final_inf_data(destination_file_path)
        else:
            logger.info("A file for influencer total score data already exists")


            main_total_score_dir = self.config.influencer_avg_main_dir
            main_total_score_path = os.path.join(main_total_score_dir , "clean_data.xlsx")
            curr_total_score_path = new_inf_total_score_path

            logger.info("Combining both new and existing data")
            df_main = pd.read_excel(main_total_score_path)
            df_curr = pd.read_excel(curr_total_score_path)

            # Concatenate the two dataframes
            df_combined = pd.concat([df_main, df_curr])

            # Group by 'hash' and sum 'total_score'
            df_combined = df_combined.groupby('hash', as_index=False).agg({
                'serial_num' : 'first',
                'image_path': 'first',
                'total_score': 'sum',  # Sum scores
                'no_of_occurance': 'sum',       # Keep the first occurrence
                'recent_occurance': 'max'       # Keep the max value

            })

            # Saving the combined dataframe
            df_combined.to_excel(main_total_score_path)
            logger.info(f"Combined data saved to path: {main_total_score_path}")
            

            # Saving final results
            return self.create_and_save_final_inf_data(main_total_score_path)



In [8]:
try:
    config = ConfigurationManager()
    result_transform_config = config.get_result_transformation_config()
    result_transform = ResultTransformation(config=result_transform_config)

    final_url_path = result_transform.transform_url_data("temp_data/url_score data/clean/2024-12-01_21-47-59.xlsx")
    print(final_url_path)
    
    final_avg_score_path = result_transform.transform_total_score_data("temp_data/model_training_data/2024-12-01_21-48-11.xlsx")
    print(final_avg_score_path)


except Exception as e:
    raise e

[2024-12-01 23:04:37,936: INFO: common: yaml file: config.yaml loaded successfully]
[2024-12-01 23:04:37,965: INFO: common: created directory at: main_data/influencer_avg_data]
[2024-12-01 23:04:37,967: INFO: common: created directory at: main_data/influencer_avg_data]
[2024-12-01 23:04:37,968: INFO: common: created directory at: main_data/influencer_avg_data]
[2024-12-01 23:04:37,972: INFO: 696704311: Directory is empty, saving new urls to path: main_data/urls_score data/clean_data.xlsx]
[2024-12-01 23:04:37,981: INFO: 696704311: File saved at path: main_data/urls_score data/clean_data.xlsx]
main_data/urls_score data/clean_data.xlsx
[2024-12-01 23:04:37,981: INFO: 696704311: Current total score data is at path: temp_data/model_training_data/2024-12-01_21-48-11.xlsx]
[2024-12-01 23:04:37,982: INFO: 696704311: Directory is empty, saving new total_score data to path: main_data/model_training_data/clean_data.xlsx]
[2024-12-01 23:04:37,985: INFO: 696704311: File saved at path: main_data/mo