# In [10]:
import os
import shutil
import os
import sys
from multiprocessing import Pool
import pandas as pd
pd.options.mode.chained_assignment = None
import glob
import re
import multiprocessing 

# In [11]:
class Utils:
    """Namespace of shared constants and stateless helper functions.

    Every helper is a ``@staticmethod``; the class is never instantiated and
    only groups configuration values with small filesystem / pandas / text
    utilities.
    """

    # Root data directory, resolved relative to the current working directory
    # at the moment this class body is executed.
    DATA_DIR = os.path.abspath("../storage/")
    DECOMPILER_OUTPUT_DIR = os.path.join(DATA_DIR, 'decompiled-bytecodes')
    DECOMPILER_TIMEOUT = 3600  # presumably seconds - TODO confirm against the caller
    STUDY_START_DATE = "2015-01-01"
    STUDY_END_DATE = "2022-09-01"

    BQ_KEY_PATH = 'lateral-command-433401-d4-89aa899f9420.json'
    # Project id is the key file name with its last "-<suffix>.json" segment dropped.
    BQ_PROJECT_ID = BQ_KEY_PATH.rsplit("-", 1)[0]
    BQ_STORAGE_PROXY_DETECTOR = 'storage_dynamic_proxy_detector'
    BQ_STORAGE_BYTECODES = 'storage_bytecodes'
    # Use 75% of the available cores (e.g. 80 * .75 = 60) but never fewer than
    # one: on a single-core host int(1 * 0.75) would be 0, which
    # multicore_read_csv rejects as an invalid worker count.
    CORE_COUNT = max(1, int(multiprocessing.cpu_count() * 0.75))

    # Compiled once at class creation instead of on every escape_ansi() call.
    _ANSI_ESCAPE_RE = re.compile(r'(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]')

    @staticmethod
    def create_directory(directory_path, override=False):
        """
        Ensure that a directory exists at ``directory_path``.

        If the path does not exist, the directory (including missing parents)
        is created. If it already exists and ``override`` is true, it is
        removed together with its contents and recreated empty. If it already
        exists and ``override`` is false, nothing happens.

        Parameters:
        directory_path (str): The path of the directory to create or recreate.
        override (bool): When true, an existing directory is deleted and
            recreated empty. Defaults to False.
        """
        already_exists = os.path.exists(directory_path)

        # Existing directory that must be kept: nothing to do.
        if already_exists and not override:
            return

        if already_exists:
            # Remove the directory and all its contents
            shutil.rmtree(directory_path)
            print(f"Removed existing directory: {directory_path}")

        # exist_ok guards against another process creating the directory
        # between the existence check above and this call.
        os.makedirs(directory_path, exist_ok=True)
        print(f"Created new directory: {directory_path}")

    @staticmethod
    def multicore_read_csv(target_dir, num_cores=CORE_COUNT):
        """
        Read multiple CSV files from the specified directory using multiple cores.

        Only files directly inside ``target_dir`` that match ``*.csv`` are
        read. Files are processed in sorted path order so the row order of the
        result is reproducible between runs.

        Args:
            target_dir (str): The directory containing the CSV files.
            num_cores (int): The number of worker processes to use for reading
                the files. No more workers than files are started.

        Returns:
            pd.DataFrame: A concatenated DataFrame (with a fresh index)
            containing data from all CSV files.

        Raises:
            ValueError: If target_dir is not a directory or num_cores is not a positive integer.
            FileNotFoundError: If no CSV files are found in the target directory.
        """
        if not os.path.isdir(target_dir):
            raise ValueError(f"The specified target directory '{target_dir}' is not a valid directory.")

        if not isinstance(num_cores, int) or num_cores <= 0:
            raise ValueError("The number of cores must be a positive integer.")

        # Ensure the target directory path ends with a slash
        if not target_dir.endswith('/'):
            target_dir += '/'

        # Escape the directory part so glob metacharacters in its name
        # ('[', '*', '?') are matched literally rather than as wildcards.
        batches_path = sorted(glob.glob(glob.escape(target_dir) + "*.csv"))

        if not batches_path:
            raise FileNotFoundError(f"No CSV files found in the target directory '{target_dir}'.")

        print('Reading {} CSV files from {} directory ...'.format(len(batches_path), target_dir))

        # Starting more workers than there are files only adds start-up cost.
        worker_count = min(num_cores, len(batches_path))
        with Pool(worker_count) as p:
            df = pd.concat(p.map(pd.read_csv, batches_path), ignore_index=True)

        return df

    @staticmethod
    def convert_to_dict(dataframe, col1_key, col2_value):
        """
        Convert two columns of a DataFrame into a dictionary with keys from col1_key and values from col2_value.

        Rows are paired positionally. When a key occurs more than once, the
        value from the last such row is kept.

        Args:
            dataframe (pd.DataFrame): The DataFrame containing the data.
            col1_key (str): The column name to use as keys in the dictionary.
            col2_value (str): The column name to use as values in the dictionary.

        Returns:
            dict: A dictionary with keys from col1_key and values from col2_value.

        Raises:
            TypeError: If the input dataframe is not a pandas DataFrame.
            ValueError: If col1_key or col2_value are not columns in the DataFrame.
        """
        if not isinstance(dataframe, pd.DataFrame):
            raise TypeError("The input dataframe must be a pandas DataFrame.")

        if col1_key not in dataframe.columns:
            raise ValueError(f"Column '{col1_key}' is not in the DataFrame.")

        if col2_value not in dataframe.columns:
            raise ValueError(f"Column '{col2_value}' is not in the DataFrame.")

        return dict(zip(dataframe[col1_key], dataframe[col2_value]))

    @staticmethod
    def escape_ansi(line):
        """
        Remove ANSI CSI escape sequences from a string.

        Args:
            line (str): Text that may contain escape sequences introduced by
                ``ESC [`` or the single character ``\\x9B``.

        Returns:
            str: ``line`` with every matched escape sequence removed.
        """
        return Utils._ANSI_ESCAPE_RE.sub('', line)
