In [None]:
import os
import shutil
import os
import sys
from multiprocessing import Pool
import pandas as pd
pd.options.mode.chained_assignment = None
import glob
import re
import multiprocessing
import sqlite3

In [None]:
import sqlite3
import pandas as pd
import os

class SQLiteProxyDict:
    """Read-only, dict-like lookup backed by two columns of a SQLite table.

    The table maps ``key_column`` -> ``value_column``. It can be (re)built from
    a directory of CSV files; afterwards ``obj[key]``, ``key in obj`` and
    ``obj.get(key)`` are answered with an indexed SQL query, so the mapping
    never has to be held in memory.

    Every operation opens its own short-lived connection and closes it again
    (``with sqlite3.connect(...)`` alone only commits/rolls back, it does not
    close), so instances hold no open database handle between calls.

    Known limitation: duplicate keys are removed within each CSV file only. If
    the same key appears in several files, several rows are stored and a lookup
    returns whichever row SQLite yields first.
    """

    def __init__(self, db_path, table_name, key_column, value_column, csv_path=None, force_reload=False):
        """Open (and, if needed, populate) the backing table.

        Parameters:
            db_path (str): Path of the SQLite database file. Its parent
                directory is created when missing.
            table_name (str): Name of the table holding the mapping.
            key_column (str): Column used as dictionary key.
            value_column (str): Column used as dictionary value.
            csv_path (str | None): Directory of ``*.csv`` files used to build
                the table when it is missing or ``force_reload`` is set.
            force_reload (bool): Rebuild the table from ``csv_path`` even if it
                already exists. Has no effect when ``csv_path`` is not given
                and the table already exists.

        Raises:
            LookupError: The table does not exist and no ``csv_path`` was given.
            ValueError / FileNotFoundError: Propagated from CSV population.
        """
        self.db_path = db_path
        self.table_name = table_name
        self.key_column = key_column
        self.value_column = value_column

        db_dir = os.path.dirname(self.db_path)
        if db_dir:
            os.makedirs(db_dir, exist_ok=True)

        # Only probe an existing file: connecting would otherwise create an
        # empty database as a side effect.
        table_exists = os.path.exists(self.db_path) and self._table_exists()

        if force_reload or not table_exists:
            if csv_path:
                print(f"Populating table '{self.table_name}' in database '{self.db_path}' from CSV directory: {csv_path}")
                self._populate_from_csv(csv_path)
            elif not table_exists:
                raise LookupError(f"Database table '{self.table_name}' not found in '{self.db_path}' and no CSV path provided to populate it.")

    def _get_conn(self):
        """Return a new connection to the database file; the caller must close it."""
        return sqlite3.connect(self.db_path)

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a double-quoted SQL identifier with embedded quotes escaped."""
        escaped = str(name).replace('"', '""')
        return f'"{escaped}"'

    def _table_exists(self, conn=None):
        """Return True if the table is listed in ``sqlite_master``.

        Uses ``conn`` when given, otherwise a temporary connection that is
        closed before returning. The name is bound as a parameter so table
        names containing quote characters are handled correctly.
        """
        owns_conn = conn is None
        if owns_conn:
            conn = self._get_conn()
        try:
            row = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
                (self.table_name,),
            ).fetchone()
            return row is not None
        finally:
            if owns_conn:
                conn.close()

    def _fetch_one(self, query, params):
        """Run ``query`` on a temporary connection and return its first row (or None)."""
        conn = self._get_conn()
        try:
            return conn.execute(query, params).fetchone()
        finally:
            conn.close()

    def _load_csv_file(self, conn, csv_file_path, if_exists):
        """Load the key/value columns of one CSV file into the table.

        Loading is best-effort: an empty, malformed or column-less file is
        reported and skipped instead of aborting the whole import.

        Returns:
            bool: True when the file's rows were written, False when skipped.
        """
        try:
            frame = pd.read_csv(csv_file_path)

            if self.key_column not in frame.columns or self.value_column not in frame.columns:
                print(f"Warning: Key column '{self.key_column}' or value column '{self.value_column}' not in {csv_file_path}. Skipping this file for table '{self.table_name}'. Columns found: {list(frame.columns)}")
                return False

            # Keep the first occurrence of each key within this file only.
            pairs = frame[[self.key_column, self.value_column]].drop_duplicates(
                subset=[self.key_column], keep='first'
            )
            pairs.to_sql(self.table_name, conn, if_exists=if_exists, index=False)
        except pd.errors.EmptyDataError:
            print(f"Warning: CSV file {csv_file_path} is empty. Skipping.")
            return False
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the import.
            print(f"Error processing file {csv_file_path}: {e}. Skipping this file.")
            return False

        print(f"Loaded data from {csv_file_path} into '{self.table_name}'.")
        return True

    def _populate_from_csv(self, csv_dir_path):
        """Rebuild the table from every ``*.csv`` file in ``csv_dir_path``.

        The existing table is dropped, files are loaded one at a time (in
        sorted name order, so the result is reproducible) and an index on the
        key column is created at the end.

        Raises:
            ValueError: ``csv_dir_path`` is not a directory.
            FileNotFoundError: The directory contains no ``.csv`` files.
        """
        print(f"Reading CSVs from directory: {csv_dir_path} and populating table '{self.table_name}'")

        if not os.path.isdir(csv_dir_path):
            raise ValueError(f"The specified CSV directory '{csv_dir_path}' is not a valid directory.")

        # os.listdir order is arbitrary; sort so the file that defines the
        # table schema is always the same.
        csv_files = sorted(
            os.path.join(csv_dir_path, entry)
            for entry in os.listdir(csv_dir_path)
            if entry.endswith('.csv')
        )

        if not csv_files:
            raise FileNotFoundError(f"No CSV files found in the directory '{csv_dir_path}'.")

        quoted_table = self._quote_identifier(self.table_name)
        table_created = False

        conn = self._get_conn()
        try:
            with conn:  # commit on success, roll back on error
                # Start from a clean slate so stale rows never survive a reload.
                conn.execute(f"DROP TABLE IF EXISTS {quoted_table};")
                conn.commit()

                loaded_any = False
                for csv_file_path in csv_files:
                    # The first successfully loaded file creates the table,
                    # the remaining ones append to it.
                    mode = 'append' if loaded_any else 'replace'
                    if self._load_csv_file(conn, csv_file_path, mode):
                        loaded_any = True

                # The table is absent when every file was empty or skipped.
                table_created = self._table_exists(conn)
                if table_created:
                    quoted_index = self._quote_identifier(f"idx_{self.table_name}_{self.key_column}")
                    quoted_key = self._quote_identifier(self.key_column)
                    conn.execute(f'CREATE INDEX IF NOT EXISTS {quoted_index} ON {quoted_table} ({quoted_key});')
                    print(f"Index created on '{self.key_column}' for table '{self.table_name}'.")
                else:
                    print(f"Table '{self.table_name}' was not created (possibly no valid CSV data). Index creation skipped.")
        finally:
            conn.close()

        if table_created:
            print(f"Successfully populated table '{self.table_name}' in '{self.db_path}' by processing files individually.")

    def __getitem__(self, key):
        """Return the value stored for ``key``; raise KeyError when absent."""
        query = (
            f"SELECT {self._quote_identifier(self.value_column)} "
            f"FROM {self._quote_identifier(self.table_name)} "
            f"WHERE {self._quote_identifier(self.key_column)} = ?"
        )
        row = self._fetch_one(query, (key,))
        if row is None:
            raise KeyError(f"Key '{key}' not found in table '{self.table_name}' column '{self.key_column}'.")
        return row[0]

    def __contains__(self, key):
        """Return True if at least one row has ``key`` in the key column."""
        query = (
            f"SELECT 1 FROM {self._quote_identifier(self.table_name)} "
            f"WHERE {self._quote_identifier(self.key_column)} = ?"
        )
        return self._fetch_one(query, (key,)) is not None

    def get(self, key, default=None):
        """Return the value for ``key``, or ``default`` when the key is absent."""
        try:
            return self[key]
        except KeyError:
            return default

    def close(self):
        """No-op kept for API compatibility: connections are closed after every operation."""
        pass

# Make sure this class is defined in Utils.ipynb so %run ./Utils.ipynb makes it available.
# Also ensure pandas and os are imported in the cell where this class is defined or earlier in Utils.ipynb.

In [None]:
class Utils:
    """Shared configuration constants and small static helper routines."""

    # Storage locations, resolved relative to the current working directory.
    DATA_DIR = os.path.abspath("../storage/")
    DECOMPILER_OUTPUT_DIR = os.path.join(DATA_DIR, 'decompiled-bytecodes')
    DECOMPILER_TIMEOUT = 3600  # presumably seconds -- TODO confirm against the caller
    STUDY_START_DATE = "2015-01-01"
    STUDY_END_DATE = "2025-05-01"

    BQ_KEY_PATH = 'cryptoassetanalytics-1e1c1b69e836.json'
    # Everything before the last '-' of the key file name.
    BQ_PROJECT_ID = BQ_KEY_PATH.rsplit("-", 1)[0]
    BQ_STORAGE_PROXY_DETECTOR = 'storage_dynamic_proxy_detector'
    BQ_STORAGE_BYTECODES = 'storage_bytecodes'
    # Use 75% of the available cores (e.g. 80 * .75 = 60) but never fewer than
    # one: int(1 * 0.75) would otherwise be 0 on a single-core machine and make
    # the default ``num_cores`` invalid.
    CORE_COUNT = max(1, int(multiprocessing.cpu_count() * 0.75))

    # Compiled once instead of on every escape_ansi() call.
    _ANSI_ESCAPE_RE = re.compile(r'(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]')

    @staticmethod
    def create_directory(directory_path, override= False):
        """
        Ensure that a directory exists at ``directory_path``.

        If the path already exists and ``override`` is true, it is deleted
        together with its contents and recreated empty. If the path does not
        exist, it is created. An existing path is left untouched when
        ``override`` is false.

        Parameters:
        directory_path (str): The file path to the directory to be checked and (re)created.
        override (bool): Whether an existing directory should be wiped and recreated.
        """
        if override and os.path.exists(directory_path):
            # Remove the directory and all its contents
            shutil.rmtree(directory_path)
            print(f"Removed existing directory: {directory_path}")

        if not os.path.exists(directory_path):
            # exist_ok guards against another process creating it in between.
            os.makedirs(directory_path, exist_ok=True)
            print(f"Created new directory: {directory_path}")

    @staticmethod
    def multicore_read_csv(target_dir, num_cores=CORE_COUNT):
        """
        Read multiple CSV files from the specified directory using multiple cores.

        Files are read in sorted path order so the row order of the result is
        reproducible between runs.

        Args:
            target_dir (str): The directory containing the CSV files.
            num_cores (int): The number of cores to use for reading the files.

        Returns:
            pd.DataFrame: A concatenated DataFrame containing data from all CSV files.

        Raises:
            ValueError: If target_dir is not a directory or num_cores is not a positive integer.
            FileNotFoundError: If no CSV files are found in the target directory.
        """
        if not os.path.isdir(target_dir):
            raise ValueError(f"The specified target directory '{target_dir}' is not a valid directory.")

        if not isinstance(num_cores, int) or num_cores <= 0:
            raise ValueError("The number of cores must be a positive integer.")

        # Ensure the target directory path ends with a slash
        if not target_dir.endswith('/'):
            target_dir += '/'

        batches_path = sorted(glob.glob(target_dir + "*.csv"))

        if not batches_path:
            raise FileNotFoundError(f"No CSV files found in the target directory '{target_dir}'.")

        print('Reading {} CSV files from {} directory ...'.format(len(batches_path), target_dir))

        # There is no point in starting more workers than there are files.
        worker_count = min(num_cores, len(batches_path))
        with Pool(worker_count) as p:
            df = pd.concat(p.map(pd.read_csv, batches_path), ignore_index=True)

        return df

    @staticmethod
    def convert_to_dict(dataframe, col1_key, col2_value):
        """
        Convert two columns of a DataFrame into a dictionary with keys from col1_key and values from col2_value.

        When a key occurs more than once, the value of its last occurrence wins.

        Args:
            dataframe (pd.DataFrame): The DataFrame containing the data.
            col1_key (str): The column name to use as keys in the dictionary.
            col2_value (str): The column name to use as values in the dictionary.

        Returns:
            dict: A dictionary with keys from col1_key and values from col2_value.

        Raises:
            TypeError: If the input dataframe is not a pandas DataFrame.
            ValueError: If col1_key or col2_value are not columns in the DataFrame.
        """
        if not isinstance(dataframe, pd.DataFrame):
            raise TypeError("The input dataframe must be a pandas DataFrame.")

        for column in (col1_key, col2_value):
            if column not in dataframe.columns:
                raise ValueError(f"Column '{column}' is not in the DataFrame.")

        return dict(zip(dataframe[col1_key], dataframe[col2_value]))

    @staticmethod
    def escape_ansi(line):
        """Return ``line`` with ANSI escape sequences (CSI ``ESC[`` / ``0x9B`` codes) removed."""
        return Utils._ANSI_ESCAPE_RE.sub('', line)
