In [1]:
import os 
# Switch the working directory so the relative paths used in later cells
# ('config/.env', 'database/finstore') all resolve from one place.
# NOTE(review): '/app' is a hardcoded absolute path, presumably the container's project root — confirm.
os.chdir('/app')

In [2]:
from dotenv import load_dotenv
# Load environment variables from the dotenv file; the path is relative to the current working directory.
load_dotenv(dotenv_path='config/.env')
# os.getenv returns None when DATABASE_PATH is not set, so a printed "None" means the variable was not loaded.
database_path = os.getenv('DATABASE_PATH')
print(database_path)

database/db/ohlcv_data.db


In [35]:
from data.gather.indian_equity import gather_ohlcv_indian_equity

# Call the project's gather helper for the '1d' timeframe; it returns two values, unpacked here.
# NOTE(review): `data` is presumably a mapping of symbol -> DataFrame (a later cell iterates `.items()` on its copy) — confirm.
symbols , data = gather_ohlcv_indian_equity(timeframe='1d')

In [36]:
# Snapshot the gathered result; the save cell iterates `ohlcv_data`, not `data`.
# NOTE(review): if `data` is a dict, `.copy()` is shallow — the contained objects are shared. Confirm that is intended.
ohlcv_data = data.copy()

In [15]:
pip freeze

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [26]:
import os
import duckdb
import pandas as pd

def save_symbol_data_as_parquet(symbol, data, base_directory='database/finstore', market_name='indian_equity', timeframe='1d'):
    """
    Writes one symbol's data to a ZSTD-compressed Parquet file.

    The file is placed at
    ``<base_directory>/market_name=<market_name>/timeframe=<timeframe>/<symbol>/ohlcv_data.parquet``.
    Missing directories along that path are created; an existing file is overwritten.

    Parameters:
        symbol: The symbol the data belongs to; converted with str() for the directory name.
        data: Object exposing ``to_parquet`` (e.g. a pandas DataFrame) holding the rows to store.
        base_directory (str): Root directory of the store (default: 'database/finstore').
        market_name (str): The market name (default: 'indian_equity').
        timeframe (str): The timeframe (default: '1d').

    Returns:
        None
    """
    # Partition-style folder names, outermost first, ending with the symbol's own folder.
    partition_parts = (
        f"market_name={market_name}",
        f"timeframe={timeframe}",
        str(symbol),
    )
    symbol_dir = os.path.join(base_directory, *partition_parts)
    os.makedirs(symbol_dir, exist_ok=True)

    # The index is not written to the file.
    target_file = os.path.join(symbol_dir, 'ohlcv_data.parquet')
    data.to_parquet(target_file, index=False, compression='zstd')

# Example of usage
# data_dict = {'AAPL': df_aapl, 'GOOGL': df_googl}  # This should be your data dictionary input
# for symbol, df in data_dict.items():
#     save_symbol_data_as_parquet(symbol, df)


In [28]:
# Persist each symbol's frame to its own Parquet file.
# The loop variable is named `symbol_df` (not `data`) so the gathered `data`
# object from the earlier cell is not silently rebound to the last DataFrame.
for symbol, symbol_df in ohlcv_data.items():
    save_symbol_data_as_parquet(symbol, symbol_df)

In [3]:
import duckdb
import os
import pandas as pd
from concurrent.futures import ProcessPoolExecutor

def read_parquet_for_symbol(symbol, market_name='indian_equity', timeframe='1d', base_directory='database/finstore'):
    """
    Reads the Parquet file for a given symbol and returns it as a DataFrame.

    Parameters:
        symbol (str): The symbol to read data for. Non-string values are converted
            with str() when building the path, matching how the writer names the folder.
        market_name (str): The market name (default: 'indian_equity').
        timeframe (str): The timeframe (default: '1d').
        base_directory (str): The base directory where data is stored (default: 'database/finstore').

    Returns:
        tuple: A tuple containing the symbol (exactly as passed in) and its corresponding DataFrame.

    Raises:
        FileNotFoundError: If no Parquet file exists at the expected location.
    """
    # Define the file path based on the input parameters
    file_path = os.path.join(
        base_directory,
        f"market_name={market_name}",
        f"timeframe={timeframe}",
        str(symbol),
        'ohlcv_data.parquet',
    )

    # Check if the file exists
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Parquet file not found for symbol '{symbol}' at '{file_path}'")

    # Create a DuckDB connection (in-memory for this operation)
    conn = duckdb.connect()
    try:
        conn.execute("PRAGMA threads=4")  # Use multiple threads for parallel reading

        # The path is embedded in a SQL string literal, so single quotes must be
        # doubled; otherwise a quote in a symbol or directory name breaks the query.
        sql_path = file_path.replace("'", "''")

        # Read the entire Parquet file into a DataFrame
        df = conn.execute(f"SELECT * FROM read_parquet('{sql_path}')").fetchdf()
    finally:
        # Always release the connection, including when the query raises.
        conn.close()

    return symbol, df

def read_all_symbols(symbols, market_name='indian_equity', timeframe='1d', base_directory='database/finstore'):
    """
    Loads the stored Parquet data for many symbols using a pool of worker processes.

    One read task is submitted per symbol. Results are collected in submission
    order. A symbol whose read raises is reported on stdout and omitted from the
    returned dictionary rather than aborting the whole batch.

    Parameters:
        symbols (list): Symbols to load.
        market_name (str): The market name (default: 'indian_equity').
        timeframe (str): The timeframe (default: '1d').
        base_directory (str): The base directory where data is stored (default: 'database/finstore').

    Returns:
        dict: Maps each successfully read symbol to its DataFrame.
    """
    loaded = {}
    with ProcessPoolExecutor() as pool:
        # Queue every read up front, remembering which symbol each task belongs to.
        pending = [
            (requested, pool.submit(read_parquet_for_symbol, requested, market_name, timeframe, base_directory))
            for requested in symbols
        ]
        for symbol, task in pending:
            try:
                symbol, frame = task.result()
                loaded[symbol] = frame
            except Exception as e:
                # Best-effort batch: report the failure and continue with the rest.
                print(f"Error reading data for symbol {symbol}: {e}")

    return loaded

# Example of usage
# symbols_list = ['AAPL', 'GOOGL', 'MSFT']  # List of symbols to read
# data_dict = read_all_symbols(symbols_list, market_name='indian_equity', timeframe='1d')
# print(data_dict)


In [4]:
from data.fetch.indian_equity import fetch_symbol_list_indian_equity
# Fetch the symbol list for the 'nse_eq_symbols' index; this rebinds `symbols` from the gather cell.
symbols = fetch_symbol_list_indian_equity(index_name='nse_eq_symbols')
# Load each symbol's stored '1d' Parquet data; symbols whose read fails are printed and left out of the dict.
dict_data = read_all_symbols(symbols , market_name='indian_equity', timeframe='1d')
