In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('CASH_Orders_03072018.DAT', delimiter='\t')

In [5]:
data.size


151329928

TypeError: unhashable type: 'DataFrame'

In [31]:
import pandas as pd
import math
import csv

def parse_order(line: str):
    """Parse one 87-byte NSE cash market order record (with price and side)."""
    try:
        transaction_time = line[22:36].strip()   # Jiffies timestamp
        side = line[36]                          # 'B' or 'S'
        activity_type = int(line[37])            # 1=Entry, 3=Cancel, 4=Modify
        limit_price = int(line[66:74]) / 100.0   # Price in ₹
        return transaction_time, side, activity_type, limit_price
    except Exception:
        return None

def compute_best_bid_ask_chunk(chunk_data, output_parquet: str):
    """Compute best bid/ask for one chunk and save to Parquet."""
    records = []
    for line in chunk_data:
        line = str(line).strip()
        if not line or len(line) < 74:
            print("not valid")
            continue

        parsed = parse_order(line)
        if not parsed:
            continue

        timestamp, side, activity, price = parsed
        if activity == 3:  # Skip cancelled orders
            continue

        records.append((timestamp, side, price))

    df = pd.DataFrame(records, columns=["Timestamp", "Side", "Price"])
    if df.empty:
        print(f"⚠️ No valid data in this chunk for {output_parquet}.")
        return

    best_bid = df[df["Side"] == "B"].groupby("Timestamp")["Price"].max().rename("Best_Bid")
    best_ask = df[df["Side"] == "S"].groupby("Timestamp")["Price"].min().rename("Best_Ask")

    result = pd.concat([best_bid, best_ask], axis=1).reset_index()
    try:
        result.to_parquet(output_parquet, index=False, compression="snappy")
    except Exception as e:
        print(f"⚠️ Parquet failed ({e}), saving as CSV instead.")
        result.to_csv(output_parquet.replace(".parquet", ".csv"), index=False)

    print(f"✅ {output_parquet} written with {len(result):,} rows.")

def get_chunk(data, chunk_index, total_chunks=10):
    """Return a specific chunk (1-based index) of the data."""
    # Convert to list if DataFrame or Series
    print(total_chunks)
    if isinstance(data, pd.DataFrame):
        col_name = data.columns[0]
        data_list = data[col_name].tolist()
    elif isinstance(data, pd.Series):
        data_list = data.tolist()
    else:
        data_list = list(data)

    n = len(data_list)
    chunk_size = math.ceil(n / total_chunks)
    start = (chunk_index - 1) * chunk_size
    end = min(start + chunk_size, n)
    return data_list[start:end]

# ---------- USAGE ----------
# For example, to process chunk 3 separately:
chunk_index = 2
chunk_data = get_chunk(data, chunk_index, total_chunks=5)
compute_best_bid_ask_chunk(chunk_data, f"best_bid_ask_chunk_{chunk_index}.parquet")


5
✅ best_bid_ask_chunk_2.parquet written with 26,831,384 rows.


In [8]:
pip install --upgrade pandas pyarrow


Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd

# ---------- MOCK DATA ----------
mock_data = [
    "0" * 22 + "1000" + "B" + "1" + "0" * 28 + "42500000" + "0" * 13,
    "0" * 22 + "1000" + "S" + "1" + "0" * 28 + "42600000" + "0" * 13,
    "0" * 22 + "1001" + "B" + "1" + "0" * 28 + "42700000" + "0" * 13,
    "0" * 22 + "1001" + "S" + "1" + "0" * 28 + "42800000" + "0" * 13,
    "0" * 22 + "1002" + "B" + "1" + "0" * 28 + "42900000" + "0" * 13,
]

# ---------- PARSING FUNCTION ----------
def parse_order(line: str):
    try:
        timestamp = line[22:26].strip()
        side = line[26]
        activity = int(line[27])
        price = int(line[66:74]) / 100.0
        return timestamp, side, activity, price
    except Exception:
        return None

# ---------- PROCESS AND SAVE FUNCTION ----------
def save_best_bid_ask(data, output_file="mock_best_bid_ask.parquet"):
    records = []
    for line in data:
        parsed = parse_order(line)
        if not parsed:
            continue
        timestamp, side, activity, price = parsed
        if activity == 3:  # skip cancelled
            continue
        records.append((timestamp, side, price))
    
    df = pd.DataFrame(records, columns=["Timestamp", "Side", "Price"])
    if df.empty:
        print("⚠️ No valid records.")
        return
    
    best_bid = df[df["Side"] == "B"].groupby("Timestamp")["Price"].max().rename("Best_Bid")
    best_ask = df[df["Side"] == "S"].groupby("Timestamp")["Price"].min().rename("Best_Ask")
    result = pd.concat([best_bid, best_ask], axis=1).reset_index()
    
    # Try Parquet, fallback to CSV
    try:
        result.to_parquet(output_file, index=False, compression="snappy")
        print(f"✅ Saved to Parquet: {output_file}")
    except Exception as e:
        print(f"⚠️ Parquet failed ({e}), saving as CSV instead.")
        csv_file = output_file.replace(".parquet", ".csv")
        result.to_csv(csv_file, index=False)
        print(f"✅ Saved to CSV: {csv_file}")
    
    print(result)

# ---------- RUN MOCK EXAMPLE ----------
save_best_bid_ask(mock_data)


✅ Saved to Parquet: mock_best_bid_ask.parquet
  Timestamp  Best_Bid  Best_Ask
0      1000       0.0       0.0
1      1001       0.0       0.0
2      1002       0.0       NaN


In [5]:
df = pd.read_parquet('best_bid_ask_chunk_1.parquet')

In [32]:
df[df['Best_Ask'].notna()]

Unnamed: 0,Timestamp,Best_Bid,Best_Ask
27982,79631253512681,367.70,374.55
27984,79631253512697,181.65,186.65
27985,79631253512698,181.85,186.55
27986,79631253512699,112.20,128.90
27987,79631253512700,364.40,128.85
...,...,...,...
26819512,79632409336894,,36.50
26819513,79632409337408,,2871.35
26819514,79632409337409,,2871.35
26819515,79632409338278,,124.15


In [None]:
def parse_price_volume(line: str):
    """
    Extract price and volume from an NSE 87-byte cash market order line.
    
    Price: positions 66-73 (divide by 100 to get ₹)
    Volume: positions 74-81 (integer)
    """
    print(len(line))
    try:
        price = int(line[66:74]) / 100.0
        volume_og = int(line[58:66])
        volume_dis = int(line[50:58])

        return price, volume_og, volume_dis
    except Exception as e:
        print(f"⚠️ Could not parse line: {e}")
        return None, None

# ---------- EXAMPLE ----------
line = 'RMCASH100000000000180079631194590447B1bbALOKTEXTBE00000000001500000000042500000000NNN0'

price, volume_og, volume_dis = parse_price_volume(line)
print(f"Price: {price} ₹, Volume: {volume_dis}")


87
Price: 4.25 ₹, Volume: 0


In [None]:
import pandas as pd
import math

def parse_order_full(line: str):
    """
    Parse one NSE 87-byte record and return:
    timestamp, side, price, volume_disclosed, price_type, symbol, original_qty
    Handles:
      - Limit Orders (Price_Type="Limit")
      - Stop Loss Orders (Price_Type="Trigger")
    Only Entry (activity_type=1)
    """
    try:
        timestamp = line[22:36].strip()
        side = line[36]
        activity_type = int(line[37])
        symbol = line[38:48].lstrip('b').strip()
        original_qty = int(line[58:66])
        disclosed_qty = int(line[50:58])
        limit_price = int(line[66:74]) / 100.0
        trigger_price = int(line[74:82]) / 100.0
        market_limit_flag = line[82]
        stop_loss_flag = line[83]

        if activity_type == 3:  # Only Entry
            return None
        if market_limit_flag != 'N':  # Only Limit orders
            return None

        if stop_loss_flag == 'Y':
            price = trigger_price
            price_type = "Trigger"
        else:
            price = limit_price
            price_type = "Limit"

        return timestamp, side, price, disclosed_qty, price_type, symbol, original_qty
    except Exception:
        return None

def compute_chunk_full(chunk_data, output_file: str):
    """
    Process a chunk and save per transaction:
    - Timestamp
    - Side (B/S)
    - Price (Limit or Trigger)
    - Volume_Disclosed
    - Price_Type ("Limit" or "Trigger")
    - Symbol
    - Original Quantity
    """
    records = []
    for line in chunk_data:
        parsed = parse_order_full(line)
        if parsed:
            records.append(parsed)

    df = pd.DataFrame(records, columns=[
        "Timestamp", "Side", "Price", "Volume_Disclosed", "Price_Type", "Symbol", "Original_Qty"
    ])
    if df.empty:
        print(f"⚠️ No valid orders in this chunk for {output_file}.")
        return

    # Save to Parquet, fallback to CSV
    try:
        df.to_parquet(output_file, index=False, compression="snappy")
    except Exception as e:
        print(f"⚠️ Parquet failed ({e}), saving as CSV instead.")
        df.to_csv(output_file.replace(".parquet", ".csv"), index=False)

    print(f"✅ {output_file} written with {len(df):,} rows.")
    print(df.head())

def get_chunk(data, chunk_index, total_chunks=5):
    if isinstance(data, pd.DataFrame):
        col_name = data.columns[0]
        data_list = data[col_name].tolist()
    elif isinstance(data, pd.Series):
        data_list = data.tolist()
    else:
        data_list = list(data)

    n = len(data_list)
    chunk_size = math.ceil(n / total_chunks)
    start = (chunk_index - 1) * chunk_size
    end = min(start + chunk_size, n)
    return data_list[start:end]

# ---------- USAGE EXAMPLE ----------
chunk_index = 6
chunk_data = get_chunk(data, chunk_index, total_chunks=5)
compute_chunk_full(chunk_data, f"orders_chunk_{chunk_index}.parquet")


✅ orders_chunk_5.parquet written with 27,465,335 rows.
        Timestamp Side   Price  Volume_Disclosed Price_Type Symbol  \
0  79631403218569    S  661.60                36      Limit  TECHM   
1  79631403218587    B  660.50                 0      Limit  TECHM   
2  79631403218588    S  661.40               142      Limit  TECHM   
3  79631403218589    B  660.45                 0      Limit  TECHM   
4  79631403218590    S  661.75               136      Limit  TECHM   

   Original_Qty  
0            36  
1            40  
2           142  
3            40  
4           136  


In [4]:
import pandas as pd
dff = pd.read_parquet("D:\Data03082018\orders_chunk_5.parquet")

In [7]:
dff.head(100)

Unnamed: 0,Timestamp,Side,Price,Volume_Disclosed,Price_Type,Symbol,Original_Qty
0,79631403218569,S,661.60,36,Limit,TECHM,36
1,79631403218587,B,660.50,0,Limit,TECHM,40
2,79631403218588,S,661.40,142,Limit,TECHM,142
3,79631403218589,B,660.45,0,Limit,TECHM,40
4,79631403218590,S,661.75,136,Limit,TECHM,136
...,...,...,...,...,...,...,...
95,79631403220827,B,658.05,0,Limit,TECHM,1200
96,79631403220828,B,660.55,0,Limit,TECHM,40
97,79631403220829,B,657.20,0,Limit,TECHM,1200
98,79631403220811,B,660.05,101,Limit,TECHM,101
