In [1]:
import json
import re
import warnings

import pandas as pd
# Install a blanket "ignore" filter so no warnings are shown for the rest of
# the session. NOTE(review): no category/module is given, so this also hides
# warnings emitted by pandas itself -- confirm that is intended.
warnings.filterwarnings("ignore")


In [3]:
# Read the raw trade records from disk into `df`.
df = pd.read_csv(filepath_or_buffer='data/kalshi_trade_data.csv')

In [4]:
# One row per ticker: total contracts traded and the (unweighted) mean price.
contract_wise = df.groupby('ticker_name').agg(
    contracts_traded=('contracts_traded', 'sum'),
    price=('price', 'mean'),
)

In [5]:
# Order tickers from most to least traded.
contract_wise = contract_wise.sort_values(
    by='contracts_traded',
    ascending=False,
)

In [6]:
# Turn the groupby key (the index) back into a regular `ticker_name` column.
contract_wise = contract_wise.reset_index(drop=False)

In [7]:
# Ticker prefix = everything before the first '-'. The vectorised .str accessor
# replaces the per-row Python lambda and propagates missing values instead of
# raising AttributeError on them; n=1 avoids splitting the rest of the string.
contract_wise['ticker_only'] = contract_wise['ticker_name'].str.split('-', n=1).str[0]

In [51]:
# Load the {ticker substring: category} mapping. The context manager closes the
# file handle deterministically (the bare open(...).read() left it to the
# garbage collector), and the encoding is pinned because JSON text is UTF-8.
with open('categories.json', encoding='utf-8') as fh:
    category = json.load(fh)

# Method 1: Using apply with a function
def categorize_ticker(ticker_name, mapping=None):
    """Return the category of the first mapping key found inside ``ticker_name``.

    Keys are tried in the mapping's iteration order and matched as plain
    substrings of ``str(ticker_name)``; when several keys occur in the same
    ticker, the key that comes first in the mapping wins.

    Args:
        ticker_name: Value to classify. It is converted with ``str()`` before
            matching, so non-string values do not raise.
        mapping: Optional ``{substring: category}`` dict. When omitted, the
            notebook-level ``category`` dict is used, so existing calls such
            as ``Series.apply(categorize_ticker)`` behave as before.

    Returns:
        The category of the first matching key, or ``'Uncategorized'`` when
        no key is a substring of the ticker.
    """
    if mapping is None:
        mapping = category
    ticker_text = str(ticker_name)  # convert once rather than once per key
    for key, label in mapping.items():
        if key in ticker_text:
            return label
    return 'Uncategorized'

# Label every ticker with the result of categorize_ticker.
contract_wise['category'] = contract_wise['ticker_name'].map(categorize_ticker)

In [52]:
# Persist the per-ticker table without the row index. `index` is documented as
# a bool; the previous `index=None` only worked because None is falsy.
contract_wise.to_csv('data/category_wise.csv', index=False)

In [53]:
# Rows whose category is the 'Uncategorized' fallback.
null = contract_wise.loc[contract_wise['category'].eq('Uncategorized')]

In [54]:
# Fraction of all traded contracts that belong to uncategorized tickers.
(
    null['contracts_traded'].sum()
    / contract_wise['contracts_traded'].sum()
)

0.03652751773567131

In [55]:
# Fraction of tickers (rows) that ended up uncategorized.
null.shape[0] / contract_wise.shape[0]

0.005566104685731822

In [56]:
# Uncategorized rows whose ticker name contains the literal text 'BTC'.
null.loc[null['ticker_name'].str.contains('BTC', regex=False)]

Unnamed: 0,ticker_name,contracts_traded,price,ticker_only,category


In [57]:
# Up to 100 ticker prefixes that occur most often among uncategorized rows.
null['ticker_only'].value_counts().head(100).index

Index(['PRESPARTYGA', 'KXNEXTSPEAKER', 'KXWA4PRIMARY', 'KXECADVANTAGE',
       'KXNYCCOUNCIL39', 'KXNIAGARASOUTH', 'PRESPARTYNE', 'KXMALAWIASSEMBLY',
       'GOVPARTYND', 'KXDISTANCE3IATLAS', 'KXGOVNMNOMD', 'KXDEPCIA',
       'KXACQUIREREALMADRID', 'KXDONATEBEASTWATER',
       'KXPODCASTGUESTCALLHERDADDY', 'PRESPARTYME1', 'KXALLINGUEST',
       'KXLANGLEY', 'KXTOPALBUMBYTAYLORSWIFT', 'GOVPARTYME',
       'KXMYSTERYSTOCKBUFFET', 'KXUCL16PSGSTB', 'KXMCGREGORFIGHTNEXT', 'SORA',
       'KXO13MENSINGLES', 'HOUSECA40', 'SENATEMA', 'KXCO2LEVEL',
       'KXNYCBOROUGHWINMAN', 'HOUSEMI4', 'GOVPARTYFL', 'HOUSEFL6', 'SENATEAL',
       'GOVPARTYIL', 'KXNEXTTONGAPM', 'KXUEFASUPERCUP', 'PRESPARTYMD',
       'PRESPARTYME2', 'KXEDUCUTS', 'KXTURKEYAMB', 'KXOAIPROFIT',
       'KXKLARSTRIP', 'KXAUSCOALITION', 'KXOTHERLAUNCH', 'GOVPARTYTN',
       'KXBOXING', 'GOVPARTYVA', 'KXRESCISSIONBILL', 'SENATEDE', 'KXCA11D',
       'KXCONGRESSTRADES', 'PRESPARTYDE', 'KXUCL16PSVJUV', 'HOUSEPA1',
       'PRESPARTYDC',

In [60]:
# Reload the raw trades and the {key: category} mapping for row-level tagging.
df = pd.read_csv('data/kalshi_trade_data.csv')
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector; JSON text is UTF-8, so pin the encoding.
with open('categories.json', encoding='utf-8') as fh:
    category = json.load(fh)
# One alternation over all keys. Each key is escaped so it is matched as
# literal text: an unescaped regex metacharacter in a key would match text
# other than the key itself, and that matched text is what gets looked up in
# `category`.
pattern = '|'.join(re.escape(key) for key in category)


def categorize_chunk(chunk, regex=None, mapping=None):
    """Map each row's ``ticker_name`` to a category via its first regex match.

    The leftmost match of ``regex`` in every ticker name is extracted and then
    looked up in ``mapping``.

    Args:
        chunk: DataFrame that has a ``ticker_name`` column.
        regex: Regular expression (an alternation of category keys). Defaults
            to the notebook-level ``pattern``. It is wrapped in a single
            capture group, so it is assumed to contain no capture groups of
            its own.
        mapping: ``{matched text: category}`` dict. Defaults to the
            notebook-level ``category`` dict.

    Returns:
        A Series aligned with ``chunk``'s index holding the category for each
        row; rows where nothing matches (or whose match is not a mapping key)
        are NaN.
    """
    if regex is None:
        regex = pattern
    if mapping is None:
        mapping = category
    # extract() stops at the first match; findall() built a list of every
    # match per row only for all but the first element to be thrown away.
    first_hit = chunk['ticker_name'].str.extract(f'({regex})', expand=False)
    return first_hit.map(mapping)

# Categorize the frame in fixed-size row slices so progress can be printed
# between slices.
chunk_size = 100000
chunks = []
for i in range(0, len(df), chunk_size):
    print(f"{round(i/len(df) * 100, 2)}% done")
    chunk = df.iloc[i:i+chunk_size]
    # assign() builds a new frame with the extra column. Writing
    # chunk['category'] = ... would set a column on a slice of `df`, the
    # pattern pandas flags with SettingWithCopyWarning.
    chunk = chunk.assign(category=categorize_chunk(chunk))
    chunks.append(chunk)

# Stitch the categorized slices back into one frame with a fresh 0..n-1 index.
df = pd.concat(chunks, ignore_index=True)
# Write without the row index. `index` is documented as a bool; the previous
# `index=None` only worked because None is falsy.
df.to_csv('data/df_with_category.csv', index=False)

0.0% done
0.42% done
0.84% done
1.26% done
1.68% done
2.1% done
2.52% done
2.94% done
3.35% done
3.77% done
4.19% done
4.61% done
5.03% done
5.45% done
5.87% done
6.29% done
6.71% done
7.13% done
7.55% done
7.97% done
8.39% done
8.81% done
9.23% done
9.65% done
10.06% done
10.48% done
10.9% done
11.32% done
11.74% done
12.16% done
12.58% done
13.0% done
13.42% done
13.84% done
14.26% done
14.68% done
15.1% done
15.52% done
15.94% done
16.35% done
16.77% done
17.19% done
17.61% done
18.03% done
18.45% done
18.87% done
19.29% done
19.71% done
20.13% done
20.55% done
20.97% done
21.39% done
21.81% done
22.23% done
22.65% done
23.06% done
23.48% done
23.9% done
24.32% done
24.74% done
25.16% done
25.58% done
26.0% done
26.42% done
26.84% done
27.26% done
27.68% done
28.1% done
28.52% done
28.94% done
29.35% done
29.77% done
30.19% done
30.61% done
31.03% done
31.45% done
31.87% done
32.29% done
32.71% done
33.13% done
33.55% done
33.97% done
34.39% done
34.81% done
35.23% done
35.64% done


In [59]:
df.columns

Index(['create_ts', 'ticker_name', 'contracts_traded', 'price'], dtype='object')