In [2]:
import pandas as pd
from datetime import datetime, timedelta
from clob_client import PolymarketClient,timestamp_to_datetime


In [4]:

# Initialise the Polymarket client.
client = PolymarketClient()

# Load the input rows: one matched question and its upload date per row.
matching_df = pd.read_csv('merged_result.csv')
questions = matching_df['matching_questions'].tolist()
dates = matching_df['upload_date'].tolist()

# Map every question to a condition_id using the closed-markets table
# (closed_trump_questions_description.csv). The first row whose 'question'
# equals the text wins; questions without a match are recorded as None.
closed_trump_df = pd.read_csv('closed_trump_questions_description.csv')
condition_ids = []
for question in questions:
    is_same_question = closed_trump_df['question'] == question
    matched_row = closed_trump_df[is_same_question]
    if matched_row.empty:
        condition_ids.append(None)
        continue
    condition_ids.append(matched_row.iloc[0]['condition_id'])

# Accumulator for the per-question output rows.
results = []

# Day offsets (1, 3 and 5 days later) at which price changes are evaluated.
offsets = [1, 3, 5]


In [None]:
#가격 레이블링

import pandas as pd
from datetime import datetime, timedelta
from clob_client import PolymarketClient,timestamp_to_datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache

# PolymarketClient 초기화
client = PolymarketClient()

# Cached accessor for a market's price history.
@lru_cache(maxsize=100)
def get_price_history(condition_id, outcome):
    """Return the ``'history'`` entry of the client's price response.

    Results are memoised per ``(condition_id, outcome)`` pair (at most 100
    pairs are retained), so repeated lookups for the same pair reuse the
    first response instead of calling the client again.

    NOTE(review): assumes ``client.get_price`` returns the whole history in
    a single call -- adjust if the API behaves differently.
    """
    return client.get_price(condition_id, outcome)['history']

def get_price_at_date_cached(condition_id, outcome, target_date_str):
    """Return the price recorded closest in time to ``target_date_str``.

    Args:
        condition_id: Market identifier forwarded to ``get_price_history``.
        outcome: Outcome name forwarded to ``get_price_history`` (e.g. 'Yes').
        target_date_str: Instant formatted as ``'%Y-%m-%dT%H:%M:%SZ'``; the
            trailing ``Z`` marks it as UTC.

    Returns:
        The ``'p'`` value of the history point whose ``'t'`` is nearest to
        the target instant (the earliest such point on ties), or ``None``
        when the history is empty.

    Raises:
        ValueError: If ``target_date_str`` does not match the expected format.
    """
    # Function-scope import: the file-level import only brings in
    # ``datetime`` and ``timedelta``.
    from datetime import timezone

    price_history = get_price_history(condition_id, outcome)
    target_date = datetime.strptime(target_date_str, '%Y-%m-%dT%H:%M:%SZ')

    # ``strptime`` yields a naive datetime and ``naive.timestamp()`` would
    # interpret it in the machine's local time zone, shifting the target by
    # the local UTC offset. Pin it to UTC, which is what the 'Z' suffix says.
    # NOTE(review): presumes price_point['t'] is Unix epoch seconds -- confirm
    # against the client's response format.
    target_ts = int(target_date.replace(tzinfo=timezone.utc).timestamp())

    if not price_history:
        return None

    # ``min`` keeps the first of several equally close points, matching a
    # strict "<" scan over the history.
    nearest_point = min(price_history, key=lambda point: abs(point['t'] - target_ts))
    return nearest_point['p']

# Worker function submitted to the thread pool.
def fetch_price_change(condition_id, date_str, offset):
    """Label the direction of the 'Yes' price ``offset`` days after ``date_str``.

    Args:
        condition_id: Market identifier forwarded to the price lookup.
        date_str: Base day formatted as ``'%Y-%m-%d'``.
        offset: Number of days after the base day to compare against.

    Returns:
        ``1`` if the later price is higher, ``-1`` if it is lower, ``0`` if
        neither, and ``None`` when either price lookup returned ``None``.
    """
    base_day = datetime.strptime(date_str, '%Y-%m-%d')
    later_stamp = (base_day + timedelta(days=offset)).strftime('%Y-%m-%d') + 'T00:00:00Z'

    price_then = get_price_at_date_cached(condition_id, 'Yes', f"{date_str}T00:00:00Z")
    price_later = get_price_at_date_cached(condition_id, 'Yes', later_stamp)

    # Without both prices there is nothing to compare.
    if price_then is None or price_later is None:
        return None

    if price_later > price_then:
        return 1  # rise
    if price_later < price_then:
        return -1  # fall
    return 0  # unchanged

# Main logic: label each matched question with the direction of its 'Yes'
# price 1, 3 and 5 days after the question's date.
matching_df = pd.read_csv('matching_questions_rollcall_updated.csv')
questions = matching_df['matching_questions'].tolist()
dates = matching_df['date'].tolist()

# Map every question to the condition_id of the first row with an identical
# 'question' text; questions without a match get None.
closed_trump_df = pd.read_csv('closed_trump_questions_description.csv')
condition_ids = []
for question in questions:
    matched_row = closed_trump_df[closed_trump_df['question'] == question]
    condition_ids.append(matched_row.iloc[0]['condition_id'] if not matched_row.empty else None)

results = []
offsets = [1, 3, 5]

# Fan the per-offset lookups out to a thread pool.
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = []
    for i, (condition_id, date_str) in enumerate(zip(condition_ids, dates)):
        if condition_id is None:
            # No market found for this question: it gets no output row.
            continue

        # 'Yes' price on the target date itself.
        target_yes_price = get_price_at_date_cached(condition_id, 'Yes', f"{date_str}T00:00:00Z")

        row = {
            'question': questions[i],
            'date': date_str,
            'yes_price': target_yes_price,
        }
        results.append(row)

        # Keep the row dict itself next to each future. Rows without a
        # condition_id are skipped, so the input index ``i`` is not a valid
        # position in ``results`` and must not be used to find the row later.
        for offset in offsets:
            future = executor.submit(fetch_price_change, condition_id, date_str, offset)
            futures.append((row, offset, future))

    # Collect the labels; each one is written into the row it was computed for.
    for row, offset, future in futures:
        row[f'change_{offset}d'] = future.result()

# Convert the rows to a DataFrame and save them as CSV.
result_df = pd.DataFrame(results)
result_df.to_csv('question_prices_with_changes.csv', index=False)
print("가격 변동 라벨이 'question_prices_with_changes.csv'에 저장되었습니다.")

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from clob_client import PolymarketClient, timestamp_to_datetime
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache

# PolymarketClient 초기화
client = PolymarketClient()

# Cached accessor for a market's price history.
@lru_cache(maxsize=100)
def get_price_history(condition_id, outcome):
    """Return the ``'history'`` entry of the client's price response.

    Results are memoised per ``(condition_id, outcome)`` pair (at most 100
    pairs are retained), so repeated lookups for the same pair reuse the
    first response instead of calling the client again.

    NOTE(review): assumes ``client.get_price`` returns the whole history in
    a single call -- adjust if the API behaves differently.
    """
    return client.get_price(condition_id, outcome)['history']

def get_price_at_date_cached(condition_id, outcome, target_date_str):
    """Return the cached-history price closest in time to ``target_date_str``.

    Args:
        condition_id: Market identifier forwarded to ``get_price_history``.
        outcome: Outcome name forwarded to ``get_price_history`` (e.g. 'Yes').
        target_date_str: Instant formatted as ``'%Y-%m-%dT%H:%M:%SZ'``
            (for example ``'2025-03-14T00:00:00Z'``); the trailing ``Z``
            marks it as UTC.

    Returns:
        The ``'p'`` value of the history point whose ``'t'`` is nearest to
        the target instant (the earliest such point on ties), or ``None``
        when the history is empty.

    Raises:
        ValueError: If ``target_date_str`` does not match the expected format.
    """
    # Function-scope import: the file-level import only brings in
    # ``datetime`` and ``timedelta``.
    from datetime import timezone

    price_history = get_price_history(condition_id, outcome)
    target_date = datetime.strptime(target_date_str, '%Y-%m-%dT%H:%M:%SZ')

    # ``strptime`` yields a naive datetime and ``naive.timestamp()`` would
    # interpret it in the machine's local time zone, shifting the target by
    # the local UTC offset. Pin it to UTC, which is what the 'Z' suffix says.
    # NOTE(review): presumes price_point['t'] is Unix epoch seconds -- confirm
    # against the client's response format.
    target_ts = int(target_date.replace(tzinfo=timezone.utc).timestamp())

    if not price_history:
        return None

    # ``min`` keeps the first of several equally close points, matching a
    # strict "<" scan over the history.
    nearest_point = min(price_history, key=lambda point: abs(point['t'] - target_ts))
    return nearest_point['p']

def fetch_volatility(condition_id, base_date_str, days=5, threshold=0.03):
    """Flag whether the 'Yes' price was volatile over the days after a date.

    Looks up the 'Yes' price at ``base_date_str`` and at each of the next
    ``days`` days, converts consecutive prices into simple returns, and
    compares the sample standard deviation of those returns to ``threshold``.

    Args:
        condition_id: Market identifier forwarded to the price lookup.
        base_date_str: Starting instant, formatted ``'%Y-%m-%dT%H:%M:%SZ'``.
        days: Number of days after the start to include (default 5).
        threshold: Standard-deviation cut-off (default 0.03).

    Returns:
        ``1`` when the standard deviation exceeds ``threshold``; ``0``
        otherwise, including when fewer than two returns could be computed.
    """
    start = datetime.strptime(base_date_str, '%Y-%m-%dT%H:%M:%SZ')

    # Prices for day 0 through day ``days``, one lookup per day.
    prices = [
        get_price_at_date_cached(
            condition_id,
            'Yes',
            (start + timedelta(days=step)).strftime('%Y-%m-%dT%H:%M:%SZ'),
        )
        for step in range(days + 1)
    ]

    # Day-over-day returns; pairs with a missing price or a zero base price
    # are left out because no return can be computed for them.
    returns = [
        (later - earlier) / earlier
        for earlier, later in zip(prices, prices[1:])
        if earlier is not None and later is not None and earlier != 0
    ]

    # A sample standard deviation (ddof=1) needs at least two observations.
    if len(returns) <= 1:
        return 0

    return 1 if np.std(returns, ddof=1) > threshold else 0

# Main logic: attach a standard-deviation based volatility label to every
# row of merged_result.csv.
df = pd.read_csv('merged_result.csv')
questions = df['matching_questions'].tolist()
dates = df['upload_date'].tolist()
condition_ids = df['condition_id'].tolist()

# One output row per input row, in input order. The label stays None for
# rows that cannot be evaluated, so the saved CSV lines up with the input.
results = [
    {'question': question, 'date': date_str, 'volatility_label': None}
    for question, date_str in zip(questions, dates)
]

# Fan the per-row volatility computations out to a thread pool.
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = []
    for i, (condition_id, date_str) in enumerate(zip(condition_ids, dates)):
        # Missing CSV cells are read as NaN rather than None, so use
        # pd.isnull, which catches both, for the id as well as the date.
        if pd.isnull(condition_id) or pd.isnull(date_str):
            continue

        # 5-day window; 0.03 is an arbitrary example threshold.
        future = executor.submit(fetch_volatility, condition_id, date_str, 5, 0.03)
        futures.append((i, future))

    # Collect the labels; ``results`` has one entry per input row, so the
    # input index ``i`` addresses the right row.
    for i, future in futures:
        results[i]['volatility_label'] = future.result()

# Save the results.
result_df = pd.DataFrame(results)
result_df.to_csv('question_prices_with_volatility.csv', index=False)
print("표준편차 기반 변동성 라벨링이 'question_prices_with_volatility.csv'에 저장되었습니다.")


표준편차 기반 변동성 라벨링이 'question_prices_with_volatility.csv'에 저장되었습니다.
