In [None]:
import os

import pandas as pd
from pandas import DataFrame
import numpy as np
from scipy.stats.mstats import winsorize
from tqdm.notebook import tqdm  # 改用notebook样式

from Constants import Constants as const

tqdm.pandas(desc="Processing...")  # initialise tqdm's pandas integration for notebook use

# Try to append address information

In [None]:
# Regression data set (Stata file) that the address information will be appended to.
reg_data_path = os.path.join(const.RESULT_PATH, '20250603_stock_act_reg_data_v2.dta')
reg_df = pd.read_stata(reg_data_path)

# 10-K header data archive; only previewed in the next cell.
header_zip_path = os.path.join(const.DATABASE_PATH, 'bill mcdonald',
                               'LoughranMcDonald_10-K_HeaderData_1993-2024.zip')
bill_header_df = pd.read_csv(header_zip_path)

In [None]:
# Preview the first rows of the 10-K header data to check which columns are available.
bill_header_df.head()

In [None]:
# Treat empty-string CIKs as missing so they can be back-filled later.
reg_df['cik'] = reg_df['cik'].replace({'': np.nan})

In [None]:
# Build a GVKEY -> CIK lookup from the Compustat firm-name file, using the most recent
# complete record with fyear <= 2015 for each firm.
ctat_df = pd.read_csv(os.path.join(const.COMPUSTAT_PATH, '1985_2024_ctat_firm_names.zip'),
                      usecols=[const.GVKEY, const.CIK, 'fyear'])
ctat_df = (
    ctat_df.loc[ctat_df['fyear'] <= 2015]
    .dropna(how='any')
    # Sort explicitly: keep='last' only returns the latest fiscal year when the rows are
    # ordered by fyear, which must not be left to the file's on-disk order.
    .sort_values([const.GVKEY, 'fyear'])
    .drop_duplicates(subset=[const.GVKEY], keep='last')
    .drop(columns=['fyear'])
)


In [None]:
# Attach the Compustat CIK to every regression row; where Compustat has one it takes
# precedence, otherwise the CIK already present in reg_df is kept.
cik_ctat_col = const.CIK + '_ctat'
reg_df2 = reg_df.merge(ctat_df, how='left', on=[const.GVKEY], suffixes=('', '_ctat'))
reg_df2[const.CIK] = reg_df2[cik_ctat_col].fillna(reg_df2[const.CIK])
reg_df2 = reg_df2.drop(columns=[cik_ctat_col])

In [None]:
# Load the Compustat firm location history.
ctat_df = pd.read_csv(os.path.join(const.COMPUSTAT_PATH, '1950_2024_ctat_firm_location_information.zip'),
                      usecols=[const.GVKEY, 'datadate', 'fyear', 'add1', 'add2', 'add3', 'add4', 'addzip', 'city', 'county', 'state'])

# 1. Back-fill missing fyear from datadate.
# NOTE(review): assumes datadate is stored as a date string; integer YYYYMMDD values would
# be misread by pd.to_datetime - confirm against the file.
ctat_df['datadate_parsed'] = pd.to_datetime(ctat_df['datadate'], errors='coerce')

# Compustat convention: fiscal years ending January-May belong to the previous calendar
# year, those ending June-December to the current one. The previous cutoff (month < 7)
# pushed June year-ends into the wrong fiscal year.
first_current_year_month = 6

# Only touch rows where fyear is missing.
mask_missing_fyear = ctat_df['fyear'].isna()
missing_dates = ctat_df.loc[mask_missing_fyear, 'datadate_parsed']
ctat_df.loc[mask_missing_fyear, 'fyear'] = np.where(
    missing_dates.dt.month < first_current_year_month,
    missing_dates.dt.year - 1,
    missing_dates.dt.year
)

# 2. Drop rows in which add1, add2, add3, add4, addzip, city and state are all missing.
address_cols = ['add1', 'add2', 'add3', 'add4', 'addzip', 'city', 'state']
ctat_df = ctat_df.dropna(subset=address_cols, how='all')

# 3. Keep, per firm and fiscal year, only the row with the latest datadate.
# Rows whose fyear could not be derived (unparseable datadate) cannot be matched on
# fiscal year and would break the later integer cast, so they are removed explicitly.
ctat_df = (ctat_df.dropna(subset=['fyear'])
                    .sort_values('datadate_parsed')
                    .drop_duplicates(subset=[const.GVKEY, 'fyear'], keep='last'))

# Final output (without the temporary datadate_parsed column).
ctat_df = ctat_df.drop(columns=['datadate_parsed'])

print(f"✅ 清理完成：剩余 {len(ctat_df)} 行，{ctat_df[const.GVKEY].nunique()} 家公司")


In [None]:
# Preview the cleaned location history.
ctat_df.head()

In [None]:
# 1. Make the merge keys the same dtype on both sides.
ctat_df['gvkey'] = ctat_df['gvkey'].astype(int)
reg_df2['gvkey'] = reg_df2['gvkey'].astype(int)
ctat_df['fyear'] = ctat_df['fyear'].astype(int)
reg_df2['fiscal_year'] = reg_df2['fiscal_year'].astype(int)

ctat_df['datadate_parsed'] = pd.to_datetime(ctat_df['datadate'], errors='coerce')

# 2. Merge the Compustat address of the matching fiscal year onto the regression data.
# validate guards against duplicated (gvkey, fyear) rows silently multiplying observations.
merged_df = pd.merge(reg_df2, ctat_df, how='left', left_on=['gvkey', 'fiscal_year'], right_on=['gvkey', 'fyear'],
                     suffixes=('', '_ctat'), validate='many_to_one')

# 3. For observations without an address in the same year, fall back to the firm's most
#    recent address.

# Address fields to fill, named as in ctat_df.
address_cols = ['add1', 'add2', 'add3', 'add4', 'addzip', 'city', 'state']

# A Compustat column whose name already existed in reg_df2 carries the '_ctat' suffix after
# the merge; resolve each field to the column that actually holds the Compustat value.
merged_address_cols = {
    col: (f'{col}_ctat' if f'{col}_ctat' in merged_df.columns else col)
    for col in address_cols
}

# Most recent record of every gvkey, taken as one whole row. groupby().first() would instead
# return the first non-null value of each column and could mix fields of different filings.
latest_ctat = (ctat_df.sort_values('datadate_parsed', ascending=False)
                        .drop_duplicates(subset='gvkey', keep='first')
                        .set_index('gvkey'))

# Only rows with no Compustat address at all are filled. Filling field by field would patch
# legitimately empty fields (e.g. add2) of a matched address with parts of another address.
no_address = merged_df[list(merged_address_cols.values())].isna().all(axis=1)
for col, merged_col in merged_address_cols.items():
    fallback = merged_df['gvkey'].map(latest_ctat[col])
    merged_df[merged_col] = merged_df[merged_col].where(~no_address, fallback)

# 4. Drop helper columns (e.g. fyear_ctat); errors='ignore' tolerates columns that are absent.
if 'fyear_ctat' in merged_df.columns:
    merged_df = merged_df.drop(columns=['fyear_ctat', 'datadate_parsed', 'county', 'datadate'],
                               errors='ignore')

print(f"✅ 合并和补全完成，总行数：{len(merged_df)}")


In [None]:
# Checkpoint the merged address data so later cells can restart from here.
pd.to_pickle(merged_df, os.path.join(const.TEMP_PATH, '20250603_stock_act_address_data.pkl'))

In [None]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Distinct address combinations found in the merged data.
address_cols = ['add1', 'add2', 'add3', 'add4', 'city_ctat', 'state', 'addzip']
unique_addresses = merged_df[address_cols].drop_duplicates()
print('Unique addresses number: ' + str(unique_addresses.shape[0]))

# Replace NaN with empty strings and join the parts into one query string.
unique_addresses['full_address'] = unique_addresses[address_cols].fillna('').apply(
    lambda row: ' '.join(row.values.astype(str)), axis=1
)

# Rate-limited geocoder (at least one second between requests).
geolocator = Nominatim(user_agent="my_unique_app_2027")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Trial run on the first ten addresses only.
results = []
for address in tqdm(unique_addresses['full_address'].head(10)):
    coords = (None, None)  # recorded when the lookup fails or finds nothing
    try:
        location = geocode(address)
        if location:
            coords = (location.latitude, location.longitude)
    except Exception as e:
        print(f"Error on {address}: {e}")
    results.append((address, *coords))

# Collect the trial results in a DataFrame.
geo_df = pd.DataFrame(results, columns=['full_address', 'latitude', 'longitude'])

In [None]:
import time

# ========== Setup ==========
address_cols = ['add1', 'add2', 'add3', 'add4', 'city_ctat', 'state', 'addzip']
# .copy() so the column assignment below writes to an independent frame.
unique_addresses = merged_df[address_cols].drop_duplicates().copy()
print('Unique addresses number: ' + str(unique_addresses.shape[0]))

# Join the address parts into one query string.
unique_addresses['full_address'] = unique_addresses[address_cols].fillna('').apply(
    lambda row: ' '.join(row.values.astype(str)), axis=1
)

# Addresses still to be requested, successful results, and definitive misses.
remaining = set(unique_addresses['full_address'])
results = {}
not_found = set()

# Pool of user agents cycled across attempts.
# NOTE(review): rotating user agents to get around throttling conflicts with the Nominatim
# usage policy; consider a single descriptive user agent or a bulk-friendly provider.
user_agents = [f"wya_address_app_attempt_{i}" for i in range(1, 11)]

# ========== Retry loop ==========
max_retries = 10
for attempt in range(max_retries):
    if not remaining:
        break

    user_agent = user_agents[attempt % len(user_agents)]
    print(f"🔄 Attempt {attempt + 1} with user_agent {user_agent}")
    geolocator = Nominatim(user_agent=user_agent)
    # swallow_exceptions=False: by default RateLimiter converts service errors into a None
    # result, which made the except branch below unreachable and errors look like misses.
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=2, swallow_exceptions=False)

    # Request every address that is still outstanding.
    failed_this_round = set()
    for address in tqdm(remaining):
        try:
            location = geocode(address)
        except Exception as e:
            print(f"Error on {address}: {e}")
            failed_this_round.add(address)
            continue
        if location:
            results[address] = (location.latitude, location.longitude)
        else:
            # The service answered but found nothing; repeating the same query cannot help.
            not_found.add(address)

    # Only addresses that errored are worth retrying.
    remaining = failed_this_round

    # Pause before the next attempt, but not after the final one.
    if remaining and attempt < max_retries - 1:
        print(f"⏳ Waiting 10 minutes before next retry (still {len(remaining)} addresses left)...")
        time.sleep(10 * 60)  # 10 minutes in seconds

# Everything without coordinates (errors and misses) stays in `remaining` for later cells.
remaining = remaining | not_found
if remaining:
    print(f"⚠️ {len(remaining)} addresses still without coordinates")
else:
    print("✅ All addresses successfully geocoded!")

# ========== Collect results ==========
geo_df = pd.DataFrame.from_dict(results, orient='index', columns=['latitude', 'longitude']).reset_index()
geo_df = geo_df.rename(columns={'index': 'full_address'})


In [None]:
# Show the addresses that are still without coordinates.
remaining

In [None]:
# Spot-check the geocoder with a single hand-written address.
probe_address = '#245, 7 West 41st Avenue, San Mateo, CA, 94403'
geolocator = Nominatim(user_agent='wya_test_2077')
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=2)
geocode(probe_address)

In [None]:
import requests

# ========== Your input ==========
# `remaining` is the collection of full-address strings that still need coordinates, e.g.
# ['1600 Pennsylvania Ave NW, Washington, DC 20500', 'New York, NY 10001', ...]
# It is left behind by the Nominatim retry cell; it can also be reloaded from disk:
# remaining = set(pd.read_csv("failed_addresses.csv")['full_address'])

# The Mapbox token is read from the environment so the credential is never stored in the
# notebook. Export MAPBOX_ACCESS_TOKEN before starting the kernel.
MAPBOX_ACCESS_TOKEN = os.environ.get("MAPBOX_ACCESS_TOKEN")
if not MAPBOX_ACCESS_TOKEN:
    raise RuntimeError("MAPBOX_ACCESS_TOKEN is not set; export it before running the Mapbox geocoding cells.")

# ========== Geocode Function ==========
def geocode_mapbox(address, access_token, timeout=10):
    """Geocode one address with the Mapbox forward-geocoding endpoint.

    Args:
        address: Free-text address to look up.
        access_token: Mapbox access token sent with the request.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(latitude, longitude)`` tuple for the best match, or ``(None, None)`` when the
        address is blank, nothing is found, or the request fails (the failure is printed).
    """
    # A blank query cannot match anything and would only produce a malformed URL.
    if not isinstance(address, str) or not address.strip():
        return None, None

    base_url = "https://api.mapbox.com/geocoding/v5/mapbox.places/"
    # safe='' also escapes '/', which would otherwise split the query into extra path segments.
    url = f"{base_url}{requests.utils.quote(address, safe='')}.json"
    params = {
        "access_token": access_token,
        "limit": 1,
        "country": "US"  # Optional: restrict to US
    }

    try:
        # Without a timeout a stalled connection would block the whole batch indefinitely.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        features = response.json().get('features') or []
        if not features:
            return None, None
        coords = features[0]['geometry']['coordinates']
        return coords[1], coords[0]  # Mapbox returns [lon, lat]; callers expect (lat, lon)
    except Exception as e:
        # Best effort: report and continue. The request URL inside the exception text
        # contains the token, so it is masked before printing.
        message = str(e).replace(str(access_token), '***') if access_token else str(e)
        print(f"❌ Error for address: {address}, {message}")
        return None, None

# ========== Run Geocoding ==========
# Second pass over the addresses still outstanding; every one of them gets an entry in
# `results`, holding (None, None) when Mapbox also has no answer.
for address in tqdm(remaining):
    results[address] = geocode_mapbox(address, MAPBOX_ACCESS_TOKEN)
    time.sleep(0.5)  # Slight delay to respect Mapbox rate limits (600 req/minute for free tier)


In [None]:
# Turn the {address: (lat, lon)} mapping into a three-column DataFrame.
geo_df = (
    pd.DataFrame.from_dict(results, orient='index', columns=['latitude', 'longitude'])
    .rename_axis('full_address')
    .reset_index()
)

In [None]:
# Addresses for which no latitude was obtained.
geo_df[geo_df['latitude'].isna()]

In [None]:
# Write the geocoding results to an Excel workbook.
geo_excel_path = os.path.join(const.TEMP_PATH, '20250603_stock_act_address_data.xlsx')
geo_df.to_excel(geo_excel_path, index=False)

In [None]:
# Read the geocoding workbook back in.
geo_clear_path = os.path.join(const.TEMP_PATH, '20250603_stock_act_address_data.xlsx')
geo_df_clear = pd.read_excel(geo_clear_path)

In [None]:
# Attach coordinates to each unique address, then to every regression row.
# validate='many_to_one' makes a duplicated full_address in the workbook (or a duplicated
# address combination) raise instead of silently multiplying regression observations.
unique_addresses_with_geocode = unique_addresses.merge(geo_df_clear, on='full_address',
                                                       validate='many_to_one')
reg_df3 = merged_df.merge(unique_addresses_with_geocode, on=address_cols, how='left',
                          validate='many_to_one')


In [None]:
# Regression rows that ended up without a latitude.
reg_df3[reg_df3['latitude'].isna()]

In [None]:
from geopy.distance import geodesic

# Coordinates of Washington D.C.
dc_coords = (38.89511, -77.03637)  # (latitude, longitude)


def _distance_to_dc_km(row):
    """Return the geodesic distance in kilometres from the row's coordinates to dc_coords.

    Returns None when either the latitude or the longitude of the row is missing.
    """
    if pd.isnull(row['latitude']) or pd.isnull(row['longitude']):
        return None
    return geodesic((row['latitude'], row['longitude']), dc_coords).kilometers


# New column holding the distance to Washington D.C. in kilometres.
reg_df3['distance_to_dc_km'] = reg_df3.apply(_distance_to_dc_km, axis=1)



In [None]:
# Summary statistics of the computed distance, as a sanity check.
reg_df3['distance_to_dc_km'].describe()

In [None]:
# Cast cik to float, then export everything except add3/add4 as a Stata file.
reg_df3['cik'] = reg_df3['cik'].astype('float64')
output_path = os.path.join(const.RESULT_PATH, '20250604_stock_act_reg_data_v1.dta')
reg_df3.drop(columns=['add3', 'add4']).to_stata(output_path, write_index=False, version=119)