In [5]:
import pandas as pd
import os

# ================= Configuration =================
# These file names must match the files on disk exactly.
file_corn = 'corn county ninput.xlsx'            # Corn table; .xlsx/.xls is detected by extension in read_file_smart
file_weather = 'ppt_tmin_tmean_tmax_stable_800m_1990_2019.csv'  # Weather table (aggregated per State/County/Year below)
file_ref = '25F121B9-3EB4-3BE7-AC50-870ADF7D8454.csv'  # Reference table used to look up State/County ANSI codes
output_file = 'final_panel_data_1990_2019.csv'  # Destination CSV for the merged panel
# =================================================

def read_file_smart(filepath):
    """
    Read a tabular file into a DataFrame, picking the reader by extension.

    Files ending in ``.xlsx`` / ``.xls`` are read with ``pd.read_excel``.
    Anything else is treated as delimited text and read with ``pd.read_csv``
    using a fixed list of (encoding, separator) candidates; the first
    candidate that parses without error wins.

    Args:
        filepath: Path of the file to read.

    Returns:
        The DataFrame produced by the first reader that succeeds.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        ValueError: If the Excel read fails, or if none of the candidate
            encodings can parse the text file. The underlying error is
            chained as ``__cause__`` so the real reason is not lost.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"找不到文件: {filepath}，请检查路径是否正确。")

    ext = os.path.splitext(filepath)[1].lower()

    # 1. Excel workbooks (.xlsx, .xls)
    if ext in ('.xlsx', '.xls'):
        print(f"检测到 Excel 文件: {filepath}，正在使用 read_excel 读取...")
        try:
            return pd.read_excel(filepath)
        except Exception as e:
            raise ValueError(
                f"读取 Excel 文件失败: {e}\n请确保安装了 openpyxl (pip install openpyxl)"
            ) from e

    # 2. Delimited text: try several encodings in order.
    print(f"检测到 CSV/文本 文件: {filepath}，正在尝试读取...")
    attempts = [
        ('utf-8-sig', None),   # UTF-8, with or without BOM; sep=None lets pandas sniff the delimiter
        ('gbk', None),
        ('utf-16', '\t'),
        ('ISO-8859-1', None),  # single-byte encoding that decodes any input; kept last as a catch-all
    ]

    last_error = None
    for encoding, sep in attempts:
        try:
            # sep=None (delimiter sniffing) requires the python engine.
            df = pd.read_csv(filepath, encoding=encoding, sep=sep, engine='python')
        except Exception as exc:
            # Best-effort fallback to the next candidate. Catching Exception
            # (not a bare except) lets Ctrl-C / SystemExit propagate.
            last_error = exc
            continue
        print(f"-> 成功！(编码: {encoding})")
        return df

    raise ValueError(f"无法读取文件 {filepath}，请检查文件格式。") from last_error

def clean_text(df, cols=('State', 'County')):
    """
    Normalize text key columns in place: cast to str, upper-case, strip.

    Missing values are left missing. A plain ``astype(str)`` would turn them
    into the literal string "NAN" after upper-casing, which would then behave
    like a real state/county name in later joins and group-bys.

    Args:
        df: DataFrame to clean. It is modified in place.
        cols: Names of the columns to normalize. Names that are not present
            in ``df`` are skipped.

    Returns:
        The same ``df`` object, to allow ``df = clean_text(df)``.
    """
    for col in cols:
        if col in df.columns:
            present = df[col].notna()
            normalized = df[col].astype(str).str.upper().str.strip()
            # Restore missing entries that astype(str) stringified.
            df[col] = normalized.where(present)
    return df

# ================= Main program =================
# Flat script: loads the corn, weather and reference tables, merges corn with
# per-(State, County, Year) weather means, attaches a 5-digit ANSI code and
# writes rows for 1990-2019 to `output_file`.

print("=== 开始处理 ===")

# 1. Load the three input tables.
df_corn = read_file_smart(file_corn)
df_weather = read_file_smart(file_weather)
df_ref = read_file_smart(file_ref)

# 2. Clean text.
print("\n正在清洗文本格式...")
# Strip header whitespace first, on every frame: clean_text and all the
# column checks below match names exactly, so a header like ' State' would
# otherwise be skipped.
for _frame in (df_corn, df_weather, df_ref):
    _frame.columns = [c.strip() if isinstance(c, str) else c for c in _frame.columns]

df_corn = clean_text(df_corn)
df_weather = clean_text(df_weather)
df_ref = clean_text(df_ref)

# Unify the year column name across the two tables being joined.
if 'YEAR' in df_corn.columns:
    df_corn = df_corn.rename(columns={'YEAR': 'Year'})
if 'Date' in df_weather.columns:
    # NOTE(review): assumes 'Date' already holds a plain year value — confirm
    # against the weather export before relying on the merge below.
    df_weather = df_weather.rename(columns={'Date': 'Year'})

# 3. Aggregate weather to one row per (State, County, Year).
print("正在聚合气象数据...")
weather_cols = ['ppt (inches)', 'tmin (degrees F)', 'tmean (degrees F)', 'tmax (degrees F)']
# Keep only the measurement columns that actually exist.
weather_cols = [c for c in weather_cols if c in df_weather.columns]

if weather_cols:
    df_weather_agg = df_weather.groupby(['State', 'County', 'Year'], as_index=False)[weather_cols].mean()
else:
    print("警告：未找到气象列，跳过聚合步骤。")
    df_weather_agg = df_weather

# 4. Merge corn and weather.
print("正在合并面板数据...")
join_keys = ['State', 'County', 'Year']
for key in join_keys:
    if key not in df_corn.columns:
        raise ValueError(f"玉米数据中缺少列: {key}")
    if key not in df_weather_agg.columns:
        raise ValueError(f"气象数据中缺少列: {key}")

df_panel = pd.merge(df_corn, df_weather_agg, on=join_keys, how='inner')
if df_panel.empty:
    print("警告：合并结果为空，请检查 State/County/Year 的取值是否一致。")

# 5. Attach ANSI codes from the reference table.
print("正在匹配 ANSI 代码...")
ansi_map = {}
# Locate the ANSI columns by name fragments, falling back to the usual names.
state_ansi_col = next((c for c in df_ref.columns if 'State' in c and 'ANSI' in c), 'State ANSI')
county_ansi_col = next((c for c in df_ref.columns if 'County' in c and 'ANSI' in c), 'County ANSI')

missing_ref_cols = [
    c for c in ('State', 'County', state_ansi_col, county_ansi_col)
    if c not in df_ref.columns
]
if missing_ref_cols:
    # Previously this case was swallowed silently and every code came out empty.
    print(f"警告：参考表缺少列 {missing_ref_cols}，无法匹配 ANSI 代码。")
else:
    ref_rows = df_ref.dropna(subset=[state_ansi_col, county_ansi_col])
    for state, county, state_code, county_code in zip(
        ref_rows['State'],
        ref_rows['County'],
        ref_rows[state_ansi_col],
        ref_rows[county_ansi_col],
    ):
        try:
            # 2-digit state code + 3-digit county code, zero padded.
            ansi_map[(state, county)] = f"{int(state_code):02d}{int(county_code):03d}"
        except (ValueError, TypeError, OverflowError):
            # Skip rows whose codes are not integer-like; nothing else is hidden.
            continue

# A keyed lookup over zipped columns also works when df_panel is empty, where
# a row-wise DataFrame.apply returns a DataFrame and breaks the assignment.
df_panel['ANSI Code'] = [
    ansi_map.get(key) for key in zip(df_panel['State'], df_panel['County'])
]
unmatched = int(df_panel['ANSI Code'].isna().sum())
if unmatched:
    print(f"警告：有 {unmatched} 行未匹配到 ANSI 代码。")

# 6. Keep 1990-2019 and save.
df_panel = df_panel[(df_panel['Year'] >= 1990) & (df_panel['Year'] <= 2019)]

print(f"\n处理完成！最终行数: {len(df_panel)}")
# utf-8-sig writes a BOM at the start of the file.
df_panel.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"文件已保存到: {output_file}")

=== 开始处理 ===
检测到 Excel 文件: corn county ninput.xlsx，正在使用 read_excel 读取...
检测到 CSV/文本 文件: ppt_tmin_tmean_tmax_stable_800m_1990_2019.csv，正在尝试读取...
-> 成功！(编码: utf-8-sig)
检测到 CSV/文本 文件: 25F121B9-3EB4-3BE7-AC50-870ADF7D8454.csv，正在尝试读取...
-> 成功！(编码: utf-8-sig)

正在清洗文本格式...
正在聚合气象数据...
正在合并面板数据...
正在匹配 ANSI 代码...

处理完成！最终行数: 20648
文件已保存到: final_panel_data_1990_2019.csv
