In [1]:
import pandas as pd
import requests
import os
from dotenv import load_dotenv
from tqdm import tqdm

In [2]:
# Load variables from a local .env file (if present) into the process
# environment before reading the key. `load_dotenv` is imported at the top of
# the notebook but was never called, so a key stored only in .env was not
# visible to os.getenv.
load_dotenv()

# None when the variable is not set anywhere.
DEEPSEEK_API_KEY = os.getenv('DEEPSEEK_API_KEY')

In [3]:
def deepseek_label(prompt, api_key, model="deepseek-chat", timeout=30):
    """Send one prompt to the DeepSeek chat-completions endpoint and return the reply.

    Args:
        prompt: Text sent as the single ``user`` message.
        api_key: Bearer token placed in the ``Authorization`` header.
        model: Model name placed in the request payload.
        timeout: Value forwarded to ``requests.post`` as ``timeout`` (seconds).

    Returns:
        The ``content`` of the first choice's message, with surrounding
        whitespace stripped.

    Raises:
        ValueError: If ``api_key`` is empty or ``None``.
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``). Other ``requests`` exceptions
            (timeouts, connection errors) propagate unchanged.
    """
    # Fail fast: without this check a missing key is sent as "Bearer None" and
    # only surfaces later as an opaque authentication error for every row.
    if not api_key:
        raise ValueError(
            "DeepSeek API key is missing; set the DEEPSEEK_API_KEY environment variable."
        )

    url = "https://api.deepseek.com/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        # temperature 0 is requested so repeated calls are as stable as the API allows
        "temperature": 0
    }
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content'].strip()

In [4]:
def row_to_prompt(row):
    """Render one transaction row as the labelling prompt sent to the model.

    ``row`` is indexed by column name (``row['Date']`` etc.). Each field is
    written as ``<Chinese label>：<value>``, the fields are joined with a
    full-width comma, and the instruction asking for a single category name
    follows on a new line.
    """
    labelled_columns = (
        ("日期", "Date"),
        ("时间", "Time"),
        ("类别", "Category"),
        ("收支", "in/out"),
        ("金额", "Amount"),
        ("描述", "Product_Description"),
        ("状态", "Status"),
        ("备注", "Note"),
        ("对方", "Counterparty"),
        ("对方账号", "Counterparty_Account"),
        ("支付方式", "Payment_Method"),
    )
    details = "，".join(
        f"{label}：{row[column]}" for label, column in labelled_columns
    )
    instruction = "请为这条流水分配一个详细类别（如：饮料、投资、交通等），只输出类别名称。"
    return f"{details}。\n{instruction}"

In [5]:
# 0. Load data: the freshly cleaned transactions and the previously labelled file.
df = pd.read_csv('Data/update/cleaned.csv')
df_labeled = pd.read_csv('Data/cleaned_labeled.csv')

# Carry over existing labels by matching on Date + Time.
merge_keys = ['Date', 'Time']

# Reduce the labelled file to at most one label per (Date, Time) before joining.
# Without this, repeated keys in the labelled file multiply the matching rows of
# `df`, and because `df` is later written back to the labelled file the
# duplication compounds on every re-run. Rows without a label are dropped first
# so a key's usable label wins over an empty one.
# NOTE(review): transactions sharing the exact same Date and Time receive the
# same carried-over label; confirm this is acceptable for the data.
label_lookup = (
    df_labeled[merge_keys + ['sub_category']]
    .dropna(subset=['sub_category'])
    .drop_duplicates(subset=merge_keys, keep='first')
    .rename(columns={'sub_category': 'sub_category_labeled'})
)

df = df.merge(
    label_lookup,
    on=merge_keys,
    how='left',
    validate='many_to_one'  # guard: the right side must be unique per key
)

if 'sub_category' not in df.columns:
    df['sub_category'] = ''

# Prefer the carried-over label, fall back to whatever `df` already had, and
# normalise missing values to '' so "unlabelled" has a single representation
# (empty CSV cells are read back as NaN, not '').
df['sub_category'] = (
    df['sub_category_labeled']
    .combine_first(df['sub_category'])
    .fillna('')
)
df = df.drop(columns=['sub_category_labeled'])

In [6]:
# Label every row that has no sub_category yet by asking the model, writing
# periodic checkpoints so a long run is not lost if the kernel dies.
checkpoint_path = 'Data/update/cleaned_labeled.csv'
checkpoint_every = 100

# A row is unlabelled when the value is '' or missing (NaN).
mask = df['sub_category'].isna() | (df['sub_category'] == '')

failed_idx = []           # rows whose request failed; they stay unlabelled
labels_since_save = 0     # labels written since the last checkpoint


def _write_checkpoint():
    """Write the current frame to the checkpoint file; report, don't raise, on I/O errors."""
    try:
        df.to_csv(checkpoint_path, index=False)
    except OSError as e:
        print(f"Checkpoint write failed: {e}")


for idx in tqdm(df.index[mask]):
    prompt = row_to_prompt(df.loc[idx])
    try:
        label = deepseek_label(prompt, DEEPSEEK_API_KEY)
    except Exception as e:
        # Best effort: report the failure and keep going with the next row.
        print(f"Error at {idx}: {e}")
        failed_idx.append(idx)
        continue

    df.at[idx, 'sub_category'] = label
    labels_since_save += 1
    if labels_since_save >= checkpoint_every:
        _write_checkpoint()
        labels_since_save = 0

# Flush the last partial batch so the checkpoint reflects every label obtained.
if labels_since_save:
    _write_checkpoint()

if failed_idx:
    print(f"{len(failed_idx)} rows could not be labelled and remain empty; re-run this cell to retry them.")

100%|██████████| 684/684 [19:57<00:00,  1.75s/it]


In [7]:
# Persist the labelled table (without the index column) to the same file that
# the load cell reads back as `df_labeled`.
labeled_output_path = 'Data/cleaned_labeled.csv'
df.to_csv(labeled_output_path, index=False)

In [8]:
# Show the distinct labels currently present in the sub_category column.
sub_category_values = df['sub_category']
sub_category_values.unique()

array(['购物', '红包', '投资', '教育', '通讯费', '学术资源', '捐款/慈善', '房租', '人情往来',
       '红包/礼物', '转账', '社交红包', '红包/礼金', '红包支出', '家居用品', '红包/转账', '社交/娱乐',
       '饮料', '娱乐服务', '社交支出', '保险', '社交娱乐', '数字服务订阅', '通讯服务', '娱乐',
       '社交/人情', '社交', '服饰配件', '社交/人情往来', '礼物', '打赏/小费', '捐款/公益', '娱乐订阅',
       '餐饮', '电子产品', '食品杂货', '交通', '零食/小吃', '慈善捐款', '服装', '餐饮外卖',
       '零食/便利店购物', '通讯费用', '聚餐', '零食/饮料', '零食/自动售货机', '洗衣服务', '洗衣',
       '知识付费', '零食/便利店', '零食', '社交/红包', '商业服务', '退款', '食品', '办公用品',
       '转账退款', '住宿', '打赏/礼物', '文具用品', '快递费', '快餐', '便利店/自动售货机', '社交费用',
       '教育费用', '转账/汇款', '收入-转账', '收入-零售', '打赏/赞赏', '自动售货机/便利店', '快递服务',
       '红包收入', '快餐/餐饮', '社交/人情支出', '文具/书籍', '收入-个人转账', '网购', '收款', '转账收入',
       '教育服务', '数字娱乐订阅', '亲友转账', '小额收款', '医疗费用', '收入', '网购/电子产品',
       '电子产品/数码产品', '转账/收款', '书籍/文具', '亲友借款', '收入-零售收入', '食品/超市购物',
       '生活用品', '收入-商家收款', '教育支出', '转账收款', '家庭支出', '提现', '软件服务', '社交转账',
       '甜点/蛋糕', '旅游门票', '食品/餐饮', '小吃/零食', '家居建材', '转账/红包', '电子产品/网购',
       '红包/礼品', '礼品/鲜

In [10]:

# # 1. 选择特征和标签
# # 需要数值化的特征
# categorical_features = ['Date', 'Time', 'Category', 'in/out', 'Status', 'Note', 'Counterparty', 'Counterparty_Account', 'Payment_Method']
# text_feature = 'Product_Description'
# target = 'new_category'

# # 2. 标注部分数据
# # 标注 investment
# df.loc[
#     (df['new_category'] == '') &
#     (df['Product_Description'].str.contains('买入|定期理财-建信养老飞月宝|余额宝-蚂蚁星愿主动攒入|银行产品-阜欣欣1号', na=False)),
#     ['new_category', 'in/out']
# ] = ['investment', 1]

# df.loc[
#     (df['new_category'] == '') &
#     (df['Product_Description'].str.contains('发放|卖出|分红|派息|赎回', na=False)),
#     ['new_category', 'in/out']
# ] = ['investment', -1]

# # 标注 drink
# df.loc[
#     (df['new_category'] == '') &
#     (df['in/out'] == 1) &
#     (
#         df['Counterparty'].str.contains('星巴克|农夫山泉|古茗|友宝|可口可乐|coffee', na=False, case=False) |
#         df['Product_Description'].str.contains('星巴克|农夫山泉|古茗|友宝|可乐|牛奶|雪碧|苏打水|送水|饮料|纯净水|汽水|啤酒|coffee|饮品|茶百道', na=False, case=False)
#     ),
#     'new_category'
# ] = 'drink'

# # 标注 incity_transport
# df.loc[
#     (df['new_category'] == '') &
#     (df['in/out'] == 1) &
#     (
#         df['Product_Description'].str.contains('滴滴.*车|高德.*车|单车|哈啰|骑行|地铁|公交|轨道交通|打车|快车|城市通|杭州通', na=False) |
#         df['Counterparty'].isin([
#             '上海都畅数字技术有限公司', '杭州优行科技有限公司', '西安城市通发展有限责任公司', '泾阳县宏安客运有限公司',
#             '咸阳市公共交通集团公司', '叮嗒出行', '滴滴出行', '杭州市民卡', '杭州地铁运营有限公司',
#             '上海新上铁实业发展集团有限公司', '哈啰出行'
#         ])
#     ),
#     'new_category'
# ] = 'incity_transport'

# # 标注 train_ticket
# df.loc[
#     (df['new_category'] == '') &
#     (df['in/out'] == 1) &
#     (df['Product_Description'].str.contains('火车票', na=False)),
#     'new_category'
# ] = 'train_ticket'

# # 标注 airplane_ticket
# df.loc[
#     (df['new_category'] == '') &
#     (df['in/out'] == 1) &
#     (df['Product_Description'].str.contains('机票', na=False)),
#     'new_category'
# ] = 'airplane_ticket'

# # 标注red_packet, -1
# df.loc[
#     (df['new_category'] == '') &
#     df['Product_Description'].isin(['蚂蚁森林-光伏治沙现金奖励', '现金红包-来自蚂蚁森林']),
#     ['new_category', 'in/out']
# ] = ['red_packet', -1]

# # 标注tissue
# df.loc[
#     (df['new_category'] == '') &
#     df['Product_Description'].str.contains('抽纸|卷纸|纸巾|面巾|湿巾',na=False),
#     'new_category'
# ] = 'tissue'

# # 标注 haircut

# df.loc[
#     (df['new_category'] == '') &
#     df['Product_Description'].str.contains('优剪|洗剪吹|美发|理发|剪发', na=False),
#     'new_category'
# ] = 'haircut'

# # 标注 mobile_communication
# # 话费、通讯相关
# df.loc[
#     (df['new_category'] == '') &
#     df['Product_Description'].str.contains('ShunShop|话费', na=False),
#     'new_category'
# ] = 'mobile_communication'

# # 标注 KFC/肯德基等快餐
# df.loc[
#     (df['new_category'] == '') &
#     df['Product_Description'].str.contains('KFC|肯德基', na=False),
#     'new_category'
# ] = 'food'

# # 标注 electricity_fee
# # 电费
# df.loc[
#     (df['new_category'] == '') &
#     df['Product_Description'].str.contains('电费', na=False),
#     'new_category'
# ] = 'electricity_fee'

# # 标注 gas_fee
# # 燃气费
# df.loc[
#     (df['new_category'] == '') &
#     df['Product_Description'].str.contains('燃气', na=False),
#     'new_category'
# ] = 'gas_fee'

# # 标注 movie
# # 电影相关
# df.loc[
#     (df['new_category'] == '') &
#     df['Product_Description'].str.contains('电影', na=False),
#     'new_category'
# ] = 'movie'

# # 标注 clothing
# # 服饰装扮
# df.loc[
#     (df['new_category'] == '') &
#     df['Product_Description'].str.contains('睡衣|衬衫|皮带|短裤|背心|袜子|短袖|帽子|手套|鞋|长袖', na=False),
#     'new_category'
# ] = 'clothing'

# # 标注 shopping
# # 购物相关
# df.loc[
#     (df['new_category'] == '') &
#     df['Product_Description'].str.contains('物美|盒马|华润|好又多', na=False),
#     'new_category'
# ] = 'shopping'