In [49]:
import os

# Load the sensitive-word list: one entry per line of stopwords.txt, with
# surrounding whitespace removed. Blank lines are skipped so the list never
# contains an empty string.
with open('stopwords.txt', mode='r', encoding='utf-8') as file:
    stopwords = [word for word in (line.strip() for line in file) if word]

In [51]:
from openai import OpenAI
import pandas as pd
from tqdm import tqdm

# Build the API client. Credentials are taken from the environment when the
# variables are set, so a real key does not have to be pasted into the
# notebook; the original placeholders remain as the fallback values.
client = OpenAI(
    api_key=os.environ.get('OPENAI_API_KEY', 'YOUR KEY'),
    base_url=os.environ.get('OPENAI_BASE_URL', "YOUR URL"),
)

def get_label(title):
    """Ask the chat model for an urgency score (1-3) for a news title.

    Every entry of the module-level ``stopwords`` list that occurs in the
    title is removed first; the cleaned title is then sent together with a
    system prompt that describes the scoring scale and gives example titles.

    Args:
        title: The news title text.

    Returns:
        The message content of the first completion choice, unmodified.
    """
    # Sensitive-word filtering: drop every stopword found in the title.
    for banned in stopwords:
        if banned in title:
            title = title.replace(banned, '')

    # System prompt: annotator role and scale, followed by scored examples.
    system_prompt = ''.join([
        '你是一名优秀的标注人员，十分擅长新闻文本标注。可以根据新闻标题语气进行紧急程度打分（1-3分）。3分表示特别紧急，2分表示紧急，1分表示不紧急。',
        '请参考下面的一些案例。',
        '3分的标题：珠海台风预警升级！海滨景区、沙滩浴场关闭！冷空气也来了',
        '2分的标题：(1) 新闻早茶｜受台风“小犬”影响，珠海交通服务有调整 (2) 对防汛救灾工作作出重要指示',
        '1分的标题：让全网心疼的南通老张，被认定见义勇为｜南通早七点',
    ])

    # User prompt: the instruction to answer with the bare number, then the title.
    user_prompt = '下面有一个标题，需要你根据标题进行打分，请直接给出打分的具体数字，不需要对结果进行任何解释。标题：' + title

    conversation = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ]
    completion = client.chat.completions.create(model="deepseek-v3", messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [52]:
# Load the article table and keep only the five columns used below.
df = pd.read_csv('./data/typhoon.csv').loc[:, ['title', 'read_num', 'title_len', 'date', 'link']]

In [54]:
# Display the loaded frame; the rendered table is truncated in the middle.
df

Unnamed: 0,title,read_num,title_len,date,link
0,珠海台风预警升级！海滨景区、沙滩浴场关闭！冷空气也来了→,100001,28,2023-10-06 16:25:00,http://mp.weixin.qq.com/s?__biz=MzA4MjEzOTQ4Mw...
1,珠海台风预警生效！“小犬”\u200b趋向广东路径有变，将在这里掠过或登陆！返程注意→,68734,43,2023-10-05 11:12:00,http://mp.weixin.qq.com/s?__biz=MzA4MjEzOTQ4Mw...
2,新闻早茶｜受台风“小犬”影响，珠海交通服务有调整→,46764,25,2023-10-05 07:19:00,http://mp.weixin.qq.com/s?__biz=MzA4MjEzOTQ4Mw...
3,抓实抓细各项措施！黄志豪研究部署台风“小犬”防御工作,13738,26,2023-10-04 21:46:00,http://mp.weixin.qq.com/s?__biz=MzA4MjEzOTQ4Mw...
4,“小犬”已加强为强台风级！预计影响珠海时间……假期返程注意！,100001,30,2023-10-02 20:15:00,http://mp.weixin.qq.com/s?__biz=MzA4MjEzOTQ4Mw...
...,...,...,...,...,...
5066,"让全网心疼的南通老张，被认定见义勇为｜南通早七点""",10191,25,2023-09-06 06:37:00,http://mp.weixin.qq.com/s?__biz=MzA5MjQ1MjkwNg...
5067,进入防风Ⅰ级应急响应！汕尾发布紧急动员令,100001,20,2023-08-31 14:45:00,http://mp.weixin.qq.com/s?__biz=MjM5MTM1NTM1MA...
5068,"紧急通知！超强台风将经过赣州""",100001,15,2023-07-27 11:53:00,http://mp.weixin.qq.com/s?__biz=MjM5OTc2Mzg2Mg...
5069,"习近平对防汛救灾工作作出重要指示""",183,17,2023-08-02 16:23:00,http://mp.weixin.qq.com/s?__biz=MzI0NjE4MDU3Nw...


In [55]:
# Label every title that does not yet have a row in the output CSV.
# Links already present in the file are skipped, so an interrupted run can be
# resumed simply by re-running this cell (this replaces a hard-coded resume
# index that only matched one particular session). Each result is appended
# immediately so completed work survives a later failure.
try:
    labeled_links = set(
        pd.read_csv('./data/title_emergency.csv', names=['link', 'em_label'])['link']
    )
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No earlier output to resume from.
    labeled_links = set()

for i, r in tqdm(df.iterrows(), total=len(df)):
    if r['link'] in labeled_links:
        continue
    emergency = get_label(r['title'])
    row = [r['link'], emergency]
    # Append a single headerless, index-free row: link, label.
    pd.DataFrame([row]).to_csv('./data/title_emergency.csv', index=None, mode='a', header=None)
    labeled_links.add(r['link'])

100%|███████████████████████████████████████████████████| 5071/5071 [28:09<00:00,  3.00it/s]


# 合并数据

In [56]:
# Load the saved labels (the file has no header row) and keep one row per link.
# The label column is read as text so it has the same dtype whether the file
# holds raw model replies such as '2分' or only plain digits; the later
# `.str` clean-up requires string values.
df_em = pd.read_csv(
    './data/title_emergency.csv',
    names=['link', 'em_label'],
    dtype={'em_label': str},
)
df_em = df_em.drop_duplicates(subset='link')
df_em

Unnamed: 0,link,em_label
0,http://mp.weixin.qq.com/s?__biz=MzA4MjEzOTQ4Mw...,3
1,http://mp.weixin.qq.com/s?__biz=MzA4MjEzOTQ4Mw...,3
2,http://mp.weixin.qq.com/s?__biz=MzA4MjEzOTQ4Mw...,2
3,http://mp.weixin.qq.com/s?__biz=MzA4MjEzOTQ4Mw...,2
4,http://mp.weixin.qq.com/s?__biz=MzA4MjEzOTQ4Mw...,3
...,...,...
5066,http://mp.weixin.qq.com/s?__biz=MzA5MjQ1MjkwNg...,1
5067,http://mp.weixin.qq.com/s?__biz=MjM5MTM1NTM1MA...,3
5068,http://mp.weixin.qq.com/s?__biz=MjM5OTc2Mzg2Mg...,3
5069,http://mp.weixin.qq.com/s?__biz=MzI0NjE4MDU3Nw...,2


In [58]:
# Show the distinct label values present in the loaded labels.
set(df_em['em_label'])

{'1', '2', '2分', '3', '3分'}

In [60]:
# Normalize the labels: '2分' => '2', '3分' => '3'.
# Only string values are edited; anything else (numbers, missing values) is
# passed through unchanged, so the step also works when the column was not
# loaded as text.
df_em['em_label'] = df_em['em_label'].map(
    lambda label: label.replace('分', '') if isinstance(label, str) else label
)

In [62]:
# Attach the urgency label to each article by joining the two frames on `link`.
new_df = df.merge(df_em, on='link')

In [65]:
# Overwrite the labels file with the current df_em, without index or header row.
df_em.to_csv('./data/title_emergency.csv', header=False, index=False)