In [2]:
import os
import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime,timedelta
from langchain.chat_models import AzureChatOpenAI
from langchain.chains import create_extraction_chain
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer
from pprint import pprint
from openai import AzureOpenAI
import requests
import json

# Azure OpenAI client used by gpt_summarize for the final summary call.
# The API key is read from the environment so the credential is not stored in
# the notebook; set AZURE_OPENAI_API_KEY before running this cell (a missing
# variable fails here with a KeyError rather than later with an auth error).
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2023-09-01-preview",
    azure_endpoint="https://bmd.openai.azure.com/",
)

# Azure deployment passed to gpt_summarize for the final summary.
deployment_name = 'gpt-4-1106'
# Size of the look-back window, in days, used to decide whether an article is recent.
PAST_DAY = 3

# Take one timestamp so both dates below describe the same instant; two
# separate now() calls could land on different days around midnight.
_now = datetime.now()

# Current date as YYYY-MM-DD.
today_date = _now.strftime("%Y-%m-%d")

# Oldest date (YYYY-MM-DD) still treated as recent. The ISO format lets the
# scrapers compare dates as plain strings.
past_x_date = (_now - timedelta(days=PAST_DAY)).strftime("%Y-%m-%d")


# LangChain chat model used by extract() to run the extraction chain.
# The API key is read from the environment so the credential is not stored in
# the notebook; set AZURE_OPENAI_API_KEY before running this cell.
llm = AzureChatOpenAI(
    openai_api_version='2023-09-01-preview',
    deployment_name='gpt-35',
    azure_endpoint="https://bmd.openai.azure.com/",
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
)

# Extraction schemas handed to the LLM extraction chain. Every field is a
# required string; the dated variant adds a "datetime" field.
_TITLE_AND_LINK = ("news_article_title", "news_article_link")
_TITLE_LINK_AND_DATE = (*_TITLE_AND_LINK, "datetime")

schema = {
    "properties": {field: {"type": "string"} for field in _TITLE_AND_LINK},
    "required": list(_TITLE_AND_LINK),
}

schema_with_date = {
    "properties": {field: {"type": "string"} for field in _TITLE_LINK_AND_DATE},
    "required": list(_TITLE_LINK_AND_DATE),
}

def extract(content: str, schema: dict):
    """
    Extract content based on the provided schema using the LLM extraction chain.

    :param content: HTML content to extract data from. Callers in this file
        also pass BeautifulSoup search results; those are handed to the chain
        unchanged.
    :param schema: Schema definition for extraction.
    :return: Extracted data as returned by the chain, or an empty list when
        there is no content to extract from.
    """
    # A failed ``soup.find`` yields None and a failed ``find_all`` an empty
    # list; skip the LLM call rather than asking it to extract from "None"/"[]".
    if content is None or (isinstance(content, (str, list)) and not content):
        return []
    return create_extraction_chain(schema=schema, llm=llm).run(content)

def get_latest_news_twofirst(urls):
    """
    Load the 2firsts report page and extract its news titles and links.

    :param urls: Value handed to ``AsyncHtmlLoader`` (the caller in this file
        passes a single URL string). Only the first loaded document is parsed.
    :return: Records extracted from the elements with class ``report``, or an
        empty list when no document was loaded or no such element exists.
    """

    print(f"Scraping {urls} URLs...")
    loader = AsyncHtmlLoader(urls)
    docs = loader.load()
    if not docs:
        # Nothing was fetched; indexing docs[0] would raise IndexError.
        print("No document loaded.")
        return []

    soup = BeautifulSoup(docs[0].page_content, 'html.parser')
    report_elements = soup.find_all(class_="report")
    if not report_elements:
        # Avoid sending an empty result set to the LLM.
        print("No report elements found.")
        return []

    extracted_content = extract(schema=schema, content=report_elements)
    pprint(extracted_content)
    return extracted_content


# Additional functions for scraping various websites

def get_latest_news_vapepost(url):
    """
    Fetch the latest news from the Vapepost website.

    The front-page news block is sent to the LLM to get titles and links; each
    article page is then fetched to read its ``<time datetime=...>`` value and
    optional sub-title. Articles without a usable ``<time>`` tag are skipped.

    :param url: URL of the Vapepost website.
    :return: List of ``{"news_title", "news_link"}`` dicts for articles dated
        on or after ``past_x_date``.
    """

    print(f"Scraping {url} URL...")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    news_items = soup.find('div', id='tdi_17', class_='td_block_inner')
    result = []

    if news_items is None:
        # The expected block is missing (e.g. the page layout changed).
        print("News block not found.")
        pprint(result)
        return result

    extracted_content = extract(schema=schema, content=news_items)

    print(f"Extracting {len(extracted_content)} news articles...")
    for item in extracted_content:
        news_title = item['news_article_title']
        news_link = item['news_article_link']
        print (news_link,news_title)

        article_response = requests.get(news_link)
        article_soup = BeautifulSoup(article_response.content, 'html.parser')
        time_tag = article_soup.find('time')
        desc = article_soup.find('p', class_='td-post-sub-title')

        if not (time_tag and 'datetime' in time_tag.attrs):
            continue

        # Not every article has a sub-title; fall back to the bare title
        # instead of failing on ``None.text``.
        if desc is None:
            headline = news_title
        else:
            headline = news_title + ' ; 【description】 ' + desc.text

        datetime_value = datetime.fromisoformat(time_tag['datetime']).strftime("%Y-%m-%d")
        # Both sides are YYYY-MM-DD strings, so string comparison orders by date.
        if datetime_value >= past_x_date:
            print('add news:')
            print( headline, time_tag['datetime'])
            result.append({"news_title": headline, "news_link": news_link})
        else:
            print('out of date:')
            print( headline, time_tag['datetime'])

    pprint(result)
    return result

def get_latest_news_vapouround(url):
    """
    Fetch the latest news from the Vapouround website.

    Each ``<article>`` element is sent to the LLM separately; scanning stops
    at the first article older than ``past_x_date``.

    :param url: URL of the Vapouround website.
    :return: List of ``{"news_title", "news_link"}`` dicts for recent articles.
    """
    print( f"Scraping {url} URL...")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    blocks = soup.find_all('article')
    result = []
    print(f"Extracting {len(blocks)} news articles...")
    for block in blocks:
        extracted_content = extract(schema=schema_with_date, content=block)
        if not extracted_content:
            # Nothing recognised in this block; indexing [0] would raise.
            continue
        article = extracted_content[0]

        # Strip an ordinal suffix only when it follows a digit ("21st" -> "21").
        # An unanchored pattern also removes the "st" in "August" and makes
        # strptime fail for every August date.
        datetime_value = re.sub(r'(?<=\d)(st|nd|rd|th)\b', '', article['datetime'])
        datetime_value = datetime.strptime(datetime_value, "%d %B %Y").strftime("%Y-%m-%d")

        if datetime_value >= past_x_date:
            print('add news:')
            print( article['news_article_title'] , article['datetime'])
            result.append({"news_title": article['news_article_title'], "news_link": article['news_article_link']})
        else:
            print('out of date:')
            print( article['news_article_title'] , article['datetime'])
            # NOTE(review): stopping here assumes the page lists newest first - confirm.
            break

    pprint(result)
    return result

def get_latest_news_vapeast(url):
    """
    Fetch the latest news from https://vapeast.com/news/

    Each ``div.td-block-span4`` element is sent to the LLM separately;
    scanning stops at the first article older than ``past_x_date``.

    :param url: URL of the Vapeast news page.
    :return: List of ``{"news_title", "news_link"}`` dicts for recent articles.
    """
    print( f"Scraping {url} URL...")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    blocks = soup.find_all('div',class_ = 'td-block-span4')
    result = []
    print(f"Extracting {len(blocks)} news articles...")
    for block in blocks:
        extracted_content = extract(schema=schema_with_date, content=block)
        if not extracted_content:
            # Nothing recognised in this block; indexing [0] would raise.
            continue
        article = extracted_content[0]

        # Dates are expected in the form "January 26, 2024".
        datetime_obj = datetime.strptime(article['datetime'], "%B %d, %Y")
        datetime_value = datetime_obj.strftime("%Y-%m-%d")
        if datetime_value >= past_x_date:
            print('add news:')
            print( article['news_article_title'] , article['datetime'])
            result.append({"news_title": article['news_article_title'], "news_link": article['news_article_link']})
        else:
            print('out of date:')
            print( article['news_article_title'] , article['datetime'])
            # NOTE(review): stopping here assumes the page lists newest first - confirm.
            break

    pprint(result)
    return result

def get_latest_news_vape360(url):
    """
    Fetch the latest news from https://vaping360.com/vape-news/.

    The ``div.category-block`` listing is sent to the LLM to get titles and
    links; each article page is then fetched and its ``div.general-header``
    is sent to the LLM again to read the publication date.

    :param url: URL of the Vaping360 news page.
    :return: List of ``{"news_title", "news_link"}`` dicts for articles dated
        on or after ``past_x_date``.
    """

    print(f"Scraping {url} URL...")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    news_items = soup.find('div', class_='category-block')
    result = []

    if news_items is None:
        # The expected listing is missing (e.g. the page layout changed).
        print("News block not found.")
        pprint(result)
        return result

    extracted_content = extract(schema=schema, content=news_items)

    print(f"Extracting {len(extracted_content)} news articles...")
    for item in extracted_content:
        news_title = item['news_article_title']
        news_link = item['news_article_link']
        print (news_link,news_title)

        article_response = requests.get(news_link)
        article_soup = BeautifulSoup(article_response.content, 'html.parser')
        article_header = article_soup.find('div', class_='general-header')
        if article_header is None:
            # No header to read a date from; skip rather than query the LLM with None.
            print('no article header, skipped')
            continue

        article_content = extract(schema=schema_with_date, content=article_header)
        if not article_content:
            # Nothing recognised in the header; indexing [0] would raise.
            print('no date extracted, skipped')
            continue
        header_info = article_content[0]

        # Dates are expected in the form "January 26, 2024".
        date_obj = datetime.strptime(header_info['datetime'], "%B %d, %Y")
        formatted_date = date_obj.strftime("%Y-%m-%d")
        if formatted_date >= past_x_date:
            print('add news:')
            print( header_info['news_article_title'] , header_info['datetime'])
            result.append({"news_title": news_title, "news_link": news_link})
        else:
            print('out of date:')
            print( header_info['news_article_title'] , header_info['datetime'])
    pprint(result)
    return result


def get_latest_news_tobacco_reporter(url):
    """
    Collect recent articles from a page that uses ``post-block-item style3`` blocks.

    All matching blocks are sent to the LLM in one extraction call, and every
    returned record is checked against ``past_x_date``.

    :param url: Page to scrape (this file calls it for tobaccoreporter.com and
        vaporvoice.net).
    :return: List of ``{"news_title", "news_link"}`` dicts for recent articles.
    """
    print( f"Scraping {url} URL...")
    page = requests.get(url)
    parsed = BeautifulSoup(page.content, 'html.parser')
    post_blocks = parsed.find_all('div', class_='post-block-item style3')
    articles = extract(schema=schema_with_date, content=post_blocks)
    print(f"Extracting {len(articles)} news articles...")

    result = []
    for article in articles:
        # Normalise "January 26, 2024" to YYYY-MM-DD so it compares as a string.
        published = datetime.strptime(article['datetime'], "%B %d, %Y").strftime("%Y-%m-%d")
        is_recent = published >= past_x_date
        print('add news:' if is_recent else 'out of date:')
        print( article['news_article_title'] , article['datetime'])
        if is_recent:
            result.append({
                "news_title": article['news_article_title'],
                "news_link": article['news_article_link'],
            })

    pprint(result)
    return result

# Summarize the news
def gpt_summarize(combied,deployment_name):
    """
    Ask the chat model to turn the collected articles into a Chinese digest.

    :param combied: Collected news records, interpolated into the prompt as-is.
    :param deployment_name: Deployment used as the ``model`` of the request.
    :return: Text content of the first choice in the response (also printed).
    """
    print(f"Summarizing using {deployment_name}...")

    # The prompt is sent as a single user message.
    prompt = f'''
                  [NEWS]: {combied}
  You are a news reporter, Use chinese to summarize [NEWS] related to vaping, ecig news and politics, 
  and follow the rules below:
  
  - use as much as possible different sources.
  - try to avoid advertisement and other topics.
  - divide topic into countries, following such order US>UK>GERMAN>etc, if country name is mentioned, use country name.
  - Notice that China is the exporter of vape products, so any news which mentioned China and other countries, should be put into the other country's category.
  - try to conclude similar topics.
  - do not include any news related to specific brands: elfbar,lostmary.
  - do not include your explaination. 

  finally, use following markdown format, if no news, do not write anything:

美国：
> - 一些总结。[→](links)
> - 一些总结  [→](links)

-英国：
> - 一些总结。[→](links)

-法国:
> - 一些总结。[→](links)
  
  '''
    completion = client.chat.completions.create(
        model=deployment_name,
        messages=[{"role": "user", "content": prompt}],
    )
    summary = completion.choices[0].message.content
    print(summary)
    return summary

# Show the look-back window boundaries for this run.
print('past date: ' + past_x_date)
print('today' + today_date)

past date: 2024-01-23
today2024-01-26


In [3]:

# Scrape data from specified URLs
# Each call performs network requests and LLM extraction; the result
# variables are concatenated by a later cell, so their names must stay as they are.
# The 2firsts report URL is parameterised with today's date.
two_first_news = get_latest_news_twofirst(f"https://www.2firsts.cn/report/detail?date={today_date}")
vape_post_news = get_latest_news_vapepost('https://www.vapingpost.com/')
vapouround_news = get_latest_news_vapouround('https://www.vapouround.co.uk/news/')
vape_post_vapeast = get_latest_news_vapeast('https://vapeast.com/news/')
vape_360_news = get_latest_news_vape360('https://vaping360.com/vape-news/')
tobacco_reporter_news = get_latest_news_tobacco_reporter('https://tobaccoreporter.com/')
# vaporvoice.net is scraped with the Tobacco Reporter scraper.
# NOTE(review): presumably both sites share the same page layout - confirm.
vape_voice_news = get_latest_news_tobacco_reporter('https://vaporvoice.net/')


Scraping https://www.2firsts.cn/report/detail?date=2024-01-26 URLs...


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.61s/it]


[{'news_article_link': 'https://www.2firsts.cn/news/detail?id=8491',
  'news_article_title': '12月中国出口韩国电子烟约6213万美元，环比下降了18.3%，同比下降15.54%。'},
 {'news_article_link': 'https://www.2firsts.cn/news/detail?id=8489',
  'news_article_title': 'TPE24即将开幕，两个至上已派出前方人员探访美国线下电子烟市场，在洛杉矶，口味禁令致行业调整，大麻、水烟产品挤占市场。'},
 {'news_article_link': 'https://www.2firsts.cn/admin/article',
  'news_article_title': '美国FDA就对22款SMOK产品下发营销拒绝令一事回复两个至上，不对SMOK正在发起的诉讼发表评论。'},
 {'news_article_link': 'https://www.2firsts.cn/news/detail?id=8522',
  'news_article_title': '新型烟草（设备）在日本获得公共大量露出，别的国家有跟进乃至效仿的可能吗？日本对于新型烟草产品的宣传在未来会转向吗 '
                        '？'},
 {'news_article_link': 'https://www.2firsts.cn/news/detail?id=8500',
  'news_article_title': '新西兰健康部副部长凯西•科斯特洛对于烟草管制提出了创新建议，希望冻结烟草消费税增长三年，避免通胀影响，科斯特洛表示，这将减轻吸烟者的经济负担。'},
 {'news_article_link': 'https://www.2firsts.cn/news/detail?id=8494',
  'news_article_title': '俄亥俄州议会推翻禁售口味烟草制品的禁令，口味烟草产品即将重新登上该州的店铺货架。'},
 {'news_article_link': 'https://www.2firsts.cn/news/detail?id=8502',


In [4]:
# Merge every scraper's result into one list and record how many articles
# each source contributed.
_news_by_source = {
    'two_first_news': two_first_news,
    'vape_post_news': vape_post_news,
    'vapouround_news': vapouround_news,
    'vape_post_vapeast': vape_post_vapeast,
    'vape_360_news': vape_360_news,
    'tobacco_reporter_news': tobacco_reporter_news,
    'vape_voice_news': vape_voice_news,
}

combied = []
for _batch in _news_by_source.values():
    # Plain list concatenation, in the same source order as the dict above.
    combied = combied + _batch

sources = {_name: len(_batch) for _name, _batch in _news_by_source.items()}
pprint(sources)



{'tobacco_reporter_news': 5,
 'two_first_news': 8,
 'vape_360_news': 3,
 'vape_post_news': 3,
 'vape_post_vapeast': 10,
 'vape_voice_news': 5,
 'vapouround_news': 1}


In [56]:

# The model may return no content; fall back to an empty string so the write
# below does not fail with a TypeError.
summarized = gpt_summarize(combied,deployment_name) or ''

# Save the summary and the per-source article counts to a dated markdown file.
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError.
os.makedirs('./output', exist_ok=True)
# NOTE(review): the file is written with the platform default encoding; the
# cell that reads it back must use the same encoding.
with open(f'./output/{today_date}.md','w') as f:
    f.write(summarized)
    f.write('\n')
    f.write('sources:\n')
    f.write(str(sources))



Summarizing using gpt-4-1106...
美国：
> - 美国食品药品监督管理局（FDA）对Suorin以及blu PLUS+发出营销拒绝命令，影响了部分电子烟产品的销售。[→](https://www.2firsts.cn/news/detail?id=8407)
> - 烟草制品营销申请（PMTA）审批日期被 FDA 定于6月30日。FDA Status Report 中提到了相关讯息。[→](https://vaporvoice.net/2024/01/22/new-pmta-finish-date-is-june-30-fda-status-report/)

新西兰：
> - 中国对新西兰11月份的电子烟出口约797万美元，环比增长26.46%，同比增长35.05%。[→](https://www.2firsts.cn/news/detail?id=8385)

俄罗斯：
> - 俄罗斯州长指示制定全面禁售电子烟的法案，并提交国家杜马审议，以应对学校中电子烟普及引起的有害健康事件。[→](https://www.2firsts.cn/news/detail?id=8387)

阿根廷：
> - 阿根廷新的烟草税法案取消了最低税收规定，对烟草行业产生较大影响，并引发当地医疗专家对电子烟使用的关注。[→](https://www.2firsts.cn/news/detail?id=8395)

拉脱维亚：
> - 拉脱维亚政府实施了一系列烟草控制措施，提高了购买烟草制品的年龄限制至20岁，并限制了电子烟烟油和替代品的调味剂使用，对包装也作出了规定。[→](https://www.2firsts.cn/news/detail?id=8403)

英国：
> - 英国代表团计划在COP10推广使用电子烟作为戒烟工具。[→](https://vaporvoice.net/2024/01/22/sticking-to-its-guns-2/)

瑞典：
> - 瑞典戒烟的数据在斯德哥尔摩展示。[→](https://tobaccoreporter.com/2024/01/22/how-sweden-quit-smoking-shown-in-stockholm/)


In [57]:
from datetime import datetime

# Send message to dingding
def dingmessage(content, webhooks=None, *, verify=True, timeout=10):
    """
    Post a markdown news digest to one or more DingTalk robot webhooks.

    Webhook URLs contain access tokens, so they are not kept in the source.
    Pass them explicitly or list every target group in the
    ``DINGTALK_WEBHOOKS`` environment variable.

    :param content: Markdown text placed between the dated heading and the footer.
    :param webhooks: Iterable of full webhook URLs (including ``access_token``).
        When omitted, ``DINGTALK_WEBHOOKS`` is read as a comma-separated list.
    :param verify: Passed to ``requests.post``; certificates are verified by default.
    :param timeout: Seconds to wait for each webhook request.
    :raises ValueError: If no webhook URL is configured.
    """
    if webhooks is None:
        webhooks = os.environ.get("DINGTALK_WEBHOOKS", "").split(",")
    hooklist = [hook.strip() for hook in webhooks if hook and hook.strip()]
    if not hooklist:
        raise ValueError(
            "No DingTalk webhook configured: pass `webhooks` or set DINGTALK_WEBHOOKS"
        )

    # Use one timestamp for both the heading date and the footer time.
    now = datetime.now()
    today_date = now.strftime("%Y-%m-%d")
    current_time = now.strftime("%Y-%m-%d %H:%M")

    # Request headers
    header = {"Content-Type": "application/json", "Charset": "UTF-8"}

    # Markdown message payload
    message = {
    "msgtype": "markdown",
    "markdown": {
        "title":f"{today_date}电子烟新闻",
        "text": f'''
# <b> {today_date} 电子烟新闻 
{content}
###### 爱奇迹洞察与数据BI组
###### {current_time}发布 
            '''
        },
        "at": {
            "atMobiles": [
                "150XXXXXXXX"
            ],
            "atUserIds": [
                "user123"
            ],
            "isAtAll": False
        }
    }

    message_json = json.dumps(message)

    # Send to every configured webhook; a failure on one group must not stop
    # delivery to the remaining groups.
    for webhook in hooklist:
        try:
            info = requests.post(
                url=webhook,
                data=message_json,
                headers=header,
                verify=verify,
                timeout=timeout,
            )
        except requests.RequestException as err:
            print(f"send failed: {err}")
            continue
        # Print the response body returned by the webhook
        print(info.text)

# Load today's saved digest, drop the trailing "sources:" section and push
# the remaining text to DingTalk.
today_date = datetime.now().strftime("%Y-%m-%d")

with open(f'./output/{today_date}.md','r') as md_file:
    saved_report = md_file.read()

# Keep only the text before the first "sources:" marker.
summarized = saved_report.partition('sources:')[0]
print(summarized)

dingmessage(summarized)

美国：
> - 美国食品药品监督管理局（FDA）对Suorin以及blu PLUS+发出营销拒绝命令，影响了部分电子烟产品的销售。[→](https://www.2firsts.cn/news/detail?id=8407)
> - 烟草制品营销申请（PMTA）审批日期被 FDA 定于6月30日。FDA Status Report 中提到了相关讯息。[→](https://vaporvoice.net/2024/01/22/new-pmta-finish-date-is-june-30-fda-status-report/)

新西兰：
> - 中国对新西兰11月份的电子烟出口约797万美元，环比增长26.46%，同比增长35.05%。[→](https://www.2firsts.cn/news/detail?id=8385)

俄罗斯：
> - 俄罗斯州长指示制定全面禁售电子烟的法案，并提交国家杜马审议，以应对学校中电子烟普及引起的有害健康事件。[→](https://www.2firsts.cn/news/detail?id=8387)

阿根廷：
> - 阿根廷新的烟草税法案取消了最低税收规定，对烟草行业产生较大影响，并引发当地医疗专家对电子烟使用的关注。[→](https://www.2firsts.cn/news/detail?id=8395)

拉脱维亚：
> - 拉脱维亚政府实施了一系列烟草控制措施，提高了购买烟草制品的年龄限制至20岁，并限制了电子烟烟油和替代品的调味剂使用，对包装也作出了规定。[→](https://www.2firsts.cn/news/detail?id=8403)

英国：
> - 英国代表团计划在COP10推广使用电子烟作为戒烟工具。[→](https://vaporvoice.net/2024/01/22/sticking-to-its-guns-2/)

瑞典：
> - 瑞典戒烟的数据在斯德哥尔摩展示。[→](https://tobaccoreporter.com/2024/01/22/how-sweden-quit-smoking-shown-in-stockholm/)





{"errcode":0,"errmsg":"ok"}




{"errcode":0,"errmsg":"ok"}




{"errcode":0,"errmsg":"ok"}




{"errcode":0,"errmsg":"ok"}




{"errcode":0,"errmsg":"ok"}




{"errcode":0,"errmsg":"ok"}




{"errcode":0,"errmsg":"ok"}
