<a href="https://colab.research.google.com/github/yyokii/generator-of-ai-commented-news/blob/main/AISummary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!python --version

Python 3.10.12


In [None]:
# Install the third-party packages imported by the cells below.
# NOTE(review): no versions are pinned, so each run installs whatever is latest.
# Later cells call `openai.ChatCompletion.create` and import `langchain.chat_models`;
# presumably these need the SDK versions current when the notebook was written — confirm and pin.
!pip install feedparser
!pip install langchain
!pip install openai
!pip install requests BeautifulSoup4
!pip install pyyaml
!pip install langid

In [None]:
# print(feedparser.__version__)

In [None]:
import os
import getpass

# Ask for the OpenAI API key without echoing it, and publish it through the
# OPENAI_API_KEY environment variable, which later cells read with os.getenv.
os.environ["OPENAI_API_KEY"] = getpass.getpass(prompt = 'OPEN API KEY ')

In [None]:
import feedparser
from html import unescape
import re

def get_feed(url, site_name):
    """Parse one feed and convert every entry into an article dict.

    Args:
        url: Address of the RSS/Atom/RDF feed handed to ``feedparser.parse``.
        site_name: Label stored under the ``'site'`` key of every article.

    Returns:
        A list of dicts with the keys ``site``, ``title``, ``link``,
        ``updated``, ``content`` (plain text, tags removed, entities decoded)
        and ``comments`` (a fresh empty list per article).
        ``title``, ``link`` and ``updated`` fall back to ``''`` when the
        entry does not carry them, instead of aborting the whole feed.
    """
    feed = feedparser.parse(url)
    articles = []
    for entry in feed.entries:
        # Prefer the full `content` payload; fall back to the entry summary so
        # feeds that only ship a description do not end up with empty content.
        # `or [{}]` also covers an entry whose `content` list is empty.
        content_items = entry.get('content') or [{}]
        raw_html = content_items[0].get('value', '') or entry.get('summary', '') or ''
        # Strip tags first and decode entities afterwards, so escaped "&lt;" text
        # survives. DOTALL lets the pattern remove tags that span several lines.
        clean_content = unescape(re.sub('<.*?>', '', raw_html, flags=re.DOTALL))
        articles.append({
            'site': site_name,
            'title': entry.get('title', ''),
            'link': entry.get('link', ''),
            'updated': entry.get('updated', ''),
            'content': clean_content,
            'comments': [],
        })
    return articles


In [None]:
# Feed URL per site label. The labels are stored on every article as 'site'
# and are compared against literal strings when the crawl cell picks a scraper,
# so renaming a key here requires updating that cell as well.
rss_feeds = {
    # NOTE(review): labelled "CNN", but the URL path says reuters/technology — confirm the intended source.
    "CNN": "https://assets.wor.jp/rss/rdf/reuters/technology.rdf",
    "TechCrunch": "https://techcrunch.com/feed/",
    "The Verge": "https://www.theverge.com/rss/index.xml",
    "WIRED": "https://www.wired.com/feed/rss",
    # NOTE(review): the `_gl=...` query string looks like an analytics parameter; presumably not required — verify.
    "CNET": "http://feeds.japan.cnet.com/rss/cnet/all.rdf?_gl=1*1u1m776*_ga*MTAzMjc1MDQ2NS4xNjg2ODAxNjU1*_ga_JGFXZS6RMN*MTY4NjgwMTY1NS4xLjEuMTY4NjgwMTY1OC41Ny4wLjA.",
    "Ars Technica": "https://feeds.arstechnica.com/arstechnica/index"
}

In [None]:
# Fetch every configured feed and flatten the per-site results into one list.
all_articles = [
    feed_article
    for feed_label, feed_url in rss_feeds.items()
    for feed_article in get_feed(feed_url, feed_label)
]

In [None]:
# One comma-separated string of all fetched headlines; it is embedded in the
# title-selection prompt further down.
# NOTE(review): headlines may contain commas themselves, so the boundaries are ambiguous.
titles = ', '.join(entry['title'] for entry in all_articles)

In [None]:
# Show the combined headline string as a quick check of the fetch step.
print(titles)

In [None]:
from langchain.chat_models import ChatOpenAI

# Chat model shared by the LangChain summary and review chains built later.
llm = ChatOpenAI(
    model_name="gpt-4",
    temperature=0.7,
    max_tokens=500,
)

In [None]:
# Pick the most appealing titles
import openai
import json

# JSON schema for the function-call arguments: an object whose required
# "title" property is an array of strings.
schema = {
    "type": "object",
    "properties": {
        "title": {
            "type": "array",
            "items": { "type": "string" }
        }
    },
    "required": ["title"]
}

openai.api_key = os.getenv("OPENAI_API_KEY")

# Force the reply through the `pick_titles` function so the answer arrives as
# JSON arguments shaped by `schema` instead of free text.
completion = openai.ChatCompletion.create(
  model="gpt-4-0613",
  messages=[
    {"role": "system", "content": "You are a professional editor."},
    # The f-string placeholder is `{titles}`. The earlier `${titles}` spelling
    # (JavaScript template syntax) put a literal "$" in front of the title list.
    {"role": "user", "content": f"""
    You are a professional editor.
    List the titles of the articles and select 5 of them that you think would be of interest to creators and engineers.

    Input titles are {titles}

    # Constraints:
    * The number of elements in the title array is 5. Please select only attractive titles.
    * Do not select similar titles.
    * For example, the following articles are attractive.
      * Articles about the latest research or technology
      * Articles about security incidents/accidents
      * Innovative product or service release
      * Updates on popular products, services, or libraries
      * Anything about anything other than the above, you may choose if you find it appealing.
    """}
  ],
  functions=[{"name": "pick_titles", "parameters": schema}],
  function_call={"name": "pick_titles"},
  temperature=0.8,
)

# The function-call arguments are a JSON string; decode them and keep the title array.
data = json.loads(completion.choices[0].message.function_call.arguments)
selected_titles = data["title"]
print(selected_titles)
print(len(selected_titles))

In [None]:
# Keep the fetched articles whose title is one of the titles returned by the model
# (exact string match).
selected_articles = list(
    filter(lambda entry: entry['title'] in selected_titles, all_articles)
)

# Warn whenever the match count is not exactly 5
# (the message text says "fewer than 5 articles were selected").
if len(selected_articles) != 5:
    print("⚠️ 選ばれた記事が5件未満です")

In [None]:
from bs4 import BeautifulSoup
import requests

# Fetch the body text of an article from the feed labelled "CNN".
def get_cnn_article_text(url, timeout=10):
    """Download an article page and return its body paragraphs as text.

    Args:
        url: Address of the article page.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the notebook forever. Defaults to 10.

    Returns:
        The text of every ``<p>`` whose class attribute is
        ``"Paragraph-paragraph-2Bgue ArticleBody-para-TD_9x"``, joined with
        newlines. An empty string when no such paragraph is found.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # The article body is expected inside <p> tags carrying exactly this class string.
    paragraphs = soup.find_all('p', {"class": "Paragraph-paragraph-2Bgue ArticleBody-para-TD_9x"})

    return '\n'.join(p.text for p in paragraphs)

def update_cnn_articles_with_text(articles):
    """Overwrite each article's 'content' with the text scraped from its 'link'.

    The dicts in `articles` are modified in place; nothing is returned.
    NOTE(review): no cell visible in this notebook calls this helper.
    """
    for entry in articles:
        scraped_text = get_cnn_article_text(entry['link'])
        entry['content'] = scraped_text

# Fetch the body text of a WIRED article.
def get_wired_article_text(url, timeout=10):
    """Download a WIRED article page and return its paragraph text.

    Args:
        url: Address of the article page.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the notebook forever. Defaults to 10.

    Returns:
        Space-joined text of every ``<p>`` that contains a
        ``<span class="lead-in-text-callout">``, followed by the text of
        every ``<p class="paywall">``. Empty string when nothing matches.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Paragraphs that wrap a "lead-in-text-callout" span.
    lead_in_text = [p.text for p in soup.find_all('p') if p.find('span', class_='lead-in-text-callout')]

    # Paragraphs carrying the "paywall" class.
    # NOTE(review): a paragraph matching both rules would appear twice in the result — confirm.
    paywall_text = [p.text for p in soup.find_all('p', class_='paywall')]

    return ' '.join(lead_in_text + paywall_text)

def update_wired_articles_with_text(articles):
    """Overwrite each article's 'content' with the text scraped from its 'link'.

    The dicts in `articles` are modified in place; nothing is returned.
    NOTE(review): no cell visible in this notebook calls this helper.
    """
    for entry in articles:
        scraped_text = get_wired_article_text(entry['link'])
        entry['content'] = scraped_text

# Fetch the body text of a The Verge article.
def get_verge_article_text(url, timeout=10):
    """Download a The Verge article page and return its paragraph text.

    Args:
        url: Address of the article page.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the notebook forever. Defaults to 10.

    Returns:
        Space-joined text of every ``<p>`` found inside the
        ``<div class="duet--article--article-body-component">`` elements,
        in document order. Empty string when nothing matches.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    article_body_components = soup.find_all('div', {'class': 'duet--article--article-body-component'})
    article_body_text = [p.text for component in article_body_components for p in component.find_all('p')]

    return ' '.join(article_body_text)

def update_verge_articles_with_text(articles):
    """Overwrite each article's 'content' with the text scraped from its 'link'.

    The dicts in `articles` are modified in place; nothing is returned.
    NOTE(review): no cell visible in this notebook calls this helper.
    """
    for entry in articles:
        scraped_text = get_verge_article_text(entry['link'])
        entry['content'] = scraped_text

# Fetch the body text of a CNET article.
def get_cnet_article_text(url, timeout=10):
    """Download a CNET article page and return its paragraph text.

    Args:
        url: Address of the article page.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the notebook forever. Defaults to 10.

    Returns:
        Space-joined text of every ``<p>`` inside the
        ``<div id="NWrelart:Body">`` element, or an empty string when the
        page has no such element.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    article_body = soup.find('div', {'id': 'NWrelart:Body'})
    if article_body is None:
        return ''

    # Join the text of each paragraph found in the article body.
    return ' '.join(tag.get_text() for tag in article_body.find_all('p'))

def update_cnet_articles_with_text(articles):
    """Overwrite each article's 'content' with the text scraped from its 'link'.

    The dicts in `articles` are modified in place; nothing is returned.
    NOTE(review): no cell visible in this notebook calls this helper.
    """
    for entry in articles:
        scraped_text = get_cnet_article_text(entry['link'])
        entry['content'] = scraped_text

# Fetch the body text of an Ars Technica article.
def get_ars_techinica_article_text(url, timeout=10):
    """Download an Ars Technica article page and return its paragraph text.

    Args:
        url: Address of the article page.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the notebook forever. Defaults to 10.

    Returns:
        Space-joined text of every ``<p>`` inside the ``<div>`` whose
        ``itemprop`` is ``articleBody`` and whose class is
        ``article-content post-page``, or an empty string when the page
        has no such element.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    article_body = soup.find('div', {'itemprop': 'articleBody', 'class': 'article-content post-page'})
    if article_body is None:
        return ''

    # Join the text of each paragraph found in the article body.
    return ' '.join(tag.get_text() for tag in article_body.find_all('p'))

def update_ars_techinica_articles_with_text(articles):
    """Overwrite each article's 'content' with the text scraped from its 'link'.

    The dicts in `articles` are modified in place; nothing is returned.
    NOTE(review): no cell visible in this notebook calls this helper.
    """
    for entry in articles:
        scraped_text = get_ars_techinica_article_text(entry['link'])
        entry['content'] = scraped_text

In [None]:
# Crawl each selected article's page and replace its feed content with the scraped text.

# Scraper to use for each site label.
_SITE_SCRAPERS = {
    "CNN": get_cnn_article_text,
    "The Verge": get_verge_article_text,
    "WIRED": get_wired_article_text,
    "CNET": get_cnet_article_text,
    "Ars Technica": get_ars_techinica_article_text,
}

for article in selected_articles:
  scraper = _SITE_SCRAPERS.get(article["site"])
  if scraper is None:
    # Any site without a scraper keeps the content that came from its feed.
    print("TechCrunch article")
  else:
    article['content'] = scraper(article['link'])

# Report every selected article whose content is still empty.
for article in selected_articles:
    if article['content']:
        continue
    print(f"Article from {article['site']} with title '{article['title']}' has empty content.")

In [None]:
import langid

def is_japanese(text):
    """Return True when the language code langid.classify reports for `text` is 'ja'."""
    detected_language = langid.classify(text)[0]
    return detected_language == 'ja'

In [None]:
import openai
import json

# Hand the key stored in the OPENAI_API_KEY environment variable to the openai SDK.
openai.api_key = os.getenv("OPENAI_API_KEY")

# JSON schema for the `translate` function-call arguments: an object with one
# required string property, "translatedTitle".
TRANSLATE_SCHEMA = {
    "type": "object",
    "properties": {
        "translatedTitle": {
            "type": "string"
        }
    },
    "required": ["translatedTitle"]
}
# Alias kept so code that reads the module-level `schema` name keeps working.
schema = TRANSLATE_SCHEMA


def translate(text):
  """Translate a title into Japanese with an OpenAI function call.

  Args:
      text: The title to translate.

  Returns:
      The ``translatedTitle`` string decoded from the function-call arguments
      of the first completion choice.
  """
  completion = openai.ChatCompletion.create(
      model="gpt-4-0613",
      messages=[
          {"role": "system", "content": "You are a professional translator."},
          # Plain `{text}` placeholder: the earlier `${text}` put a literal "$"
          # in front of the title.
          {"role": "user", "content": f"Please translate the title {text} into Japanese."}
          ],
      # Use the dedicated constant, not the global `schema`, which other cells
      # rebind to unrelated schemas.
      functions=[{"name": "translate", "parameters": TRANSLATE_SCHEMA}],
      function_call={"name": "translate"},
      temperature=0.9,
  )
  data = json.loads(completion.choices[0].message.function_call.arguments)
  return data["translatedTitle"]


In [None]:
# Translate every selected title that is not already Japanese.
# Both the original and the translated title are printed.

for article in selected_articles:
  original_title = article["title"]
  if is_japanese(original_title):
    continue
  print(original_title)
  article["title"] = translate(original_title)
  print(article["title"])

In [None]:
# Compare the titles returned by the model with the (possibly translated)
# titles now stored on the selected articles.
# The list is printed once; the original printed it twice in a row.
print(selected_titles)
for selected_article in selected_articles:
  print(selected_article["title"])

In [None]:
# Generate the summaries

from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Prompt for the summary chain. `{article}` is the only template variable.
# The stray " 2." that trailed the first legal-compliance item has been removed,
# so the numbered list reads 1., 2. as intended.
summary_template = """
# Instructions:
You are a professional editor.
Please output the best summary based on the following constraints and input statements.

# Constraints:
* Two items regarding legal compliance must be observed.
  1. translating the entire text is a copyright infringement and will result in litigation issues. Please summarize the main points of the news in a concise manner.
  2. writing speculation other than what is written in the article is considered spreading rumors and can lead to litigation. Never write anything other than what is written in the article.
* Language is Japanese.
* The maximum number of characters is 800. If the original text is shorter than that, please summarize it shorter than that.
* Do not leave out important keywords.
* Keep sentences concise.

# Input text:
{article}

# Output sentence:
"""

# Bind the template to the shared `llm`. The chain is run once per article later.
prompt_template = PromptTemplate(input_variables=["article"], template=summary_template)
summary_chain = LLMChain(llm=llm, prompt=prompt_template)

In [None]:
# Summarise every selected article, print the result, and store it on the
# article under the 'summary' key.
for article in selected_articles:
  print(f'{article["title"]} の要約\n')
  summary = article["summary"] = summary_chain.run(article["content"])
  print(summary, "------", sep="\n")

In [None]:
# Generate the commentators (using OpenAI function calling)

import openai
import json

# JSON schema for the function-call arguments: an object whose required
# "commentators" property is an array of person objects.
schema = {
  "type": "object",
  "properties": {
    "commentators": {
      "type": "array",
      "items": {
        # The keyword must be "type"; the earlier `"person": "object"` is not
        # a JSON Schema keyword, so the items had no declared type.
        "type": "object",
        "properties": {
          "name": { "type": "string" },
          "title": { "type": "string" },
          "age": { "type": "number" },
          "sex": { "type": "string" }
        },
        "required": ["name", "title", "age", "sex"]
      }
    }
  },
  "required": ["commentators"]
}

openai.api_key = os.getenv("OPENAI_API_KEY")

# Force the reply through the `create_commentators` function so the answer
# arrives as JSON arguments shaped by `schema`.
completion = openai.ChatCompletion.create(
  model="gpt-4-0613",
  messages=[
    {"role": "system", "content": "You are a professional editor."},
    {"role": "user", "content": """
    Please generate name, title, age and sex of 10 commentators.
    The names can be Japanese or other. Please make sure that the names of the commentators are Japanese or not. Nicknames are acceptable. Please set them randomly.
    Examples of titles are
    software engineer, 研究者, 経済学者, 歴史家, 哲学者, デザイナー, お笑い芸人, 小学生, 中学生, 高校生, 大学生, 主婦
    and so on. You can use any of the titles listed here or any others.
    It can be any person, from historical figures to ordinary people. Try to choose people from a variety of backgrounds for your output.
    Please include at least one person with a non-academic title such as comedian, child, housewife, etc.
    All output should be in Japanese. Names can be in alphabetical or Japanese, whichever is appropriate.
    """}
  ],
  functions=[{"name": "create_commentators", "parameters": schema}],
  function_call={"name": "create_commentators"},
  temperature=0.9,
)

# Decode the JSON arguments and keep the list of commentator dicts.
data = json.loads(completion.choices[0].message.function_call.arguments)
commentators = data["commentators"]

for commentator in commentators:
    print(commentator)

In [None]:
# Generate the reviews (commentator opinions).

# Prompt for the review chain. It has two template variables:
# `{person}` (a description of the commentator) and `{article}` (the text to react to).
review_template = """
# INSTRUCTIONS:
You are {person}.
Based on the following constraints and the content of the article, please output your opinion on this content based on your findings.

# Constraints:.
* Language is Japanese
* At least 10 words, 200 at most
* The way you say the words and write the comments should be appropriate to the characteristics of your gender, age, and title.
* Please output your opinion as a {person}, not a general opinion.
* Do not include the contents of the input text, but mainly describe your impressions and opinions about the contents of the input text.
* You may mention not only the content of the input statement, but also any topics related to it
* Try to provide new information that is not available from the input statement. Please base your comments on your past experiences and findings.

# Input Sentence:
{article}

# Output statement:
"""

# Rebinds `prompt_template` (previously the summary prompt) to the review prompt
# and builds the review chain on the shared `llm`.
prompt_template = PromptTemplate(input_variables=["person", "article"], template=review_template)
review_chain = LLMChain(llm=llm, prompt=prompt_template)

In [None]:
import random

# Number of commentators asked to react to each article.
COMMENTATORS_PER_ARTICLE = 3

for article in selected_articles:
  # Start from an empty list so re-running this cell does not append a second
  # set of comments to the same article.
  article["comments"] = []
  # random.sample raises ValueError when asked for more items than exist,
  # so clamp the sample size to the number of generated commentators.
  sample_size = min(COMMENTATORS_PER_ARTICLE, len(commentators))
  selected_commentators = random.sample(commentators, sample_size)
  print(article["title"])
  for commentator in selected_commentators:
    # Persona description fed into the `{person}` slot of the review prompt.
    person = "{title} of {age} year old {sex}".format(**commentator)
    print(f"{person}のコメント生成")
    generated_comment = review_chain.run({
    'person': person,
    'article': article["content"]
    })
    print(generated_comment)
    comment = {
    'commentator': commentator,
    'text': generated_comment,
    }
    article["comments"].append(comment)
  print("------")


In [None]:
# Convert the article dates to ISO 8601 format
from datetime import datetime
import re

def is_iso_format(date_string):
  """Tell whether `date_string` begins with an ISO 8601 style timestamp.

  Only the start of the string is checked (``YYYY-MM-DDTHH:MM:SS`` with an
  optional ``+HH:MM`` / ``-HH:MM`` offset). Any trailing text is ignored.
  """
  iso_prefix = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}([+-]\d{2}:\d{2})?')
  return iso_prefix.match(date_string) is not None


def convert_to_iso(date_string):
  """Convert an RFC 822/2822 style date (as used by RSS) to an ISO 8601 string.

  Args:
      date_string: For example ``"Mon, 16 Mar 2020 05:35:07 +0900"`` or
          ``"Mon, 16 Mar 2020 05:35:07 GMT"``.

  Returns:
      ``datetime.isoformat()`` of the parsed date. The conversion is also
      printed.

  Raises:
      ValueError: If the string cannot be parsed by either parser.
  """
  try:
    # Original strict format, tried first so previously accepted inputs give
    # exactly the same result as before.
    date_object = datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %z")
  except ValueError:
    # `%z` only accepts numeric offsets, so zone names such as "GMT" fail
    # above. The standard-library RFC 2822 parser accepts them.
    from email.utils import parsedate_to_datetime
    try:
      date_object = parsedate_to_datetime(date_string)
    except (TypeError, ValueError) as exc:
      raise ValueError(f"Unsupported date format: {date_string!r}") from exc
  iso_format_date = date_object.isoformat()
  print(f"{date_string} converted to ISO format: {iso_format_date}")
  return iso_format_date

In [None]:
# Bring every selected article's 'updated' value into ISO 8601 form.
# Values that already look like ISO timestamps are left untouched.
for article in selected_articles:
  if is_iso_format(article["updated"]):
    continue
  article["updated"] = convert_to_iso(article["updated"])

In [None]:
import yaml
import os

# Create the output directory. `exist_ok=True` makes the call a no-op when the
# directory is already there and avoids the check-then-create race.
os.makedirs('posts', exist_ok=True)

# Example article dict with placeholder values, showing the shape of the data.
# NOTE(review): this sample has no 'summary' key, which output_file reads, and the
# name `article` is rebound by the `for article in selected_articles` loop below,
# so the sample is presumably illustrative only — confirm.
article = {
    'site': 'Site',
    'title': 'Learn How to Pre-render Pages Using Static Generation with Next.js',
    'link': 'https://www.site.com',
    'updated': '2020-03-16T05:35:07.322Z',
    'content': 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor fasfaf',
    'comments': [
        {
            'commentator': {'name': 'Commenter1', 'title': 'Comedian', 'age': 30, 'sex': 'male'},
            'text': 'This is a comment.'
        },
        {
            'commentator': {'name': 'Commenter2', 'title': 'Comedian', 'age': 40, 'sex': 'female'},
            'text': 'This is another comment.'
        },
    ]
}

def output_file(article):
  """Write one article as a Markdown post with YAML front matter under ``posts/``.

  Args:
      article: Dict providing ``title``, ``site``, ``link``, ``updated``,
          ``comments`` and ``summary``.

  The file is named after the article title (``posts/<title>.md``). Slashes
  and backslashes in the title are replaced with ``_`` because they would
  otherwise be read as directory separators and make ``open`` fail. Titles
  without such characters keep the same file name as before.

  Raises:
      KeyError: If a required key (for example ``summary``) is missing.
  """
  # Build a file name that cannot escape or split the target directory.
  safe_title = article['title'].replace('/', '_').replace('\\', '_')

  # Front-matter fields.
  yaml_part = {
      'title': article['title'],
      'site': article['site'],
      'link': article['link'],
      'coverImage': '/assets/post-cover/',
      'date': article['updated'],
      'ogImage': {'url': '/assets/post-cover/'},
      'comments': article['comments']
  }

  # Serialise the front matter; allow_unicode keeps non-ASCII text readable.
  yaml_text = yaml.dump(yaml_part, allow_unicode=True)

  # Final Markdown: front matter between '---' fences, then the summary as body.
  md_text = '---\n' + yaml_text + '---\n\n' + article['summary']

  # Make sure the target directory exists even if the setup lines were not run.
  os.makedirs('posts', exist_ok=True)
  file_path = os.path.join('posts', f"{safe_title}.md")

  with open(file_path, 'w', encoding='utf-8') as f:
      f.write(md_text)

# Write one Markdown post per selected article.
for article in selected_articles:
  output_file(article)

In [None]:
# Bundle the generated posts directory into a zip archive under /content.
!zip -r /content/posts.zip /content/posts

# Hand the archive to the browser as a download. `google.colab` is presumably
# only importable inside Google Colab, so this cell is Colab-specific — confirm.
from google.colab import files
files.download("/content/posts.zip")
