In [1]:
!pip3 install playwright==1.38.0
!pip3 install beautifulsoup4==4.10.0
!pip3 install langchain==0.0.314
!pip3 install goose3==3.1.17
!pip3 install openai==0.27.2



In [2]:
import os

# Read the OpenAI API key from the environment rather than hardcoding it in the
# notebook, so the secret is never saved or committed with the file.
# Falls back to an empty string (the previous placeholder value) when the
# OPENAI_API_KEY variable is not set.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")

In [10]:
# Web Scraping with Playwright and BeautifulSoup
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

def _visible_text(page_source):
    """Reduce an HTML document to its text, one non-empty fragment per line.

    Args:
        page_source: HTML markup as a string.

    Returns:
        A string in which the text fragments are joined by newlines; blank
        fragments are dropped.
    """
    soup = BeautifulSoup(page_source, "html.parser")

    # Remove all javascript and stylesheet code so it does not end up in the text.
    for tag in soup(["script", "style"]):
        tag.extract()
    text = soup.get_text()

    # Break into lines and remove leading and trailing space on each.
    lines = (line.strip() for line in text.splitlines())
    # Split each line further on double spaces so every fragment gets its own line.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Drop blank fragments.
    return '\n'.join(chunk for chunk in chunks if chunk)


async def run_playwright(site):
    """Load `site` in headless Chromium and return the page's text content.

    Args:
        site: URL to navigate to.

    Returns:
        The page text with <script>/<style> content removed, as produced by
        `_visible_text`.

    Any exception raised while opening or loading the page propagates to the
    caller; the browser is closed first.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(site)
            page_source = await page.content()
        finally:
            # Close the browser even when navigation fails (e.g. a timeout).
            await browser.close()

    # Parsing needs only the HTML string, so it runs after the browser is released.
    return _visible_text(page_source)

In [11]:
# Scrape the TechCrunch front page; `output` holds the cleaned page text that
# later cells pass to the extraction chain.
output = await run_playwright("https://techcrunch.com")

In [12]:
output

"TechCrunch | Startup and Technology News\nTechCrunch\nplus-bold\nTechCrunchLoginJoin TechCrunch+SearchsearchTechCrunch+StartupsVentureSecurityAICryptoAppsEventsStartup BattlefieldMoreThe watermelon emoji isn’t just TikTok speak for PalestineMorgan SungTechCrunchplus-boldMarket AnalysisProsecutors will likely proceed with SBF’s second trial in MarchJacquelyn MelinekHardwareHumane’s Ai Pin up closeBrian HeaterAIThis week in AI: OpenAI plays for keeps with GPTsKyle Wiggers, Devin ColdeweyStartupsA comprehensive list of 2023 tech layoffsAlyssa StringerTechCrunch Early StageApril 25, 2024 Boston, MARegister InterestThe LatestTechCrunchplus-boldWorkKeep IT complexity in check with pragmatic composable commerceMariano Gomide de Faria8:35 AM PST•November 12, 2023Mariano Gomide de Faria Contributor Mariano Gomide de Faria has over 20 years of experience in digital commerce and is the founder and co-CEO of global enterprise digital commerce platform VTEX. Le...TechCrunchplus-boldHow Inversion A

In [13]:
# Define Structured Schema for LangChain, Text Extraction with LangChain
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain

# Every extracted record carries these string fields, and all of them are required.
_article_fields = ["article_name", "article_url"]

structured_schema = {
    "properties": {field: {"type": "string"} for field in _article_fields},
    "required": list(_article_fields),
}

# temperature=0 is used for the extraction model.
llm = ChatOpenAI(
    model="gpt-4",
    temperature=0,
    openai_api_key=OPENAI_KEY,
)
extraction_chain = create_extraction_chain(structured_schema, llm)

# NOTE(review): `output` comes from BeautifulSoup's get_text(), which keeps only
# text and not link targets, so the article_url values are presumably inferred
# by the model rather than read from the page — verify before relying on them.
tech_crunch_json_result = extraction_chain.run(output)


In [14]:
tech_crunch_json_result

[{'article_name': 'Keep IT complexity in check with pragmatic composable commerce',
  'article_url': 'https://techcrunch.com/2023/11/12/keep-it-complexity-in-check-with-pragmatic-composable-commerce/'},
 {'article_name': 'How Inversion Art is trying to become the Y Combinator of the arts world',
  'article_url': 'https://techcrunch.com/2023/11/12/how-inversion-art-is-trying-to-become-the-y-combinator-of-the-arts-world/'},
 {'article_name': 'Three human mistakes VCs often make, and how understanding them can help entrepreneurs fundraise better',
  'article_url': 'https://techcrunch.com/2023/11/12/three-human-mistakes-vcs-often-make-and-how-understanding-them-can-help-entrepreneurs-fundraise-better/'},
 {'article_name': 'Klarna’s and Affirm’s very good week',
  'article_url': 'https://techcrunch.com/2023/11/12/klarnas-and-affirms-very-good-week/'},
 {'article_name': 'Robotics Q&A: CMU’s Matthew Johnson-Roberson',
  'article_url': 'https://techcrunch.com/2023/11/12/robotics-qa-cmus-matthe

In [15]:
# Extract Article Content with Goose
from goose3 import Goose

# Fail with a clear message when the extraction step produced no articles,
# instead of an opaque IndexError on the `[0]` lookup below.
if not tech_crunch_json_result:
    raise ValueError("No articles were extracted; cannot pick an article URL.")

# Use the first extracted article.
url = tech_crunch_json_result[0]['article_url']

# The context manager closes the Goose instance once extraction is finished.
with Goose() as g:
    article = g.extract(url=url)
    main_content = article.cleaned_text
    title = article.title


In [16]:
title, main_content

('Keep IT complexity in check with pragmatic composable commerce',
 'Legacy digital commerce architectures are no longer sustainable in today’s commerce arena. With every component tightly integrated into a monolithic architecture, a legacy platform’s inflexibility, low reliability, and high maintenance costs make it nearly impossible for a merchant to test and roll out commerce capabilities in sync with consumer expectations.\n\nTo address these limitations, many merchants have shifted toward a more flexible commerce architecture — like headless commerce. This structure decouples the front-end presentation of a merchant’s storefront from back-end services like inventory management and payment processing. Headless commerce enables merchants to independently evolve and scale each element of their infrastructure, which reduces risk during system updates and supports the creation of unique customer experiences and functionalities.\n\nPlatforms that support headless commerce are API-first,

In [17]:
# Summarize Article Content with OpenAI
from langchain.schema import HumanMessage, SystemMessage

# System prompt: fixes the assistant's role as an article summarizer.
system_prompt = SystemMessage(content="You are an assistant that summarizes articles.")
# User prompt: asks (in Chinese) for a Chinese-language summary of the article body.
user_prompt = HumanMessage(content=f"请用中文为我总结这篇文章: {main_content}")
messages = [system_prompt, user_prompt]

# Reuse the chat model created for extraction and print the reply text.
result = llm(messages)
print(result.content)



传统的数字商务架构在今天的商务领域已经无法持续。由于每个组件都紧密地集成在单一的架构中，传统平台的不灵活性、低可靠性和高维护成本使得商家几乎无法测试和同步推出符合消费者期望的商务功能。

为了解决这些限制，许多商家已经转向更灵活的商务架构，如无头商务。这种结构将商家店面的前端展示与后端服务（如库存管理和支付处理）分离。无头商务使商家能够独立发展和扩展他们的基础设施的每个元素，这降低了系统更新期间的风险，并支持创建独特的客户体验和功能。

支持无头商务的平台是API优先的，这意味着它们可以在各种客户接触点（例如，桌面，移动设备，社交和物联网）之间进行无缝交易。然而，今天的动态数字商务环境往往需要更多的灵活性和定制化。这就是组合式商务发挥作用的地方。

组合式商务提升了无头商务的概念，增强了灵活性和适应性。它不仅将前端与后端分离，如无头商务那样，而且还将商务架构的每个元素（包括内容管理，站点搜索和个性化）分离。这消除了供应商锁定，并使零售商能够为任何功能集成最优秀的应用程序。

因此，商家获得了更新单个组件而不会对整个业务造成破坏的灵活性，使他们能够迅速适应市场变化并进行大规模创新。但是，当涉及到定制时，是否存在过多的自由和灵活性呢？
