In [1]:
!pip install -Uqqq pip --progress-bar off
!pip install -qqq playwright==1.46.0 --progress-bar off
!pip install -qqq html2text==2024.2.26 --progress-bar off
!pip install -qqq langchain-groq==0.1.9 --progress-bar off

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for html2text (setup.py) ... [?25l[?25hdone


In [2]:
!playwright install chromium

Downloading Chromium 128.0.6613.18 (playwright build v1129)[2m from https://playwright.azureedge.net/builds/chromium/1129/chromium-linux.zip[22m
[1G162.8 MiB [] 0% 10.7s[0K[1G162.8 MiB [] 0% 37.7s[0K[1G162.8 MiB [] 0% 19.7s[0K[1G162.8 MiB [] 0% 15.3s[0K[1G162.8 MiB [] 0% 13.3s[0K[1G162.8 MiB [] 0% 12.2s[0K[1G162.8 MiB [] 0% 11.5s[0K[1G162.8 MiB [] 1% 10.6s[0K[1G162.8 MiB [] 1% 10.1s[0K[1G162.8 MiB [] 1% 9.7s[0K[1G162.8 MiB [] 1% 9.8s[0K[1G162.8 MiB [] 1% 9.9s[0K[1G162.8 MiB [] 2% 9.6s[0K[1G162.8 MiB [] 2% 9.1s[0K[1G162.8 MiB [] 2% 9.0s[0K[1G162.8 MiB [] 2% 9.1s[0K[1G162.8 MiB [] 3% 8.8s[0K[1G162.8 MiB [] 3% 8.2s[0K[1G162.8 MiB [] 3% 7.7s[0K[1G162.8 MiB [] 4% 7.7s[0K[1G162.8 MiB [] 4% 8.0s[0K[1G162.8 MiB [] 4% 8.4s[0K[1G162.8 MiB [] 4% 8.5s[0K[1G162.8 MiB [] 4% 8.7s[0K[1G162.8 MiB [] 4% 9.2s[0K[1G162.8 MiB [] 5% 9.4s[0K[1G162.8 MiB [] 5% 9.6s[0K[1G162.8 MiB [] 5% 9.8s[0K[1G162.8 MiB [] 5% 10.1s[0K[1G162.8 MiB [] 5% 10.2s[0K[

In [86]:
import re
from pprint import pprint
from typing import List, Optional

import html2text
import nest_asyncio
import pandas as pd
from google.colab import userdata
from langchain_groq import ChatGroq
from playwright.async_api import async_playwright
from pydantic import BaseModel, Field
from tqdm import tqdm

nest_asyncio.apply()

## Fetch Web Content as Markdown

In [4]:
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"

In [6]:
playwright = await async_playwright().start()
browser = await playwright.chromium.launch()

context = await browser.new_context(user_agent=USER_AGENT)

page = await context.new_page()
await page.goto("https://playwright.dev/")
content = await page.content()

await browser.close()
await playwright.stop()

In [7]:
print(content)

<!DOCTYPE html><html lang="en" dir="ltr" class="plugin-pages plugin-id-default" data-has-hydrated="false" data-theme="light" data-rh="lang,dir,class"><head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v3.5.1">
<title>Fast and reliable end-to-end testing for modern web apps | Playwright</title><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:image" content="https://repository-images.githubusercontent.com/221981891/8c5c6942-c91f-4df1-825f-4cf474056bd7"><meta data-rh="true" name="twitter:image" content="https://repository-images.githubusercontent.com/221981891/8c5c6942-c91f-4df1-825f-4cf474056bd7"><meta data-rh="true" property="og:url" content="https://playwright.dev/"><meta data-rh="true" property="og:locale" content="en"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docusaurus_tag" content="default"><meta data-rh="true" name="docsearch:language" content="en"><meta data-r

In [8]:
markdown_converter = html2text.HTML2Text()
markdown_converter.ignore_links = False
markdown_content = markdown_converter.handle(content)

In [9]:
print(markdown_content)

Skip to main content

[![Playwright logo](/img/playwright-logo.svg)![Playwright
logo](/img/playwright-
logo.svg)**Playwright**](/)[Docs](/docs/intro)[API](/docs/api/class-
playwright)

Node.js

  * [Node.js](/)
  * [Python](/python/)
  * [Java](/java/)
  * [.NET](/dotnet/)

[Community](/community/welcome)

[](https://github.com/microsoft/playwright)[](https://aka.ms/playwright/discord)

Search

# Playwright enables reliable end-to-end testing for modern web apps.

[Get
started](/docs/intro)[Star](https://github.com/microsoft/playwright)[65k+](https://github.com/microsoft/playwright/stargazers)

  
  
  

![Browsers \(Chromium, Firefox, WebKit\)](img/logos/Browsers.png)

### Any browser • Any platform • One API

**Cross-browser.** Playwright supports all modern rendering engines including
Chromium, WebKit, and Firefox.

**Cross-platform.** Test on Windows, Linux, and macOS, locally or on CI,
headless or headed.

**Cross-language.** Use the Playwright API in
[TypeScript](https://playwrig

## LLM Setup

In [5]:
MODEL = "llama-3.1-70b-versatile"

llm = ChatGroq(temperature=0, model_name=MODEL, api_key=userdata.get("GROQ_API_KEY"))

In [73]:
SYSTEM_PROMPT = """
You're an expert text extractor. You extract information from webpage content.
Always extract data without changing it and any other output.
"""


def create_scrape_prompt(page_content: str) -> str:
    return f"""
Extract the information from the following web page:
```
{page_content}
```
""".strip()

## Scrape Landing Pages

In [13]:
class ProjectInformation(BaseModel):
    """Information about the project"""

    name: str = Field("Name of the project e.g. Excel")
    tagline: str = Field(
        description="What this project is about e.g. Get deep insights from your numbers",
    )
    benefits: List[str] = Field(
        description="""A list of main benefits of the project including 3-5 words to summarize each one.
    e.g. [
        'Your spreadshits everywhere you go - cloud-backed files with your account',
        'Accuracy without manual calculations - vast amount of built-in formulas ready to use'
    ]
    """
    )

In [25]:
page_scraper_llm = llm.with_structured_output(ProjectInformation)

In [26]:
extraction = page_scraper_llm.invoke(
    [("system", SYSTEM_PROMPT), ("user", create_scrape_prompt(markdown_content))]
)

In [27]:
pprint(extraction.__dict__, sort_dicts=False, width=120)

{'name': 'Playwright',
 'tagline': 'Enables reliable end-to-end testing for modern web apps',
 'benefits': ['Cross-browser support - Chromium, Firefox, WebKit',
              'Cross-platform support - Windows, Linux, macOS',
              'Cross-language support - TypeScript, JavaScript, Python, .NET, Java',
              'Resilient tests - Auto-wait, web-first assertions, tracing',
              'No trade-offs - Multiple tabs, origins, users, trusted events',
              'Full isolation - Browser contexts, fast execution',
              'Powerful tooling - Codegen, Playwright inspector, Trace Viewer']}


In [28]:
async def fetch_page(url, user_agent=USER_AGENT) -> str:
    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch()

    context = await browser.new_context(user_agent=USER_AGENT)

    page = await context.new_page()
    await page.goto(url)
    content = await page.content()

    await browser.close()
    await playwright.stop()

    markdown_converter = html2text.HTML2Text()
    markdown_converter.ignore_links = False
    return markdown_converter.handle(content)

In [30]:
urls = [
    "https://videogen.io/",
    "https://blaze.today/aiblaze/",
    "https://www.insightpipeline.com/",
    "https://apps.apple.com/us/app/today-app-to-do-list-habits/id6461726826",
    "https://brainybear.ai/",
]

In [32]:
extractions = []
for url in tqdm(urls):
    content = await fetch_page(url)
    extractions.append(
        page_scraper_llm.invoke(
            [("system", SYSTEM_PROMPT), ("user", create_scrape_prompt(content))]
        )
    )

100%|██████████| 5/5 [00:23<00:00,  4.70s/it]


In [35]:
rows = []

for extraction, url in zip(extractions, urls):
    row = extraction.__dict__
    row["url"] = url
    rows.append(row)

projects_df = pd.DataFrame(rows)

In [36]:
projects_df

Unnamed: 0,name,tagline,benefits,url
0,VideoGen,Generate videos in seconds with AI,"[Generate videos in seconds with AI, One-Click...",https://videogen.io/
1,AI Blaze,Fast AI Writing and Editing with Dynamic Prompts,[Fast AI Writing and Editing with Dynamic Prom...,https://blaze.today/aiblaze/
2,Insight Pipeline,"Customer insights every week, starting today",[Effortless customer research calls - weekly c...,https://www.insightpipeline.com/
3,Today App,To-Do List & Habits,"[Focus on today's tasks, Prioritize what reall...",https://apps.apple.com/us/app/today-app-to-do-...
4,Brainybear,Train AI Chatbots in 3 Clicks. Help Customers ...,[Quick and Easy Setup - Get started in less th...,https://brainybear.ai/


In [37]:
projects_df.iloc[0].benefits

['Generate videos in seconds with AI',
 'One-Click Video Creation',
 '3M+ Copyright-free Assets',
 'Safe for Commercial Use',
 'No more retakes. Just perfect audio',
 'Add edits with ease',
 'Cost savings on video production',
 'Less hours spent video editing',
 'Increased engagement with video']

In [38]:
projects_df.to_csv("projects.csv", index=None)

## Scrape Car Listings

In [45]:
url = "https://www.autoscout24.com/lst?atype=C&cy=D%2CA%2CB%2CE%2CF%2CI%2CL%2CNL&desc=0&fregfrom=2018&gear=M&powerfrom=309&powerto=478&powertype=hp&search_id=1tih4oks815&sort=standard&ustate=N%2CU"

In [46]:
auto_content = await fetch_page(url)

In [47]:
print(auto_content)

Skip to main content

AutoScout24 is currently only available to a limited extent due to maintenance
work. This affects some functions such as contacting salespeople, logging in
or managing your vehicles for sale.

[ ![auto24-logo](/assets/as24-header-footer/as24-horizontal-
inverse.d34ff335.svg) ](https://www.autoscout24.com/ "AutoScout24 - Used and
new cars")

[ ](https://www.autoscout24.com/favorites)

  * [ Used and New Cars ](https://www.autoscout24.com/)
  * [ Motorbikes ](https://www.autoscout24.com/motorcycle/)

  * [ ](https://www.autoscout24.com/favorites)
  * English 

[ ![](/assets/as24-header-footer/flag-de.013a09fe.svg) Deutschland
](https://www.autoscout24.de/) [ ![](/assets/as24-header-footer/flag-
it.5021f5d3.svg) Italia ](https://www.autoscout24.it/) [
![](/assets/as24-header-footer/flag-at.6eb37b36.svg) Österreich
](https://www.autoscout24.at/) [ ![](/assets/as24-header-footer/flag-
nl.843ae853.svg) Nederland  ](https://www.autoscout24.nl/) [
![](/assets/as24-header-

In [74]:
class CarListing(BaseModel):
    """Information about a car listing"""

    make: str = Field("Make of the car e.g. Toyota")
    model: str = Field("Model of the car, maximum 3 words e.g. Land Cruiser")
    horsepower: int = Field("Horsepower of the engine e.g. 231")
    price: int = Field("Price in euro e.g. 34000")
    mileage: Optional[int] = Field("Number of kilometers on the odometer e.g. 73400")
    year: Optional[int] = Field("Year of registration (if available) e.g. 2015")
    url: str = Field(
        "Url to the listing e.g. https://www.autoscout24.com/offers/lexus-rc-f-advantage-coupe-gasoline-grey-19484ec1-ee56-4bfd-8769-054f03515792"
    )


class CarListings(BaseModel):
    """List of car listings"""

    cars: List[CarListing] = Field("List of cars for sale.")

In [75]:
car_listing_scraper_llm = llm.with_structured_output(CarListings)

In [76]:
extraction = car_listing_scraper_llm.invoke(
    [("system", SYSTEM_PROMPT), ("user", create_scrape_prompt(auto_content))]
)

In [77]:
extraction.cars

[CarListing(make='BMW', model='M3 Limousine', horsepower=480, price=74770, mileage=11711, year=2021, url='https://www.autoscout24.com/offers/bmw-m3-limousine-m-sportsitze-navi-kamera-2-hand-gasoline-green-51721ec5-d41d-4445-940f-b690ea4247ab'),
 CarListing(make='Porsche', model='991 (911) GT3 Touring', horsepower=500, price=178911, mileage=9000, year=2018, url='https://www.autoscout24.com/offers/porsche-991-911-gt3-touring-gasoline-red-c32fabcc-8ded-49ae-be66-0384be12a44c'),
 CarListing(make='Dodge', model='Challenger 6.4 R/T SCAT PACK', horsepower=492, price=42487, mileage=35320, year=2019, url='https://www.autoscout24.com/offers/dodge-challenger-6-4-r-t-scat-pack-blind-spot-tuev-led-gasoline-black-4b329a3a-d2c4-453c-ba2d-5cee1e7a1176'),
 CarListing(make='Porsche', model='911 Cabrio S 3.0 Manueel', horsepower=450, price=128500, mileage=11072, year=2021, url='https://www.autoscout24.com/offers/porsche-911-cabrio-s-3-0-manueel-nieuwstaat-1ste-eig-gasoline-black-27e297c7-03ee-4041-952b-2

In [87]:
def filter_model(row):
    row = re.sub("[^0-9a-zA-Z]+", " ", row)
    parts = row.split(" ")
    return " ".join(parts[:3])


rows = [listing.__dict__ for listing in extraction.cars]

listings_df = pd.DataFrame(rows)
listings_df["model"] = listings_df.model.apply(filter_model)
listings_df

Unnamed: 0,make,model,horsepower,price,mileage,year,url
0,BMW,M3 Limousine,480,74770,11711.0,2021.0,https://www.autoscout24.com/offers/bmw-m3-limo...
1,Porsche,991 911 GT3,500,178911,9000.0,2018.0,https://www.autoscout24.com/offers/porsche-991...
2,Dodge,Challenger 6 4,492,42487,35320.0,2019.0,https://www.autoscout24.com/offers/dodge-chall...
3,Porsche,911 Cabrio S,450,128500,11072.0,2021.0,https://www.autoscout24.com/offers/porsche-911...
4,Ford,Mustang GT 5,446,52980,5.0,,https://www.autoscout24.com/offers/ford-mustan...
5,BMW,M3 Limousine M,480,72880,33062.0,2021.0,https://www.autoscout24.com/offers/bmw-m3-limo...
6,BMW,M2 Coupe Schaltgetriebe,460,65380,13294.0,2023.0,https://www.autoscout24.com/offers/bmw-m2-coup...
7,Ford,Mustang fastback 5,450,47520,26639.0,2020.0,https://www.autoscout24.com/offers/ford-mustan...
8,Dodge,Challenger 6 4,492,48987,2390.0,2022.0,https://www.autoscout24.com/offers/dodge-chall...
9,BMW,M2 COUP CARBON,460,68970,17908.0,2023.0,https://www.autoscout24.com/offers/bmw-m2-coup...


In [88]:
listings_df.to_csv("car-listings.csv", index=None)