## Saving pydantic-validated data to SQLite database with SQLModel

Reference: 
1. Create a SQLModel Model https://sqlmodel.tiangolo.com/#create-a-sqlmodel-model & https://sqlmodel.tiangolo.com/#sqlalchemy-and-pydantic

In [6]:
%pip install requests
%pip install lxml
%pip install pandas
%pip install watermark
%pip install pydantic==2.3.0
%pip install pydantic-core==2.6.3
%pip install sqlmodel
%pip install word2number

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable


Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable


#### Scrape the website content

In [None]:
from pydantic import (BaseModel, 
                      validate_call,
                      computed_field, 
                      field_serializer,
                      HttpUrl,
                      )
from sqlmodel import Field, SQLModel, Session, create_engine
from typing import Optional

class BookModel(SQLModel, table=True):
  id: Optional[int] = Field(default=None, primary_key=True)
  price: str
  book_titles: str
  book_urls: str
  star_ratings_class: str

  @computed_field
  @property
  def star_rating(self) -> str:
    return star_rating_class.replace("star-rating ","")

  @field_serializer('star_rating')
  def serialize_star_rating(star_rating: str) -> int:
    return w2n.word_to_num(star_rating)

engine = create_engine("sqlite:///database.db")
# SQLModel.metadata.create_all(engine)

InvalidRequestError: Table 'bookmodel' is already defined for this MetaData instance.  Specify 'extend_existing=True' to redefine options and columns on an existing Table object.

In [None]:
%%time

import requests
from lxml import html
from word2number import w2n # convert number words (eg. twenty one) to numeric digits (21)
import pandas
from typing import Union # add type hint

class BookSpider:
    def __init__(self):
        self.base_url: AnyHttpUrl = "https://books.toscrape.com"
        self.session: requests.sessions.Session  = requests.Session()
        self.headers: dict[str, str] = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
            }
        self.bookmodel = BookModel

    def parse(self, page_num: int=50) -> None:
        results = []
        # loop the page number to scrape
        for page_num in range(1,page_num):
            url = self.base_url + f"/catalogue/page-{page_num}.html"
            output_data = self._fetch_data(url=url)
            validated_bookmodel_data = self._validate_bookmodel_data(output_data)
            results.append(validated_bookmodel_data)
        return results

    @validate_call(validate_return=True)
    def _fetch_data(self, url: HttpUrl) -> dict[str, list]:
        response = self.session.request("GET", url)
        tree = html.fromstring(response.content)      
        prices = tree.xpath("//article[@class='product_pod']//div[@class='product_price']\
            /p[@class='price_color']/text()")
        # bug: how to break down prices -> prices & currencies        
        book_titles = tree.xpath("//article[@class='product_pod']/h3/a/text()")
        book_urls = tree.xpath("//article[@class='product_pod']/h3/a/@href")
        star_ratings_classes = tree.xpath("//article[@class='product_pod']/p/@class")

        data_list = {
                    "prices": prices,
                    "star_ratings_classes": star_ratings_classes,
                    "book_urls": book_urls, 
                    "book_titles": book_titles,
                }
        return data_list

    @validate_call(validate_return=True)
    def _validate_bookmodel_data(self, books_data: dict[str, list]) -> dict[str, list]:
        # validate book data with pydantic
        validated_bookmodel_data = dict(self.bookmodel.model_validate(books_data).model_dump()) ## model_dump is to compute field
        return validated_bookmodel_data

book_spider = BookSpider()
data_list = book_spider.parse()

ValidationError: 4 validation errors for BookModel
price
  Field required [type=missing, input_value={'prices': ['£51.77', '...'s Only the Himalayas"]}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.3/v/missing
book_titles
  Input should be a valid string [type=string_type, input_value=['A Light in the ...', 'T...t's Only the Himalayas"], input_type=list]
    For further information visit https://errors.pydantic.dev/2.3/v/string_type
book_urls
  Input should be a valid string [type=string_type, input_value=['a-light-in-the-attic_10...malayas_981/index.html'], input_type=list]
    For further information visit https://errors.pydantic.dev/2.3/v/string_type
star_ratings_class
  Field required [type=missing, input_value={'prices': ['£51.77', '...'s Only the Himalayas"]}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.3/v/missing

In [None]:
import pandas

df = pandas.DataFrame(data_list)
df.explode(['prices', 'book_titles', 'book_urls', 'star_ratings_classes', 'star_ratings'])

Unnamed: 0,prices,book_titles,book_urls,star_ratings_classes,star_ratings
0,£51.77,A Light in the ...,a-light-in-the-attic_1000/index.html,star-rating Three,3
0,£53.74,Tipping the Velvet,tipping-the-velvet_999/index.html,star-rating One,1
0,£50.10,Soumission,soumission_998/index.html,star-rating One,1
0,£47.82,Sharp Objects,sharp-objects_997/index.html,star-rating Four,4
0,£54.23,Sapiens: A Brief History ...,sapiens-a-brief-history-of-humankind_996/index...,star-rating Five,5
...,...,...,...,...,...
48,£40.44,Icing (Aces Hockey #2),icing-aces-hockey-2_25/index.html,star-rating Four,4
48,£45.24,"Hawkeye, Vol. 1: My ...",hawkeye-vol-1-my-life-as-a-weapon-hawkeye-1_24...,star-rating Three,3
48,£34.96,Having the Barbarian's Baby ...,having-the-barbarians-baby-ice-planet-barbaria...,star-rating Four,4
48,£56.76,"Giant Days, Vol. 1 ...",giant-days-vol-1-giant-days-1-4_22/index.html,star-rating Four,4


## Computing environment

In [None]:
%load_ext watermark

%watermark

# print out pypi packages used
%watermark --iversions

# date
%watermark -u -n -t -z

Last updated: 2024-03-10T10:08:23.577980+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 8.22.2

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.1.75-060175-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 16
Architecture: 64bit

word2number: 1.1
pandas     : 2.2.1
requests   : 2.31.0
lxml       : 5.1.0

Last updated: Sun Mar 10 2024 10:08:23UTC

