In [10]:
import sys
sys.path.append("..")

from playwright.async_api import async_playwright, TimeoutError
import requests
from urllib.parse import urlparse, parse_qs

import pandas as pd
from models.cases import Case
from models.scraper import ScraperBase
from datetime import date, datetime, time
from tempfile import NamedTemporaryFile
from rich.console import Console
from models.leads import Lead
from models.scraper import ScraperBase
from rich.progress import Progress

import re
import os
from dotenv import load_dotenv
from twocaptcha import TwoCaptcha
load_dotenv(dotenv_path='.env')
TWOCAPTCHA_API_KEY = os.getenv('TWOCAPTCHA_API_KEY')

console = Console()

class WestViginiaScraper(ScraperBase):
    
    def to_datetime(self, date_str):
        if date_str is None:
            return None
        else:
            return datetime.strptime(date_str, '%m/%d/%Y')
    
    async def init_browser(self):
        console.log("Initation of Browser...")
        pw = await async_playwright().start()
        self.browser = await pw.chromium.launch(headless=False)
        self.context = await self.browser.new_context()
        self.page = await self.context.new_page()
        self.url = "https://eapps.courts.state.va.us/ocis/landing"
        await self.page.goto(self.url)

        await self.page.wait_for_timeout(2000)
        accept_button = await self.page.query_selector('#acceptTerms')
        if accept_button:
            await accept_button.click()
        else:
            print("The 'Accept' button was not found.")
        await self.page.wait_for_load_state("networkidle")
        await self.page.wait_for_timeout(2000)
        
    
    async def search_by_name(self, search_parameter):
        cookies = await self.context.cookies()
        self.req_cookies = {cookie['name']: cookie['value'] for cookie in cookies} # type:ignore

        console.log(f"self.req_cookies------------{self.req_cookies}")
        url = "https://eapps.courts.state.va.us/ocis-rest/api/public/search"
        data = {"courtLevels":[],"divisions":["Adult Criminal/Traffic"],"selectedCourts":[],"searchString":[f"{search_parameter}"],"searchBy":"N"}
        res = requests.post(url, json=data, cookies=self.req_cookies)
        res_dict = res.json()
        search_result = res_dict['context']['entity']['payload']['searchResults'] 
        return search_result

    def get_case_detail(self, search_result):
        url = "https://eapps.courts.state.va.us/ocis-rest/api/public/getCaseDetails"
        data = search_result
        res = requests.post(url, json=data, cookies=self.req_cookies)
                
        detail_data = res.json()
        case_id = detail_data["context"]["entity"]["payload"]["caseTrackingID"]

        charges = []
        charge = {}
        case_charge = detail_data["context"]["entity"]["payload"]["caseCharge"]

        filing_date = case_charge["chargeFilingDate"]
        offense_date = case_charge["offenseDate"]
        arrest_date = case_charge.get("arrestDate")
        charge["offense_date"] =self.to_datetime(offense_date) #type: ignore
        charge["filing_date"] = self.to_datetime(filing_date) #type: ignore
        charge["arrest_date"] = self.to_datetime(arrest_date) #type: ignore
        charges.append(charge)
        
        
        caseCourt = detail_data["context"]["entity"]["payload"]["caseCourt"]
        court_id = caseCourt["fipsCode"]+caseCourt["courtCategoryCode"]["value"]
        
        caseParticipant = detail_data["context"]["entity"]["payload"]["caseParticipant"]
        for participant in caseParticipant:
            if participant["participantCode"] == "DEF":
                contact_info = participant["contactInformation"]
                last_name = contact_info["personName"].get("personSurName")
                first_name = contact_info["personName"].get("personGivenName")
                middle_name = contact_info["personName"].get("personMiddleName")
                address_city  = contact_info["primaryAddress"].get("locationCityName")
                address_zip = contact_info["primaryAddress"].get("locationState")
                address_state_code = contact_info["primaryAddress"].get("locationPostalCode")
                gender = participant["personalDetails"]["gender"]
                birth_date = participant["personalDetails"].get("maskedBirthDate")
        case_dict = {
                "case_id": case_id,
                "court_id": court_id,
                "charges": charges,
                "filing_date": filing_date,
                "arrest_date": arrest_date,
                "offense_date": offense_date,
                "first_name": first_name,
                "middle_name": middle_name,
                "last_name": last_name,
                "gender": gender,
                "birth_date": birth_date,
                "address_city": address_city,
                "address_zip": address_zip,
                "address_state_code": address_state_code
            }
        return case_dict

    async def scrape(self, search_parameter):
        search_name = search_parameter['name']
        await self.init_browser()
        search_results = await self.search_by_name(search_name)        
        case_dicts = []
        for result in search_results:
            case_dict = self.get_case_detail(result)
            case_dicts.append(case_dict)

            case = Case(**case_dict)
            lead = Lead(**case_dict)
            self.insert_case(case)
            self.insert_lead(lead)
        print(case_dicts)

        await self.browser.close()

In [11]:
wvscraper = WestViginiaScraper()
await wvscraper.scrape({'name':'AB'})

{'case_id': '0035370100', 'court_id': '595J', 'charges': [{'offense_date': datetime.datetime(2010, 1, 18, 0, 0), 'filing_date': datetime.datetime(2010, 1, 20, 0, 0), 'arrest_date': datetime.datetime(2010, 1, 19, 0, 0)}], 'filing_date': '01/20/2010', 'arrest_date': '01/19/2010', 'offense_date': '01/18/2010', 'first_name': 'ANTHONY', 'middle_name': 'CORNELIUS', 'last_name': 'BROWN', 'gender': 'M', 'birth_date': '07/25', 'address_city': 'EMPORIA', 'address_zip': 'VA', 'address_state_code': '23847'}
{'case_id': '2000426300', 'court_id': '019G', 'charges': [{'offense_date': datetime.datetime(2020, 9, 3, 0, 0), 'filing_date': datetime.datetime(2020, 9, 3, 0, 0), 'arrest_date': None}], 'filing_date': '09/03/2020', 'arrest_date': None, 'offense_date': '09/03/2020', 'first_name': None, 'middle_name': None, 'last_name': 'OWNER UNKNOWN', 'gender': 'O', 'birth_date': None, 'address_city': None, 'address_zip': None, 'address_state_code': None}
{'case_id': '1821989000', 'court_id': '059G', 'charges'