<a href="https://colab.research.google.com/github/tmgsr02/DS-Career-Resources/blob/master/Driver_License_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q tensorlake

In [None]:
import time
import json
from tensorlake.documentai import DocumentAI
from tensorlake.documentai.models import (
    ParseStatus, ParsingOptions, StructuredExtractionOptions, PageClassConfig,
    PartitionStrategy
)
from pydantic import BaseModel, Field
from typing import List


from google.colab import userdata
# Read the API key through google.colab's userdata.get() instead of hardcoding
# it in the notebook; the secret is looked up under the name 'TENSORLAKE_API_KEY'.
TENSORLAKE_API_KEY = userdata.get('TENSORLAKE_API_KEY')


# URLs of the documents processed by the parsing loop below:
# three .jpg images followed by one .pdf file.
DRIVER_LICENSES = [
    "https://m.media-amazon.com/images/I/61k+xMg3I7L._AC_SL1500_.jpg",
    "https://www.desertdefenders.com/wp-content/uploads/2021/09/california-drivers-license-600x379.jpg",
    "https://m.media-amazon.com/images/I/71i8mAqykvL._AC_SL1500_.jpg",
    "https://s3.amazonaws.com/www.ml.school/archive/sample.pdf"
]

# Defining the schema

Let's create a Pydantic schema for extracting data from a driver's license.

In [None]:
# Pydantic model listing the fields to extract from a driver's license.
# Its JSON schema (DriverLicense.model_json_schema()) is passed to the
# structured-extraction options; each Field description becomes part of that schema.
# Documented with `#` comments rather than a class docstring on purpose: a
# docstring would be emitted as the schema's top-level "description".
class DriverLicense(BaseModel):
    # License number; the description hints that it may sit next to a "DL" label.
    id: str = Field(description="Driver's License number. It could be next to a DL field.")
    # NOTE(review): the recorded output in this notebook shows first_name coming
    # back as null for one document although the field is declared `str` —
    # confirm whether Optional[str] is the intended type.
    first_name: str = Field(description="First name of the Driver's License holder.")
    last_name: str = Field(description="Last name of the Driver's License holder.")
    # Kept as a single free-form string (street, city, state and ZIP together
    # in the recorded output).
    address: str = Field(description="Address of the Driver's License holder")
    # Kept as a string; the recorded output shows MM/DD/YYYY-looking values, but
    # no format is enforced here.
    dob: str = Field(description="Date of birth of the Driver's License holder")

# Defining page classifications

In [None]:
# Page classes the parser should assign to pages, built from (name, description)
# pairs. The names are referenced later by the structured-extraction options.
page_classifications = [
    PageClassConfig(name=class_name, description=class_description)
    for class_name, class_description in (
        ("Front_of_Drivers_License", "Pages that have a photo of a person."),
        ("Back_of_Drivers_License", "Pages that have a barcode."),
    )
]

# Defining extraction options

In [None]:
# Structured-extraction configuration handed to doc_ai.parse() in the parsing loop.
structured_extraction_options = [
    StructuredExtractionOptions(
        # Label that shows up as `schema_name` on each structured-data entry
        # printed by print_results().
        schema_name="DriverLicense",
        # NOTE(review): presumably runs the extraction once per page — confirm
        # against the Tensorlake documentation.
        partition_strategy=PartitionStrategy.PAGE,
        # JSON schema generated from the DriverLicense Pydantic model.
        json_schema=DriverLicense.model_json_schema(),
        # Same string as the first PageClassConfig name; presumably limits
        # extraction to pages classified as the front of a license — confirm.
        page_classes=["Front_of_Drivers_License"]
    )
]

# Parsing the documents

In [None]:
def print_results(result):
    """Print the page classifications and structured data of a parse result.

    Args:
        result: Parse result object. Reads ``result.page_classes`` (items
            exposing ``page_class`` and ``page_numbers``) and
            ``result.structured_data`` (items exposing ``schema_name`` and
            ``data``). Either attribute may be ``None`` or empty, in which
            case that section simply prints no entries.

    Returns:
        None. Everything is written to standard output.
    """
    print("\n*************************************")
    print("Page Classifications:")
    # `or []` keeps the report from crashing with a TypeError when the result
    # carries no page classes (attribute is None).
    for page_classification in result.page_classes or []:
        print(f"* {page_classification.page_class}: {page_classification.page_numbers}")

    # Same guard for results without any structured data, e.g. when no page
    # matched the classes targeted by the extraction options.
    for structured_data in result.structured_data or []:
        print(f"\n[{structured_data.schema_name}]")
        data = structured_data.data
        # ensure_ascii=False prints non-ASCII characters as-is instead of \u escapes.
        print(json.dumps(data, indent=2, ensure_ascii=False))
    print("*************************************\n")

Let's now go through the list of files and process them one by one.

In [None]:
# Create the DocumentAI client with the configured API key.
doc_ai = DocumentAI(api_key=TENSORLAKE_API_KEY)

for file in DRIVER_LICENSES:
    # Submit the document together with the page classes and extraction options.
    parse_id = doc_ai.parse(
        file=file,
        page_classifications=page_classifications,
        structured_extraction_options=structured_extraction_options,
    )
    print(f"Parse job submitted with ID: {parse_id}")

    # Fetch the result once, then re-poll every 5 seconds for as long as the
    # job reports PENDING or PROCESSING, echoing each polled status.
    result = doc_ai.get_parsed_result(parse_id=parse_id)
    while True:
        if result.status not in (ParseStatus.PENDING, ParseStatus.PROCESSING):
            break
        time.sleep(5)
        result = doc_ai.get_parsed_result(parse_id=parse_id)
        print(f"Status: {result.status.name}")

    # A failed job is reported and skipped; anything else is printed in full.
    if result.status == ParseStatus.FAILURE:
        print(f"Parse job {parse_id} failed.")
    else:
        print_results(result)

Parse job submitted with ID: parse_fbP87dd6pqNPPBWnRMbFD
Status: PROCESSING
Status: PROCESSING
Status: SUCCESSFUL

*************************************
Page Classifications:
* Front_of_Drivers_License: [1]

[DriverLicense]
{
  "address": "892 MOMONA ST HONOLULU, HI 96820",
  "dob": "06/03/1981",
  "first_name": null,
  "id": "01-47-87441",
  "last_name": "McLOVIN"
}
*************************************

Parse job submitted with ID: parse_tt6KLTDnB6NNfPMJT7BNP
Status: PROCESSING
Status: SUCCESSFUL

*************************************
Page Classifications:
* Front_of_Drivers_License: [1]

[DriverLicense]
{
  "address": "2244 2ND AVE SAN DIEGO, CA 92101",
  "dob": "01/01/1997",
  "first_name": "JUDY JANE",
  "id": "G1111111",
  "last_name": "DOE"
}
*************************************

Parse job submitted with ID: parse_MppCjMHKKHMbKpC6hKHgJ
Status: PROCESSING
Status: SUCCESSFUL

*************************************
Page Classifications:
* Front_of_Drivers_License: [1]

[DriverLicen