## Extract SiC MOSFET parameters from datasheet PDFs into CSV files

The required libraries are regular expressions (`re`), data tables (`pandas`), and `PyPDF2` for reading the PDFs.

In [None]:
# imports 
import os
import re
import sys
from typing import Optional, Tuple
import pandas as pd
from PyPDF2 import PdfReader

In [None]:
# Config setup: the input and output paths for the extracted data come from
# the `config` module, which is looked up three directory levels above this file.
try:
    _script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined when this code runs as notebook cells.
    # NOTE(review): assumes the working directory is this file's folder — confirm.
    _script_dir = os.getcwd()
sys.path.insert(0, os.path.abspath(os.path.join(_script_dir, "..", "..", "..")))
from config import DATA_EXTRACTION_INPUT, DATA_EXTRACTION_OUTPUT


Find the first number that appears after the parameter name (within `context` characters).

Example:
- label_pattern = r"Power Dissipation"
- text "Power Dissipation  150 W" → returns 150.0


In [None]:
def find_value(text: str, label_pattern: str, context: int = 100, dtype=float) -> Optional[float]:
    """Return the first number found shortly after a parameter label.

    Args:
        text: Text to search (matching is case-insensitive and ``.`` also
            matches newlines).
        label_pattern: Regular expression for the parameter name. If it
            contains its own capture group, that group is taken as the value
            and no further number is looked for.
        context: Maximum number of characters allowed between the label and
            the number.
        dtype: Callable applied to the matched number string (e.g. ``float``
            or ``int``).

    Returns:
        The converted number, or ``None`` when the label or a number is not
        found, the pattern is invalid, or the conversion fails.
    """
    flags = re.IGNORECASE | re.DOTALL
    try:
        label_regex = re.compile(label_pattern, flags)
        if label_regex.groups:
            # The label captures the value itself. Appending a second number
            # group would force the label's group to give digits back when no
            # later number follows (e.g. "35" read as 3), so search it alone.
            match = label_regex.search(text)
        else:
            # Lazily look up to {context} characters ahead for a number like
            # 123, 123.45 or .5; a lone "." (sentence period) is not a number.
            pattern = rf"{label_pattern}.{{0,{context}}}?(\d+(?:\.\d+)?|\.\d+)"
            match = re.search(pattern, text, flags)
        if match:
            return dtype(match.group(1))
    except (re.error, ValueError, TypeError, ArithmeticError):
        # Best-effort extraction: an unparsable value counts as "not found".
        pass
    return None

Extract the row of values (e.g. the min/typ/max columns) that follows a parameter label.

In [None]:

def extract_values_by_label(text: str, label: str, expected_count: int = 3) -> Tuple[Optional[float], ...]:
    """Read the run of numbers that directly follows ``label`` in ``text``.

    The label (a regular expression, matched case-insensitively) must be
    followed, after optional whitespace, by whitespace-separated numbers.
    The first ``expected_count`` of them are returned as floats; missing
    positions are filled with ``None``. If nothing matches or an error
    occurs, a tuple of ``expected_count`` ``None`` values is returned.
    """
    try:
        row_regex = rf"{label}\s*\n?\s*((?:-?\d+\.?\d*\s+){{{expected_count - 1},}}[\d.+-]+)"
        found = re.search(row_regex, text, re.IGNORECASE)
        if found:
            tokens = re.findall(r"-?\d+\.?\d*", found.group(1))[:expected_count]
            parsed = tuple(float(token) for token in tokens)
            shortfall = expected_count - len(parsed)
            if shortfall > 0:
                # The last column may hold no digits (e.g. "-"), so pad it.
                parsed += (None,) * shortfall
            return parsed
    except Exception:
        pass
    return (None,) * expected_count

In [None]:
def extract_reverse_value(text: str, label: str, unit: str, occurrence: int = 1) -> Optional[float]:
    """Return the n-th number that follows ``label`` and is followed by ``unit``.

    Only non-alphanumeric characters may stand between the label and the
    number. ``label`` and ``unit`` are inserted into the regular expression
    as-is, and matching is case-insensitive.

    Args:
        text: Text to search.
        label: Regular expression for the parameter name.
        unit: Regular expression for the unit expected after the number.
        occurrence: 1-based index of the match to return.

    Returns:
        The matched number as a float, or ``None`` when there are fewer than
        ``occurrence`` matches, ``occurrence`` is less than 1, or the matched
        text is not a valid number.
    """
    pattern = rf"{label}[^0-9A-Za-z]+([\d.]+)\s*{unit}"
    matches = re.findall(pattern, text, re.IGNORECASE)
    try:
        if occurrence < 1:
            # A non-positive occurrence would otherwise index from the end of
            # the list and silently return the wrong match.
            return None
        return float(matches[occurrence - 1])
    except (IndexError, ValueError, TypeError):
        return None

Function that extracts the maximum temperature (the upper bound of the operating junction and storage temperature range)

In [None]:
def extract_temperature_max(text: str) -> Optional[int]:
    """Return the upper bound of the junction/storage temperature range.

    Looks for "Operating Junction and Storage Temperature" (case-insensitive)
    followed by a range such as "-55 to +175" and returns the second number
    as an int, or ``None`` when no such range is found.
    """
    range_regex = re.compile(
        r"Operating Junction and Storage Temperature[^+0-9\-]+[-–]?\d+\s*(?:to|–)\s*\+?(\d+)",
        re.IGNORECASE,
    )
    hit = range_regex.search(text)
    if hit is None:
        return None
    return int(hit.group(1))

For each PDF in DATA_EXTRACTION_INPUT:
- read the text of every page
- extract the parameter values
- write `<DeviceID>.csv` to DATA_EXTRACTION_OUTPUT

In [None]:
def main() -> None:
    """Extract parameters from every PDF datasheet in DATA_EXTRACTION_INPUT.

    For each ``*.pdf`` file, the text of all pages is concatenated, the
    device parameters are extracted with the regex helpers defined in this
    file, and a one-row CSV named ``<DeviceID>.csv`` (the PDF file name
    without its extension) is written to DATA_EXTRACTION_OUTPUT. A PDF that
    cannot be read, or a CSV that cannot be written, is reported on stdout
    and processing continues with the next file.
    """
    # Make sure the output folder exists.
    os.makedirs(DATA_EXTRACTION_OUTPUT, exist_ok=True)

    for filename in os.listdir(DATA_EXTRACTION_INPUT):

        if not filename.lower().endswith(".pdf"):
            continue

        # Input path and the device ID used to name the CSV.
        pdf_path = os.path.join(DATA_EXTRACTION_INPUT, filename)
        device_id = os.path.splitext(filename)[0]

        # Extract all the text from the PDF.
        try:
            reader = PdfReader(pdf_path)
            # extract_text() may return None, so substitute "" for such pages.
            text = "".join((page.extract_text() or "") for page in reader.pages)
        except Exception as e:
            print(f"[!] Could not read: {pdf_path} ({e})")
            continue

        VGS_th_min, VGS_th_typ, VGS_th_max = extract_values_by_label(
            text, r"Gate Threshold Voltage", expected_count=3
        )

        RDS_on_typ, RDS_on_max = extract_values_by_label(
            text, r"On-State Resistance", expected_count=2
        )[:2]

        Qg_total = extract_values_by_label(
            text, r"Total Gate Charge", expected_count=1
        )[0]

        Qgs = find_value(text, r"Gate to Source Charge[^0-9]*(\d+\.?\d*)")
        Qgd = find_value(text, r"Gate to Drain Charge[^0-9]*(\d+\.?\d*)")

        # The second occurrence of each reverse-recovery value is used.
        Qrr = extract_reverse_value(text, "Reverse Recovery Charge", "nC", occurrence=2)
        Irrm = extract_reverse_value(text, "Peak Reverse Recovery Current", "A", occurrence=2)

        Rth_typ, Rth_max = extract_values_by_label(
            text, r"Thermal Resistance from Junction to Case", expected_count=2
        )[:2]

        Tj_max = extract_temperature_max(text)

        # Output data: one row per device.
        data = {
            "DeviceID": device_id,

            # Basic ratings / thresholds
            "VBR_DSS": find_value(text, r"Drain\s*-\s*Source Voltage", dtype=int),  # e.g., 650
            "VGS_th_min": VGS_th_min,
            "VGS_th_typ": VGS_th_typ,
            "VGS_th_max": VGS_th_max,

            # Conduction / resistance
            "RDS_on_typ": RDS_on_typ,
            "RDS_on_max": RDS_on_max,

            # Charges
            "Qg_total": Qg_total,
            "Qgs": Qgs,
            "Qgd": Qgd,

            # Misc device parameters
            "Rg_int": find_value(text, r"Internal Gate Resistance"),
            "Ciss": find_value(text, r"Input Capacitance", dtype=int),
            "Coss": find_value(text, r"Output Capacitance", dtype=int),
            "Crss": find_value(text, r"Reverse Transfer Capacitance", dtype=int),

            # Switching times
            "td_on": find_value(text, r"Turn[-\s]?On Delay Time"),
            "tr": find_value(text, r"Rise Time"),
            "td_off": find_value(text, r"Turn[-\s]?Off Delay Time"),
            "tf": find_value(text, r"Fall Time"),

            # Energies
            "Eon": find_value(text, r"\bEON\b[^0-9]*(\d+\.?\d*)"),
            "Eoff": find_value(text, r"\bEOFF\b[^0-9]*(\d+\.?\d*)"),

            # Diode recovery
            "Qrr": Qrr,
            "trr": find_value(text, r"Reverse Recovery Time[^a-zA-Z0-9]*(\d+)", dtype=int),
            "Irrm": Irrm,

            # Thermal
            "Rth_JC_typ": Rth_typ,
            "Rth_JC_max": Rth_max,
            "Tj_max": Tj_max,

            # Power
            "Power_dissipation": find_value(text, r"Power Dissipation", dtype=int),
        }

        # Write the single-row table for this device.
        df = pd.DataFrame([data])
        output_csv = os.path.join(DATA_EXTRACTION_OUTPUT, f"{device_id}.csv")
        try:
            df.to_csv(output_csv, index=False)
            print(f"Extracted: {output_csv}")
        except Exception as e:
            print(f"Failed to write CSV for {device_id}: {e}")

Finally, run main()

In [None]:
# Run the extraction only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()