## This script extracts the tables from the PDF datasheet of each SiC MOSFET and saves them as CSV files

### import libraries 
important:
pdfplumber 
regular expressions
pandas 
os
re

In [None]:
import os
import sys
import pdfplumber
import pandas as pd
import re

# The input and output paths are defined in the `config` module.
# The directory three levels above this file is put first on sys.path so that
# `config` can be imported (presumably that is where config.py lives - confirm).
# NOTE(review): `__file__` is normally undefined inside a notebook kernel, so
# this line likely only works when the code is run as a script - confirm.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
from config import DATA_TABLES_INPUT, DATA_TABLES_OUTPUT

# Create the output directory up front; exist_ok=True makes this a no-op if it already exists.
os.makedirs(DATA_TABLES_OUTPUT, exist_ok=True)

In [None]:
# Headings of the datasheet tables we want to extract, used as target keywords.
# They are written in lowercase because find_matching_heading() lowercases each
# text line before checking whether it contains one of these keywords.
target_keywords = [
    "maximum ratings", 
    "electrical characteristics",
    "reverse diode", 
    "thermal characteristics"
]

def clean_line_encoding(line):
    """Return *line* with garbled degree-Celsius markers rewritten as "°C".

    The variants are substituted one after another in a fixed order, so a
    sequence produced by an earlier substitution can still be caught by a
    later one.
    """
    for garbled in ("ËšC", "˚C", "Â°C"):
        line = line.replace(garbled, "°C")
    return line

# Remove extra spaces in column names and table cells
def clean_dataframe(df):
    """Return a cleaned copy of *df*; the input DataFrame is left untouched.

    Column names are converted to strings and have every whitespace character
    removed. Non-null cells are converted to strings, runs of whitespace are
    collapsed to a single space and the result is stripped. Null cells are
    passed through unchanged.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame with cleaned column names and cell values.
    """
    # Work on a copy so that renaming the columns does not silently modify
    # the caller's DataFrame.
    cleaned = df.copy()
    cleaned.columns = [re.sub(r"\s+", "", str(col)) for col in df.columns]

    def _tidy_cell(cell):
        # Keep missing values as they are instead of turning them into "None"/"nan".
        if pd.isnull(cell):
            return cell
        return re.sub(r"\s+", " ", str(cell)).strip()

    return cleaned.apply(lambda col: col.map(_tidy_cell))


### Finding the heading with the keywords 

In [None]:

# Look for the first text line that contains one of our target keywords
def find_matching_heading(text_lines):
    """Return the first line of *text_lines* that mentions a target keyword.

    Each line is encoding-cleaned, stripped and lowercased before it is tested
    for containing any entry of ``target_keywords``. The matching line is
    returned stripped and encoding-cleaned, with its original letter case.
    Returns None when no line matches.
    """
    for raw_line in text_lines:
        normalized = clean_line_encoding(raw_line).strip().lower()
        if any(keyword in normalized for keyword in target_keywords):
            return clean_line_encoding(raw_line.strip())
    return None


### Finding the correct headers and the tables on each page of the PDF

In [None]:


def standardize_columns(df):
    """Rename the columns of *df* in place to Col1, Col2, ... and return *df*.

    The same DataFrame object that was passed in is returned.
    """
    column_count = len(df.columns)
    df.columns = ["Col" + str(position) for position in range(1, column_count + 1)]
    return df

# Process every PDF datasheet found in the input directory: collect the tables
# of each page that contains a target heading and write them to one CSV per PDF.
for filename in os.listdir(DATA_TABLES_INPUT):
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(DATA_TABLES_INPUT, filename)
    # Swap only the real extension. The filter above is case-insensitive, so a
    # case-sensitive str.replace(".pdf", ".csv") would leave "X.PDF" unchanged
    # and would also rewrite a ".pdf" that appears in the middle of the name.
    csv_name = os.path.splitext(filename)[0] + ".csv"
    output_path = os.path.join(DATA_TABLES_OUTPUT, csv_name)
    collected_tables = []  # (heading, DataFrame) pairs, in page order

    try:
        # Open the file with pdfplumber and look for tables page by page
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Extract the page text only once; extract_text() may return
                # nothing for a page, in which case there are no lines to scan.
                page_text = page.extract_text()
                text_lines = page_text.split("\n") if page_text else []

                # The heading depends only on the page text, so it is looked up
                # once per page; every table on the page gets the same heading.
                heading = find_matching_heading(text_lines)
                if not heading:
                    # No target section on this page: skip table extraction.
                    continue

                for table in page.extract_tables():
                    # Ignore empty tables and tables with fewer than 2 rows or 2 columns
                    if not table or len(table) < 2 or len(table[0]) < 2:
                        continue

                    try:
                        # First row is used as the header
                        df = pd.DataFrame(table[1:], columns=table[0])
                    except Exception:
                        # Header row cannot be used as column labels:
                        # keep every row as data and use generic column names.
                        df = pd.DataFrame(table)
                        df = standardize_columns(df)

                    # Clean table contents
                    df = clean_dataframe(df)
                    df = df.apply(
                        lambda col: col.map(
                            lambda x: clean_line_encoding(str(x)) if pd.notnull(x) else x
                        )
                    )

                    # Save the table together with its heading
                    collected_tables.append((heading, df))

    # Report the failure and move on to the next file; nothing is written for this PDF
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        continue

    # Save all collected tables for this PDF to a CSV
    if collected_tables:
        all_csv_data = []
        for title, df in collected_tables:
            # Heading row.
            # NOTE(review): this frame has the single column label 0 while the
            # tables use Col1, Col2, ...; after concat the heading therefore
            # sits in its own leading column and table cells start in the
            # second CSV column. Layout kept as is - confirm whether intended.
            all_csv_data.append(pd.DataFrame([[title]]))

            df = standardize_columns(df)
            all_csv_data.append(df)

            # Blank separator row after each table
            all_csv_data.append(pd.DataFrame([[""] * len(df.columns)], columns=df.columns))

        # Combine everything into one CSV file
        final_df = pd.concat(all_csv_data, ignore_index=True)
        final_df.to_csv(output_path, index=False, header=False)
        print(f"Extracted: {filename} to {output_path}")
    else:
        print(f"No matching tables found in {filename}")
