<a href="https://colab.research.google.com/github/wardayX/cyhack/blob/main/productname_to_gst_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup & Imports  
Install and import all required libraries.


In [None]:
!pip install pandas pdfplumber fuzzywuzzy python-Levenshtein sentence-transformers

import pandas as pd
import pdfplumber
from fuzzywuzzy import process as fuzzy_process
from sentence_transformers import SentenceTransformer, util
import torch
import re
from google.colab import files
import io

# Reached only if the install command and every import above succeeded.
print("Libraries installed and imported.")

## File Upload Functions  
Define helper functions to upload the HSN PDF and GST CSV.

In [None]:
def upload_hsn_pdf():
    """Ask the user to upload the HSN code PDF through the Colab file widget.

    Returns:
        A ``(file_name, content)`` pair for the first entry returned by
        ``files.upload()``, or ``(None, None)`` when nothing was uploaded.
    """
    print("Please upload your HSN Code PDF (SL NO, HS CODE, DESCRIPTION columns).")
    uploaded_files = files.upload()
    if uploaded_files:
        # Only the first uploaded entry is used; any others are ignored.
        first_name = next(iter(uploaded_files))
        print(f"Uploaded '{first_name}'")
        return first_name, uploaded_files[first_name]

    print("No file uploaded.")
    return None, None

def upload_gst_csv():
    """Ask the user to upload the GST rates CSV through the Colab file widget.

    Returns:
        A ``(file_name, content)`` pair for the first entry returned by
        ``files.upload()``, or ``(None, None)`` when nothing was uploaded.
    """
    print("\nPlease upload your GST Rates CSV.")
    uploaded_files = files.upload()
    if uploaded_files:
        # Only the first uploaded entry is used; any others are ignored.
        first_name = next(iter(uploaded_files))
        print(f"Uploaded '{first_name}'")
        return first_name, uploaded_files[first_name]

    print("No file uploaded.")
    return None, None

# Confirms that the cell defining both upload helpers ran to completion.
print("File upload functions defined.")

## Parse HSN PDF and Aggregate Hierarchical Descriptions
Extract HS codes and descriptions from the uploaded PDF, then enrich each parent HSN code by concatenating the descriptions of its child codes.


In [None]:
# Text-fallback pattern: a 4-8 digit code at the start of a line, then the description.
_HSN_TEXT_LINE_RE = re.compile(r'^\s*(\d{4,8})\s+(.+)')
_MULTI_SPACE_RE = re.compile(r'\s{2,}')
_HSN_COLUMNS = ['HS_Code_PDF', 'Description_PDF']


def _hsn_records_from_table(table):
    """Return HS code/description records found in one extracted table."""
    if not table:
        # An empty table has no header row to inspect and no data to read.
        return []

    # Collapse whitespace inside each header cell so a wrapped "HS\nCODE"
    # cell is still recognised as the header.
    header_cells = table[0] or []
    header_text = ' | '.join(
        ' '.join(str(cell).split()) for cell in header_cells if cell is not None
    ).upper()
    has_header = 'HS CODE' in header_text and 'DESCRIPTION' in header_text

    records = []
    for row in (table[1:] if has_header else table):
        if not row or len(row) < 3:
            continue
        # Column 0 is the serial number and is not needed.
        hs_code, description = row[1], row[2]
        hs_code = str(hs_code).replace('\n', ' ').strip() if hs_code else None
        description = str(description).replace('\n', ' ').strip() if description else None
        if hs_code and description:
            records.append({'HS_Code_PDF': hs_code, 'Description_PDF': description})
    return records


def _hsn_records_from_text(text):
    """Return HS code/description records parsed line by line from page text."""
    records = []
    for line in text.split('\n'):
        match_hs = _HSN_TEXT_LINE_RE.search(line)
        if not match_hs:
            print(f"Could not parse line: {line}")
            continue
        hs_code = match_hs.group(1).strip()
        description = _MULTI_SPACE_RE.sub(' ', match_hs.group(2).strip())
        if hs_code and description:
            records.append({'HS_Code_PDF': hs_code, 'Description_PDF': description})
    return records


def parse_hsn_pdf(pdf_content):
    """
    Parses the HSN PDF to extract HS Codes and Descriptions.
    This is a generic attempt and might need fine-tuning based on your PDF's structure.

    For each page, tables are used when pdfplumber finds any (columns are
    read as serial number, HS code, description); otherwise the page text is
    scanned line by line for a leading 4-8 digit code.

    Args:
        pdf_content: Raw bytes of the PDF file.

    Returns:
        A DataFrame with columns ``HS_Code_PDF`` (non-word characters
        removed, lower-cased, unique, never empty) and ``Description_PDF``.
        An empty DataFrame with those columns is returned when nothing could
        be extracted or when parsing raised an error.
    """
    data = []
    try:
        with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
            total_pages = len(pdf.pages)
            for i, page in enumerate(pdf.pages):
                print(f"Processing PDF page {i+1}/{total_pages}...")
                tables = page.extract_tables()
                if tables:
                    for table in tables:
                        data.extend(_hsn_records_from_table(table))
                    # Pages with tables never fall through to the text parser.
                    continue

                text = page.extract_text()
                if text:
                    data.extend(_hsn_records_from_text(text))

        if not data:
            print("Warning: No data extracted from PDF. PDF parsing might need custom logic for your file format.")
            print("Consider using page.extract_text() and custom regex if tables are not well-structured.")
            return pd.DataFrame(columns=_HSN_COLUMNS)

        df_hsn = pd.DataFrame(data)
        df_hsn['HS_Code_PDF'] = (
            df_hsn['HS_Code_PDF']
            .astype(str)
            .str.replace(r'\W+', '', regex=True)
            .str.strip()
            .str.lower()
        )
        # Codes made only of punctuation become '' after normalisation;
        # dropna() cannot catch those because they are strings, not NaN.
        df_hsn = df_hsn[df_hsn['HS_Code_PDF'] != '']
        df_hsn = df_hsn.drop_duplicates(subset=['HS_Code_PDF'], keep='first')
        print(f"Extracted {len(df_hsn)} unique HSN entries from PDF.")
        return df_hsn
    except Exception as e:
        # Best-effort notebook helper: report the problem and hand back an
        # empty frame so downstream cells can detect the failure.
        print(f"Error parsing PDF: {e}")
        print("Please ensure the PDF is not scanned (image-based) and has extractable text.")
        return pd.DataFrame(columns=_HSN_COLUMNS)

# Confirms that the PDF parsing function above was defined without error.
print("PDF parsing function defined.")

def aggregate_hsn_descriptions(df_hsn_input):
    """Enrich each HS code with the descriptions of the codes nested under it.

    A code is treated as a child of another when it starts with that code and
    is strictly longer. For every code with a non-null description, the
    aggregated text is its own description followed by its children's
    descriptions (in sorted code order), de-duplicated and joined with ". ".

    Args:
        df_hsn_input: DataFrame with ``HS_Code_PDF`` and ``Description_PDF``
            columns. It is not modified.

    Returns:
        A copy sorted by ``HS_Code_PDF`` (as strings, index reset) with an
        added ``Aggregated_Description_PDF`` column. Rows whose code has no
        aggregated text fall back to their own ``Description_PDF``. An empty
        input is returned as an unchanged copy.
    """
    if df_hsn_input.empty:
        print("HSN PDF data is empty. Skipping aggregation.")
        return df_hsn_input.copy()

    print("\nAggregating hierarchical HSN descriptions from PDF data...")
    df_hsn = df_hsn_input.copy()
    df_hsn['HS_Code_PDF'] = df_hsn['HS_Code_PDF'].astype(str)
    # Stable sort keeps input order among duplicate codes, so "first
    # description per code" is deterministic.
    df_hsn.sort_values(by='HS_Code_PDF', kind='stable', inplace=True)
    df_hsn.reset_index(drop=True, inplace=True)

    # One dict lookup per code instead of filtering the frame in the loop.
    description_by_code = {}
    for code, description in zip(df_hsn['HS_Code_PDF'], df_hsn['Description_PDF']):
        description_by_code.setdefault(code, description)

    unique_hs_codes = sorted(description_by_code)
    code_count = len(unique_hs_codes)
    aggregated_descriptions = {}

    for position, parent_hs in enumerate(unique_hs_codes):
        parent_description = description_by_code[parent_hs]
        if pd.isna(parent_description):
            continue

        current_descriptions = [parent_description]
        # In sorted order every code sharing a prefix sits directly after
        # that prefix, so the scan can stop at the first non-matching code.
        for child_position in range(position + 1, code_count):
            child_hs = unique_hs_codes[child_position]
            if not child_hs.startswith(parent_hs):
                break
            child_description = description_by_code[child_hs]
            if pd.notna(child_description):
                current_descriptions.append(child_description)

        # dict.fromkeys drops repeated descriptions while keeping order.
        aggregated_descriptions[parent_hs] = ". ".join(dict.fromkeys(current_descriptions))

    # Assign the filled result back: calling fillna(inplace=True) on a column
    # selection is chained assignment and has no effect under Copy-on-Write.
    aggregated_column = df_hsn['HS_Code_PDF'].map(aggregated_descriptions)
    df_hsn['Aggregated_Description_PDF'] = aggregated_column.fillna(df_hsn['Description_PDF'])

    print("HSN description aggregation complete.")
    return df_hsn