In [3]:
!pip install fasttext
!pip install sentence_transformers


Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m71.7/73.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313506 sha256=87332e7602df61

In [3]:
# Standard library imports
import ast
import os
import re

# Third-party library imports

import numpy as np
import pandas as pd
from IPython.display import display
from collections import defaultdict

# Google Colab specific
from google.colab import drive

# NLTK imports
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Scikit-learn imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

# Other ML imports
from sentence_transformers import SentenceTransformer

# NLTK downloads (consolidated)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')  # Open Multilingual WordNet (needed for lemmatizer)
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
drive.mount('/content/drive', force_remount = True)
df_labels = pd.read_excel('/content/drive/MyDrive/Veridion application/insurance_taxonomy.xlsx')

Mounted at /content/drive


In [None]:
def clean_labels(label_series):
    """
    Clean labels and descriptions by:
    - Converting to lowercase
    - Removing special characters
    - Replacing multiple spaces with single space
    - Stripping whitespace

    Parameters:
    label_series (pd.Series): Series containing labels to be cleaned

    Returns:
    list: Cleaned labels as a list
    """

    # Convert to string and handle NaN values
    labels = label_series['label'].dropna().astype(str).unique().tolist()

    cleaned_labels = []
    for label in labels:
        # Convert to lowercase
        label = label.lower()

        # Remove special characters (keep letters, numbers, spaces, hyphens, and slashes)
        label = re.sub(r"[^a-z0-9 \-/]", "", label)

        # Replace multiple spaces with single space
        label = re.sub(r"\s+", " ", label)

        # Strip leading/trailing whitespace
        label = label.strip()

        # Tokenize and lemmatize each word
        words = word_tokenize(label)


        cleaned_labels.append(label)

    # Remove duplicates that might have been created during cleaning
    cleaned_labels = list(set(cleaned_labels))

    return cleaned_labels

In [None]:
labels = clean_labels(df_labels)
print(labels)

['pet food manufacturing', 'spray painting services', 'windows and doors installation', 'window and door manufacturing', 'hvac inspections', 'human resources services', 'residential plumbing services', 'management consulting', 'boiler installation services', 'gas manufacturing services', 'online marketing services', 'public relations services', 'ornamental plant nurseries', 'tree services - pruning / removal', 'grain handling machinery installation', 'cable installation services', 'carpet manufacturing services', 'commercial plumbing services', 'stationery manufacturing', 'grain processing services', 'carpentry services', 'road and highway construction', 'water treatment services', 'elevator installation services', 'sheet metal services', 'furniture manufacturing', 'dock and pier construction', 'commercial communication equipment installation', 'agricultural equipment services', 'strategic planning services', 'low-rise glass installation', 'vacant building management', 'testing and ins

In [None]:
def find_similar_labels(labels, similarity_threshold=0.85):
    """
    Find similar labels in a list using cosine similarity of TF-IDF vectors.

    Args:
        labels: List of label strings
        similarity_threshold: Minimum similarity score to consider as duplicate (0-1)

    Returns:
        Dictionary of {label: [similar_labels]} pairs
    """
    # Create TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(labels)

    # Calculate pairwise similarities
    similarities = cosine_similarity(tfidf_matrix)

    # Find similar pairs above threshold
    duplicates = defaultdict(list)
    n = len(labels)

    for i in range(n):
        for j in range(i+1, n):
            if similarities[i][j] > similarity_threshold:
                duplicates[labels[i]].append(labels[j])

    return dict(duplicates)


In [None]:
similar_pairs = find_similar_labels(labels, similarity_threshold=0.65)
print("Potential duplicates:")
for label, similar in similar_pairs.items():
    print(f"{label}: {similar}")

Potential duplicates:
spray painting services: ['painting services']
management consulting: ['consulting services']
gas manufacturing services: ['gas installation services']
online marketing services: ['marketing services']
commercial communication equipment installation: ['residential communication equipment installation']
low-rise glass installation: ['high-rise glass installation', 'low-rise signage installation']
commercial driveway construction: ['commercial construction services', 'residential driveway construction']
high-rise glass installation: ['high-rise signage installation']
swimming pool installation services: ['swimming pool maintenance services']
field welding services: ['welding services']
high-rise foundation construction: ['low-rise foundation construction']
gas installation services: ['medical gas installation services']
commercial drain cleaning: ['residential drain cleaning']
low-rise signage installation: ['high-rise signage installation']
food processing services

In [None]:
df_labels = pd.read_csv('/content/drive/MyDrive/Veridion application/deepseek_labels_with_desc.csv')

In [None]:
def clean_text(text):
    if isinstance(text, str):  # Ensure that the text is a string before applying the cleaning
        # Convert to lowercase
        text = text.lower()
        # Remove double spaces
        text = re.sub(r'\s+', ' ', text)
        # Strip leading/trailing spaces
        text = text.strip()
    return text

# Apply the cleaning function to all columns in the dataframe
df_labels = df_labels.applymap(clean_text)
df_labels.head()

  df_labels = df_labels.applymap(clean_text)


Unnamed: 0,label,label_description
0,agricultural equipment services,agricultural equipment services: insurance for...
1,soil nutrient application services,soil nutrient application services: insurance ...
2,pesticide application services,pesticide application services: insurance for ...
3,ornamental plant nurseries,ornamental plant nurseries: insurance for nurs...
4,landscaping services,landscaping services: insurance for profession...


In [None]:
# Convert list to DataFrame and save as CSV
df_labels.to_csv(
    '/content/drive/MyDrive/Veridion application/clean_labels+description.csv',
    index=False
)

In [None]:
# Check if the file exists
file_path = '/content/drive/MyDrive/Veridion application/clean_labels+description.csv'  # Adjust extension
if os.path.exists(file_path):
    print("File saved successfully!")
else:
    print("Error: File not found.")

File saved successfully!


In [5]:
df_labels = pd.read_csv('/content/drive/MyDrive/Veridion application/clean_labels+description.csv')
df_labels.head(3)

Unnamed: 0,label,label_description
0,agricultural equipment services,agricultural equipment services: insurance for...
1,soil nutrient application services,soil nutrient application services: insurance ...
2,pesticide application services,pesticide application services: insurance for ...


In [6]:
from sklearn.manifold import TSNE
import plotly.express as px
from sentence_transformers import SentenceTransformer
st_model = SentenceTransformer('all-mpnet-base-v2', device='cuda')
# Project label embeddings to 2D
embeddings = st_model.encode(df_labels['label_description'])
tsne = TSNE(n_components=2)
projected = tsne.fit_transform(embeddings)

# Plot
fig = px.scatter(x=projected[:,0], y=projected[:,1],
                 text=df_labels['label'])
fig.update_traces(textposition='top center')
fig.show()