#Importing Libraries

In [None]:
!pip install fasttext
!pip install sentence-transformers
!pip install optuna



In [None]:
# Standard library imports
import ast
import os
import re

# Third-party library imports
import fasttext
import numpy as np
import optuna
import pandas as pd
from IPython.display import display

# Google Colab specific
from google.colab import drive

# NLTK imports
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Scikit-learn imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

# Other ML imports
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier

# NLTK downloads (consolidated): fetch every NLTK resource up front so a fresh runtime has them
nltk.download('stopwords')  # stop-word lists read through stopwords.words('english')
nltk.download('wordnet')  # data used by WordNetLemmatizer
nltk.download('punkt')  # tokenizer data — presumably for word_tokenize; TODO confirm it is still needed
nltk.download('omw-1.4')  # Open Multilingual WordNet (needed for lemmatizer)
nltk.download('punkt_tab')  # NOTE(review): likely the newer tokenizer tables replacing 'punkt' — confirm

ModuleNotFoundError: No module named 'fasttext'

#Loading Data



In [None]:
# Mount Google Drive; force_remount replaces any mount left over from an earlier run.
drive.mount('/content/drive', force_remount=True)

# Read the raw challenge dataset from Drive and preview the first rows.
df = pd.read_csv(
    '/content/drive/MyDrive/Veridion application/ml_insurance_challenge.csv'
)
display(df.head())

Mounted at /content/drive


Unnamed: 0,description,business_tags,sector,category,niche
0,Welchcivils is a civil engineering and constru...,"['Construction Services', 'Multi-utilities', '...",Services,Civil Engineering Services,Other Heavy and Civil Engineering Construction
1,"Kyoto Vegetable Specialists Uekamo, also known...","['Wholesale', 'Dual-task Movement Products', '...",Manufacturing,Fruit & Vegetable - Markets & Stores,"Frozen Fruit, Juice, and Vegetable Manufacturing"
2,Loidholdhof Integrative Hofgemeinschaft is a c...,"['Living Forms', 'Farm Cafe', 'Fresh Coffee', ...",Manufacturing,Farms & Agriculture Production,All Other Miscellaneous Crop Farming
3,PATAGONIA Chapa Y Pintura is an auto body shop...,"['Automotive Body Repair Services', 'Interior ...",Services,Auto Body Shops,"Automotive Body, Paint, and Interior Repair an..."
4,Stanica WODNA PTTK Swornegacie is a cultural e...,"['Cultural Activities', 'Accommodation Service...",Services,Boat Tours & Cruises,"Scenic and Sightseeing Transportation, Water"


#Main data preprocessing

##General statistics

In [None]:
# Print dtypes and non-null counts per column, then show summary statistics for all columns.
df.info()
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9494 entries, 0 to 9493
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   description    9482 non-null   object
 1   business_tags  9494 non-null   object
 2   sector         9467 non-null   object
 3   category       9467 non-null   object
 4   niche          9494 non-null   object
dtypes: object(5)
memory usage: 371.0+ KB


Unnamed: 0,description,business_tags,sector,category,niche
count,9482,9494,9467,9467,9494
unique,9477,9062,7,450,957
top,The company is a business analysis and English...,[],Manufacturing,Farms & Agriculture Production,Apiculture
freq,3,363,4005,237,10


## Text Preprocessing
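Function to clean text: convert to lowercase, collapse every run of whitespace into a single space, and strip leading/trailing whitespace. Non-string values are left unchanged.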

In [None]:
def clean_text(text):
    """Normalise one cell value.

    Strings are lower-cased, every run of whitespace is collapsed into a single
    space, and leading/trailing whitespace is removed. Any non-string value
    (NaN, numbers, lists, ...) is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    collapsed = re.sub(r'\s+', ' ', text.lower())
    return collapsed.strip()

# Apply the cleaning function to every cell of the dataframe.
# DataFrame.applymap is deprecated in recent pandas in favour of DataFrame.map; look the
# method up on the class (not the instance) so a column named "map" can never shadow it,
# and fall back to applymap on older pandas versions.
if hasattr(pd.DataFrame, 'map'):
    df = df.map(clean_text)
else:
    df = df.applymap(clean_text)
df.head(3)

  df = df.applymap(clean_text)


Unnamed: 0,description,business_tags,sector,category,niche
0,welchcivils is a civil engineering and constru...,"['construction services', 'multi-utilities', '...",services,civil engineering services,other heavy and civil engineering construction
1,"kyoto vegetable specialists uekamo, also known...","['wholesale', 'dual-task movement products', '...",manufacturing,fruit & vegetable - markets & stores,"frozen fruit, juice, and vegetable manufacturing"
2,loidholdhof integrative hofgemeinschaft is a c...,"['living forms', 'farm cafe', 'fresh coffee', ...",manufacturing,farms & agriculture production,all other miscellaneous crop farming


## Handling Duplicates

In [None]:
# Look for rows that share a description with another row.
# duplicated() also flags the rows whose description is NaN, so those are filtered out.
shares_description = df.duplicated(subset=['description'], keep=False)
dup_subset = df[shares_description & df['description'].notna()]

print(f"Possible semantic duplicates: {len(dup_subset)}")
# Sorting by description places the matching rows next to each other for inspection
dup_subset.sort_values(by='description')


Possible semantic duplicates: 9


Unnamed: 0,description,business_tags,sector,category,niche
2784,mahkota genteng is a company that specializes ...,"['concrete pits manufacturer', 'full-acrylic w...",manufacturing,ready mix concrete supplier,other concrete product manufacturing
5770,mahkota genteng is a company that specializes ...,"['concrete pits manufacturer', 'full-acrylic w...",manufacturing,ready mix concrete supplier,other concrete product manufacturing
884,mvh containerparts is a leading supplier of co...,"['container spare parts', 'in-house production...",manufacturing,metal storage tanks,metal can manufacturing
4931,mvh containerparts is a leading supplier of co...,"['container manufacturing', 'container spare p...",manufacturing,metal storage tanks,metal can manufacturing
4406,pp electronics (china) official website is a c...,[],manufacturing,plastics products,all other plastics product manufacturing
8163,pp electronics (china) official website is a c...,[],manufacturing,plastics products,all other plastics product manufacturing
706,the company is a business analysis and english...,"['law firm specializing in insurance law', 'fi...",services,title abstract & settlement offices,title abstract and settlement offices
2640,the company is a business analysis and english...,"['historical education', 'historical research'...",services,newspapers & magazines,periodical publishers
9154,the company is a business analysis and english...,"['research and development', 'management consu...",manufacturing,chemicals,industrial gas manufacturing


As I suspected, there are some complete duplicates and some that match only by description. I will remove the ones that have fewer business tags. Moreover, there are some entries with no description or with the same description, but upon manual inspection, they seem to belong to different companies.

In [None]:
# Row labels of the redundant copies chosen during the manual inspection above.
redundant_rows = [884, 2784, 4406]
# errors='ignore' keeps this cell re-runnable: once the rows are gone, a second run is a
# no-op instead of raising KeyError.
df = df.drop(index=redundant_rows, errors='ignore')
# Show what still shares a description (rows with a missing description are excluded).
df[df.duplicated(subset=['description'], keep=False)].dropna(subset=['description'])

Unnamed: 0,description,business_tags,sector,category,niche
706,the company is a business analysis and english...,"['law firm specializing in insurance law', 'fi...",services,title abstract & settlement offices,title abstract and settlement offices
2640,the company is a business analysis and english...,"['historical education', 'historical research'...",services,newspapers & magazines,periodical publishers
9154,the company is a business analysis and english...,"['research and development', 'management consu...",manufacturing,chemicals,industrial gas manufacturing


In [None]:
# Row/column count after removing the duplicate rows
df.shape

(9491, 5)

##Getting rid of stop words + Lemmatization
I assume these steps might be skipped when working with big data, as a bigger vocabulary would be beneficial there and these two processes can take a lot of time.

In [None]:
# English stop words as a set, so each membership test is a hash lookup
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stop_words` removed."""
    kept = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept)

# Strip stop words from both text columns; missing values are passed through untouched
for text_column in ('description', 'niche'):
    df[text_column] = df[text_column].apply(
        lambda value: value if pd.isna(value) else remove_stopwords(value)
    )

In [None]:

lemmatizer = WordNetLemmatizer()

def _lemmatize_words(text):
    """Lemmatize each whitespace-separated token of `text` and re-join with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())

# Missing descriptions are left as they are
df['description'] = df['description'].apply(
    lambda value: value if pd.isna(value) else _lemmatize_words(value)
)

#Missing values handling

First, inspect rows with at least three missing values, because those would be too hard to manage so I will get rid of them.

There are none, which is good.

In [None]:
# Rows in which three or more columns are missing
df[df.isna().sum(axis=1) >= 3]


Unnamed: 0,description,business_tags,sector,category,niche


Then, I look at the ones with 2 missing values. I notice that for all of them, sector and category columns are missing together.

In [None]:
# Rows with two or more missing values: count the gaps per row once, then reuse the filter
missing_per_row = df.isnull().sum(axis=1)
rows_missing_two_plus = df[missing_per_row >= 2]
print(rows_missing_two_plus.shape)
rows_missing_two_plus

(27, 5)


Unnamed: 0,description,business_tags,sector,category,niche
207,kelle kutsugeras costume designer stylist work...,"['costume design services', 'television produc...",,,specialized design services
607,absinthe alice costume design company founded ...,"['everyday apparel', 'costume design services'...",,,specialized design services
963,everest oxygen uk-based company exclusively di...,"['manufacturing', 'high altitude oxygen system...",,,home health equipment rental
1102,m diesel company located south shore montreal ...,"['heavy vehicle repair services', 'agricultura...",,,commercial industrial machinery equipment (exc...
1210,oxygo guadalajara company specializes sale ren...,"['wound chairs', 'medical equipment rental', '...",,,home health equipment rental
1917,cloud peak energy inc. coal mining company bas...,"['energy production', 'coal mining', 'mining r...",,,surface coal mining
2270,indoshred mumbai-based company specializes pro...,"['secure document shredding', 'door-to-door do...",,,support services
2321,elite control system ltd. ukrainian company sp...,"['gps monitoring platform development', 'secur...",,,"search, detection, navigation, guidance, aeron..."
2941,gorillaz recycling outlet shop specializes on-...,"['furniture pieces', 'recycling and outlet sto...",,,recyclable material merchant wholesalers
3449,rescue-tech company specializes pick-up drop-o...,"['cold and hot metal sawing', 'cold and hot me...",,,recyclable material merchant wholesalers


Finally, by looking at the ones that are missing one entry, I notice that only the description is missing.

In [None]:
# Rows with exactly one missing value
rows_missing_one = df[df.isnull().sum(axis=1).eq(1)]
print(rows_missing_one.shape)
rows_missing_one


(12, 5)


Unnamed: 0,description,business_tags,sector,category,niche
1032,,"['community engagement services', 'crime preve...",government,courthouses,correctional institutions
2137,,"['whatsapp messaging service', 'android and ip...",services,cable & internet providers,agents wireless telecommunications services
4017,,"['high level of customer satisfaction', 'wood ...",manufacturing,building material manufacturers,fabricated structural metal manufacturing
4399,,"['distribution of first-need items', 'social m...",services,homelessness shelter,temporary shelters
5660,,"['lottery ticket sales', 'lotteries and promot...",services,loans & financing,consumer lending
6098,,[],services,auto services,automotive repair maintenance
6285,,"['journalism services', 'magicians, columnists...",services,event planner services,"promoters performing arts, sports, similar eve..."
6510,,['customized transferees for community and org...,services,"buses, shuttles & local transit",bus motor vehicle transit systems
7920,,"['business directory services', 'business aler...",services,title abstract & settlement offices,title abstract settlement offices
8058,,"['3d rendering services', 'structural detailin...",services,architects & architectural services,drafting services


Now, I want to look at empty business tag lists. They don't appear as NaNs, but as empty lists.

In [None]:
def _parse_tags(raw):
    """Turn the string form of a tag list into a real Python object; non-strings pass through."""
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return raw

df['business_tags'] = df['business_tags'].apply(_parse_tags)
# Check which Python types the column now holds
df['business_tags'].apply(type).value_counts()

Unnamed: 0_level_0,count
business_tags,Unnamed: 1_level_1
<class 'list'>,9491


In [None]:
# Only the count is reported; the rows themselves are expected to be too many to print
has_no_tags = df['business_tags'].apply(lambda tags: isinstance(tags, list) and not tags)
empty_tags_count = has_no_tags.sum()
print(f"Number of rows with empty business_tags: {empty_tags_count}")


Number of rows with empty business_tags: 362


##Filling missing values:
  I can't pull company descriptions out of thin air, so those will be left empty, but I might have a chance to fill the other missing values. I notice that a row is missing either its description or both its sector and category, never all three. Moreover, since the number of NaNs is so small, I can fill them based on the description using a simple model and then manually inspect the results. I would not recommend this with a continuous stream of data, especially for the 'category' column, as there might be new categories not present here.

###Descriptions

In [None]:
# Replace missing descriptions with an empty string.
# Assign the result back to the column: calling fillna(..., inplace=True) on the selected
# column operates on an intermediate object (pandas warns about exactly this) and may leave
# `df` unchanged, which would break the later `df['description'] != ''` filter.
df['description'] = df['description'].fillna('')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['description'].fillna('', inplace=True)


##Sector

In [None]:
# Unique values of the two columns that contain gaps (NaN counts as one unique value).
# `category_unique` is defined here so this cell runs on a fresh kernel.
sector_unique = df['sector'].unique()
category_unique = df['category'].unique()

# Report how many distinct values each label column has
print("No of unique values in 'sector' column:", len(sector_unique))
print("No of unique values in 'category' column:", len(category_unique))
print(len(df['niche'].unique()))

No of unique values in 'sector' column: 8
No of unique values in 'category' column: 451
957


In [None]:
# List the distinct sector values (NaN appears because some rows have no sector)
print(sector_unique)

['services' 'manufacturing' 'wholesale' 'retail' 'government' 'non profit'
 'education' nan]


In [None]:
# The sector model is trained only on rows that have a description.
# (The SentenceTransformer and LabelEncoder instances that used to be created here were never
# used by the fastText pipeline below; creating them only triggered a model download.)
df_with_description = df[df['description'] != '']

# Rows with a known sector form the labelled set; rows without one are the ones to fill.
df_sector = df_with_description[df_with_description['sector'].notna()].copy()
tofill_sector = df_with_description[df_with_description['sector'].isna()].copy()

def combine_features_sector(row):
    """Build the text fed to the sector model from one row.

    Concatenates, separated by single spaces, the string forms of the row's
    'description', 'business_tags' (with every comma replaced by a space) and
    'niche' values.
    """
    tags_text = str(row['business_tags']).replace(',', ' ')
    return ' '.join([str(row['description']), tags_text, str(row['niche'])])

# Apply to both training data and data to be filled: each frame gets a 'combined_features'
# column holding the per-row text produced by combine_features_sector
df_sector['combined_features'] = df_sector.apply(combine_features_sector, axis=1)
tofill_sector['combined_features'] = tofill_sector.apply(combine_features_sector, axis=1)

def write_fasttext_file(df, filename, label_col='sector'):
    """Write `df` to `filename`, one example per line, in fastText's supervised format.

    Each line is ``__label__<label> <text>``, where the label comes from
    `label_col` with spaces replaced by underscores and the text comes from the
    'combined_features' column, stripped and with newlines replaced by spaces.

    Parameters
    ----------
    df : DataFrame with a `label_col` column and a 'combined_features' column.
    filename : path of the UTF-8 text file to create (overwritten if present).
    label_col : column holding the class label; defaults to 'sector', so
        existing two-argument calls behave as before.

    Note: a helper with this same name is defined again further down for the
    category model; whichever definition ran last is the one in effect.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        # zip over the two columns instead of iterrows(): same rows, same order
        for raw_label, raw_text in zip(df[label_col], df['combined_features']):
            label = f"__label__{raw_label.replace(' ', '_')}"
            text = raw_text.strip().replace('\n', ' ')
            f.write(f"{label} {text}\n")

# Split into train and test (without stratification to avoid rare categories issue)
# 80/20 split; random_state is fixed so the same rows land in each part on every run
train_df, test_df = train_test_split(df_sector, test_size=0.2, random_state=42)

# Write both parts to disk in the text format read by fasttext.train_supervised / model.test
write_fasttext_file(train_df, "fasttext_train_sector.txt")
write_fasttext_file(test_df, "fasttext_test_sector.txt")

def objective_sector(trial):
    """Optuna objective for the sector classifier.

    Samples one fastText hyper-parameter set from `trial`, trains a supervised
    model on "fasttext_train_sector.txt" and returns element [1] of
    `model.test("fasttext_test_sector.txt")`, which this notebook treats as
    accuracy.

    NOTE(review): the score comes from the same test file on which the final
    model is later evaluated, so that final figure is optimistic; a separate
    validation split would avoid this.
    """
    search_space = {}
    search_space['epoch'] = trial.suggest_int('epoch', 10, 100)
    search_space['lr'] = trial.suggest_float('lr', 0.01, 1.0, log=True)
    search_space['wordNgrams'] = trial.suggest_int('wordNgrams', 1, 3)
    search_space['dim'] = trial.suggest_categorical('dim', [50, 100, 150, 200])
    search_space['minCount'] = trial.suggest_int('minCount', 1, 5)

    # verbose=0 keeps fastText silent while tuning
    candidate = fasttext.train_supervised(
        input="fasttext_train_sector.txt",
        verbose=0,
        **search_space,
    )

    return candidate.test("fasttext_test_sector.txt")[1]

# Run optimization for sector.
# A seeded sampler makes the sequence of suggested hyper-parameters repeatable between runs
# (fastText training itself may still vary slightly from run to run).
study_sector = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_sector.optimize(objective_sector, n_trials=30, show_progress_bar=True)

# Show best parameters
print("Best trial for sector:")
trial_sector = study_sector.best_trial
print(f"  Accuracy: {trial_sector.value:.4f}")
print("  Params: ")
for key, value in trial_sector.params.items():
    print(f"    {key}: {value}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[I 2025-04-16 11:19:39,263] A new study created in memory with name: no-name-3d2a06c7-23f7-4c1f-902d-555ad08591b4


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-04-16 11:19:45,152] Trial 0 finished with value: 0.9476467477525119 and parameters: {'epoch': 49, 'lr': 0.1763607055287109, 'wordNgrams': 2, 'dim': 200, 'minCount': 4}. Best is trial 0 with value: 0.9476467477525119.
[I 2025-04-16 11:19:49,019] Trial 1 finished with value: 0.9116869381279746 and parameters: {'epoch': 72, 'lr': 0.01547741551566855, 'wordNgrams': 1, 'dim': 200, 'minCount': 3}. Best is trial 0 with value: 0.9476467477525119.
[I 2025-04-16 11:19:51,302] Trial 2 finished with value: 0.9212057112638815 and parameters: {'epoch': 32, 'lr': 0.13258120083210126, 'wordNgrams': 3, 'dim': 50, 'minCount': 3}. Best is trial 0 with value: 0.9476467477525119.
[I 2025-04-16 11:20:00,470] Trial 3 finished with value: 0.9063987308302486 and parameters: {'epoch': 53, 'lr': 0.047213761801351284, 'wordNgrams': 3, 'dim': 200, 'minCount': 2}. Best is trial 0 with value: 0.9476467477525119.
[I 2025-04-16 11:20:08,750] Trial 4 finished with value: 0.9455314648334214 and parameters: {'epo

In [None]:
# Train final sector model with best parameters found by the Optuna study
best_params_sector = study_sector.best_params

final_model_sector = fasttext.train_supervised(
    input="fasttext_train_sector.txt",
    **best_params_sector,
    verbose=2  # Show training progress
)

# Evaluate on the held-out test file (the 20% split), not on the full dataset.
# NOTE(review): the same file was used to pick the hyper-parameters, so this score is optimistic.
result_sector = final_model_sector.test("fasttext_test_sector.txt")
# The result is indexed as [0] number of examples, [1] reported as accuracy, [2] reported as recall@1
print(f"\nFinal Sector Model Accuracy: {result_sector[1]:.4f}")
print(f"Recall@1: {result_sector[2]}")
print(f"Number of examples: {result_sector[0]}")

# Prepare data to fill: rows with a description but no sector
tofill_sector = df_with_description[df_with_description['sector'].isna()].copy()
tofill_sector['combined_features'] = tofill_sector.apply(combine_features_sector, axis=1)

# Make predictions (top-1 label and its probability for every row)
texts_to_predict_sector = tofill_sector['combined_features'].astype(str).tolist()
labels_sector, probabilities_sector = final_model_sector.predict(texts_to_predict_sector)

# Keep only predictions at or above the confidence threshold.
# The two lists are appended to together, so entry i of `predicted_sectors` always belongs
# to row `filled_indices_sector[i]`. Previously a NaN placeholder was appended for every
# rejected row and the list was then truncated, which shifted labels onto the wrong rows
# whenever a low-confidence row came before a confident one.
SECTOR_CONFIDENCE_THRESHOLD = 0.7
filled_indices_sector = []
predicted_sectors = []

for row_index, label, prob in zip(tofill_sector.index, labels_sector, probabilities_sector):
    if prob[0] >= SECTOR_CONFIDENCE_THRESHOLD:
        # Undo the fastText label encoding: drop the prefix, turn underscores back into spaces
        predicted_sectors.append(label[0].replace('__label__', '').replace('_', ' '))
        filled_indices_sector.append(row_index)

# Update original dataframe; rows below the threshold keep their NaN.
# NOTE: the hand-written label lists in the manual-fill cells are positional and were authored
# against the earlier, misaligned output — re-check them against the rows that now remain.
df.loc[filled_indices_sector, 'sector'] = predicted_sectors

# Results summary
print(f"\nFilled {len(filled_indices_sector)} NA values in sector (≥70% confidence)")
print(f"{df['sector'].isna().sum()} remain unfilled")

# Normalise the various "missing" markers to NaN, then collect the rows still lacking a sector
df['sector'] = df['sector'].replace([None, pd.NA, 'null', 'NULL'], np.nan)
unfilled_sector = df[df['sector'].isna()].copy()
print("\nUnfilled sector rows sample:")
# display() is required here: a bare expression in the middle of a cell is not rendered
display(unfilled_sector[['description', 'business_tags', 'sector', 'niche']])
# Saved for manual inspection; the file holds the *unfilled* rows, hence the name
unfilled_sector.to_csv("unfilled_sector.csv", index=False)


Final Sector Model Accuracy: 0.9545
Recall@1: 0.9545214172395557
Number of examples: 1891

Filled 26 NA values in sector (≥70% confidence)
2 remain unfilled

Unfilled sector rows sample:


###DeepSeek manual filling for sector

In [None]:
# Manually assign sectors based on business description and tags
# The list is positional: entry i is written to the i-th row of `unfilled_sector`, so it must
# have exactly one entry per unfilled row, in that frame's row order.
# NOTE(review): which rows remain unfilled depends on the model's predictions — verify the
# names in the comments below against `unfilled_sector` before running.
unfilled_sector['sector'] = [
    'non profit',    # Hegering Sülze (hunting association) - 95% confidence
    'services'       # Clifford Spulock Lighting - 100% confidence
]
# Write the manual labels back into df; the assignment aligns on the row index
df.loc[df['sector'].isna(), 'sector'] = unfilled_sector['sector']
print(f"{df['sector'].isna().sum()} remain unfilled")

0 remain unfilled


##Category

In [None]:
# Get unique values for 'category' column (NaN counts as one unique value)
category_unique = df['category'].unique()

# Report the category count: this is the Category section, and the sector count belongs to
# the Sector section above
print("No of unique values in 'category' column:", len(category_unique))


NameError: name 'df' is not defined

In [None]:
# First, prepare the training data combining multiple features.
# Rebuild the "has a description" view from the current `df`: the earlier snapshot was taken
# before the sector gaps were filled, so its rows with a missing category would still carry
# a missing sector, and that missing value would end up in the combined feature text.
df_with_description = df[df['description'] != '']

df_category = df_with_description[df_with_description['category'].notna()].copy()
tofill_category = df_with_description[df_with_description['category'].isna()].copy()

def combine_features(row):
    """Build the text fed to the category model from one row.

    Concatenates, separated by single spaces, the string forms of the row's
    'description', 'business_tags' (with every comma replaced by a space),
    'sector' and 'niche' values.
    """
    tags_text = str(row['business_tags']).replace(',', ' ')
    pieces = (row['description'], tags_text, row['sector'], row['niche'])
    return ' '.join(str(piece) for piece in pieces)

# Apply to both training data and data to be filled: each frame gets a 'combined_features'
# column holding the per-row text produced by combine_features
df_category['combined_features'] = df_category.apply(combine_features, axis=1)
tofill_category['combined_features'] = tofill_category.apply(combine_features, axis=1)

def write_fasttext_file(df, filename, label_col='category'):
    """Write `df` to `filename`, one example per line, in fastText's supervised format.

    Each line is ``__label__<label> <text>``, where the label comes from
    `label_col` with spaces replaced by underscores and the text comes from the
    'combined_features' column, stripped and with newlines replaced by spaces.

    Parameters
    ----------
    df : DataFrame with a `label_col` column and a 'combined_features' column.
    filename : path of the UTF-8 text file to create (overwritten if present).
    label_col : column holding the class label; defaults to 'category', so
        existing two-argument calls behave as before.

    Note: this redefines the helper of the same name used for the sector
    model. After this cell has run, re-running the sector cells without first
    re-running their own definition would write category labels.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        # zip over the two columns instead of iterrows(): same rows, same order
        for raw_label, raw_text in zip(df[label_col], df['combined_features']):
            label = f"__label__{raw_label.replace(' ', '_')}"
            text = raw_text.strip().replace('\n', ' ')
            f.write(f"{label} {text}\n")

# Split into train and test (80/20, fixed random_state so the split is repeatable)
# Note: train_df / test_df are reused names — they replace the sector split from earlier
train_df, test_df = train_test_split(df_category, test_size=0.2, random_state=42)

# Write both parts to disk in the text format read by fasttext.train_supervised / model.test
write_fasttext_file(train_df, "fasttext_train_category.txt")
write_fasttext_file(test_df, "fasttext_test_category.txt")

def objective(trial):
    """Optuna objective for the category classifier.

    Samples one fastText hyper-parameter set from `trial`, trains a supervised
    model on "fasttext_train_category.txt" and returns element [1] of
    `model.test("fasttext_test_category.txt")`, which this notebook treats as
    accuracy.

    NOTE(review): the score comes from the same test file on which the final
    model is later evaluated, so that final figure is optimistic; a separate
    validation split would avoid this.
    """
    search_space = {}
    search_space['epoch'] = trial.suggest_int('epoch', 10, 500)
    search_space['lr'] = trial.suggest_float('lr', 0.001, 1.0, log=True)
    search_space['wordNgrams'] = trial.suggest_int('wordNgrams', 1, 5)
    search_space['dim'] = trial.suggest_categorical('dim', [50, 100, 150, 200, 300])
    search_space['minCount'] = trial.suggest_int('minCount', 1, 5)

    # verbose=0 keeps fastText silent while tuning
    candidate = fasttext.train_supervised(
        input="fasttext_train_category.txt",
        verbose=0,
        **search_space,
    )

    return candidate.test("fasttext_test_category.txt")[1]


# Run optimization.
# A seeded sampler makes the sequence of suggested hyper-parameters repeatable between runs
# (fastText training itself may still vary slightly from run to run).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30, show_progress_bar=True)

# Show best parameters
print("Best trial:")
trial = study.best_trial
print(f"  Accuracy: {trial.value:.4f}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")



[I 2025-04-16 11:22:07,475] A new study created in memory with name: no-name-a0ce1d36-2676-4039-958a-7598d33bdaa2


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-04-16 11:24:29,723] Trial 0 finished with value: 0.1496021220159151 and parameters: {'epoch': 499, 'lr': 0.01979383650047403, 'wordNgrams': 4, 'dim': 100, 'minCount': 1}. Best is trial 0 with value: 0.1496021220159151.
[I 2025-04-16 11:26:50,681] Trial 1 finished with value: 0.08488063660477453 and parameters: {'epoch': 442, 'lr': 0.022008630604923498, 'wordNgrams': 5, 'dim': 100, 'minCount': 1}. Best is trial 0 with value: 0.1496021220159151.
[I 2025-04-16 11:29:31,362] Trial 2 finished with value: 0.5145888594164456 and parameters: {'epoch': 274, 'lr': 0.04550415542449943, 'wordNgrams': 2, 'dim': 300, 'minCount': 1}. Best is trial 2 with value: 0.5145888594164456.
[I 2025-04-16 11:29:38,339] Trial 3 finished with value: 0.593103448275862 and parameters: {'epoch': 59, 'lr': 0.13442621111772723, 'wordNgrams': 1, 'dim': 50, 'minCount': 4}. Best is trial 3 with value: 0.593103448275862.
[I 2025-04-16 11:32:19,850] Trial 4 finished with value: 0.6514588859416446 and parameters: {'

In [None]:
# Train final model with best parameters found by the Optuna study
best_params = study.best_params

final_model = fasttext.train_supervised(
    input="fasttext_train_category.txt",
    **best_params,
    verbose=2  # Show training progress
)

# Evaluate on the held-out test file (the 20% split), not on the full dataset.
# NOTE(review): the same file was used to pick the hyper-parameters, so this score is optimistic.
result = final_model.test("fasttext_test_category.txt")
# The result is indexed as [0] number of examples, [1] reported as accuracy, [2] reported as recall@1
print(f"\nFinal Model Accuracy: {result[1]:.4f}")
print(f"Recall@1: {result[2]}")
print(f"Number of examples: {result[0]}")


Final Model Accuracy: 0.6944
Recall@1: 0.6944297082228117
Number of examples: 1885


In [None]:

# Prepare data to fill: rows with a description but no category.
# Taken from the live `df` (not the earlier snapshot) so the sector values filled in above
# are part of the feature text instead of a missing value.
tofill_category = df[(df['description'] != '') & df['category'].isna()].copy()

# Apply confidence threshold (0.6).
# The two lists are appended to together, so entry i of `predicted_categories` always
# belongs to row `filled_indices[i]`. Previously a NaN placeholder was appended for every
# rejected row and the list was then truncated, which shifted labels onto the wrong rows
# whenever a low-confidence row came before a confident one.
CATEGORY_CONFIDENCE_THRESHOLD = 0.6
filled_indices = []
predicted_categories = []

if not tofill_category.empty:  # nothing to predict when the cell is re-run after filling
    tofill_category['combined_features'] = tofill_category.apply(combine_features, axis=1)

    # Make predictions (top-1 label and its probability for every row)
    texts_to_predict = tofill_category['combined_features'].astype(str).tolist()
    labels, probabilities = final_model.predict(texts_to_predict)

    for row_index, label, prob in zip(tofill_category.index, labels, probabilities):
        if prob[0] >= CATEGORY_CONFIDENCE_THRESHOLD:
            # Undo the fastText label encoding: drop the prefix, underscores back to spaces
            predicted_categories.append(label[0].replace('__label__', '').replace('_', ' '))
            filled_indices.append(row_index)

# Update original dataframe; rows below the threshold keep their NaN.
# NOTE: the hand-written category list in the manual-fill cell is positional and was authored
# against the earlier, misaligned output — re-check it against the rows that now remain.
df.loc[filled_indices, 'category'] = predicted_categories
df['category'] = df['category'].replace([None, pd.NA, 'null', 'NULL'], np.nan)
# Results summary
print(f"Filled {len(filled_indices)} NA values (≥60% confidence)")
print(f"{len(tofill_category) - len(filled_indices)} remain unfilled")


Filled 16 NA values (≥60% confidence)
11 remain unfilled


In [None]:
# Rows that still have no category after the model-based fill
unfilled_category = df[df['category'].isna()].copy()
print("\nUnfilled category rows sample:")
# display() is required here: a bare expression in the middle of a cell is not rendered
display(unfilled_category[['description', 'business_tags', 'sector', 'category', 'niche']])

# Saved for manual inspection
unfilled_category.to_csv("unfilled_category.csv", index=False)


Unfilled category rows sample:


In [None]:
def _format_suggestions(labels, probs):
    """Render one row's top-k predictions as 'label(NN%), label(NN%), ...'.

    Works for however many labels the model returned, so a row with fewer than
    three candidates no longer raises IndexError.
    """
    return ", ".join(f"{label}({prob:.0%})" for label, prob in zip(labels, probs))


# Get all unfilled rows
unfilled_data = df[df['sector'].isna() | df['category'].isna()].copy()

# Add the top-3 model suggestions for each unfilled row, when the models exist in this session
if not unfilled_data.empty:
    if 'final_model_sector' in globals():
        sector_texts = unfilled_data.apply(combine_features_sector, axis=1).tolist()
        sector_labels, sector_probs = final_model_sector.predict(sector_texts, k=3)
        unfilled_data['sector_suggestions'] = [
            _format_suggestions(row_labels, row_probs)
            for row_labels, row_probs in zip(sector_labels, sector_probs)
        ]

    # The category model is stored in `final_model`; the previous check looked for
    # `final_model_category`, a name that is never assigned, so this branch never ran.
    if 'final_model' in globals():
        category_texts = unfilled_data.apply(combine_features, axis=1).tolist()
        category_labels, category_probs = final_model.predict(category_texts, k=3)
        unfilled_data['category_suggestions'] = [
            _format_suggestions(row_labels, row_probs)
            for row_labels, row_probs in zip(category_labels, category_probs)
        ]

# Show the rows together with their suggestions
unfilled_data

###DeepSeek manual filling for category

Oxygo Guadalajara (home health equipment rental in niche) → medical supply retailers (95%)

Fits medical equipment rental/sales, though home health care would be better if available.

Cloud Peak Energy (surface coal mining in niche) → mining & gas exploration (90%)

Direct match for coal mining.

Indoshred (support services in niche) → business support services (85%)

Secure shredding aligns with business support.

Elite Control System → alarms, surveillance & security systems (90%)

GPS monitoring and security systems fit well.

Gorillaz Recycling (recyclable material merchant wholesalers in niche) → salvage merchandise & thrift stores (80%)

Alternative: buyback shops (but salvage fits better for their operations).

Joleen Garnett / Andrea Meidrie / Indian Designz → specialized design services (100%)

Perfect for costume/fashion/embroidery design.

Bess Company (home health equipment rental in niche) → medical supply retailers (95%)

Medical equipment sales/rental.

Soho Office (private mail centers in niche) → private mail centers (95%)

Exact match for printing/shipping/mailbox services.

Hegering Sülze (finfish fishing in niche) → hunting & fishing - services & supplies (85%)

Hunting association fits here.

Kohl Recycling / Autowell Motor → scrap metals (95%)

Both are scrap/recycling businesses.

Notes:
Confidence 100%: Perfect matches (e.g., design services, private mail centers).

Confidence 90-95%: Very close but slightly broad (e.g., medical supply retailers vs. home health care).

Confidence 80-85%: Reasonable but not perfect (e.g., salvage merchandise for Gorillaz).

In [None]:
# Manually assign categories (using business description, tags, and niche)
# The list is positional: entry i is written to the i-th row of `unfilled_category`, so it
# must have exactly one entry per unfilled row, in that frame's row order.
# NOTE(review): which rows remain unfilled depends on the model's predictions — verify the
# company names in the comments below against `unfilled_category` before running.
unfilled_category['category'] = [
    'medical supply retailers',          # oxygo guadalajara (95% confidence)
    'mining & gas exploration',          # Cloud Peak Energy (90%)
    'business support services',         # Indoshred (85%)
    'alarms, surveillance & security systems',  # Elite Control System (90%)
    'salvage merchandise & thrift stores',      # Gorillaz Recycling (80%)
    'specialized design services',       # Joleen Garnett (100%)
    'business support services',         # AirConnect (90%)
    'specialized design services',       # Andrea Meidrie (100%)
    'specialized design services',       # Indian Designz (100%)
    'medical supply retailers',          # Bess Company (95%)
    'private mail centers',              # Soho Office (95%)
    'specialized design services',       # Lighting design company (100%)
    'hunting & fishing - services & supplies',  # Hegering Sülze (85%)
    'specialized design services',       # Clifford Spulock (100%)
    'private mail centers',              # Inbox.lv (100%)
    'scrap metals',                     # Kohl Recycling (95%)
    'scrap metals',                     # Autowell Motor (95%)
    'specialized design services'        # Designer Los Angeles (100%)
]


In [None]:
# Write the manual labels back into df; the assignment aligns on the row index
df.loc[df['category'].isna(), 'category'] = unfilled_category['category']
# Should report 0 once every gap has a manual label
print(f"{df['category'].isna().sum()} remain unfilled")

0 remain unfilled


#Save clean dataset

In [None]:
# Destination of the cleaned dataset on the mounted Drive
file_path = '/content/drive/MyDrive/Veridion application/clean_data.csv'
df.to_csv(file_path, index=False)

# Confirm the write by checking that the file is now present
saved_ok = os.path.exists(file_path)
print("File saved successfully!" if saved_ok else "Error: File not found.")

File saved successfully!
