Classifying sectors based on the the BERT embedding of the repository readme and the closest NAICS sector's description BERT embedding. Distance between embeddings calculated using Euclidean distance.

In [2]:
!pip install transformers
!pip install torch

import pandas as pd
import numpy as np
import os
from transformers import BertTokenizer, BertModel
import torch
from google.colab import drive


In [3]:
data_path = "../feature_extraction"
naics = pd.read_csv(data_path+"/NAICS descriptions.csv")
naics

Unnamed: 0,Sector,Definition,Descriptions
0,11,"Agriculture, Forestry, Fishing and Hunting","The Agriculture, Forestry, Fishing and Hunting..."
1,21,"Mining, Quarrying, and Oil and Gas Extraction","The Mining, Quarrying, and Oil and Gas Extract..."
2,22,Utilities,The Utilities sector comprises establishments ...
3,23,Construction,The Construction sector comprises establishmen...
4,31-33,Manufacturing,The Manufacturing sector comprises establishme...
5,42,Wholesale Trade,The Wholesale Trade sector comprises establish...
6,44-45,Retail Trade,The Retail Trade sector comprises establishmen...
7,48-49,Transportation and Warehousing,The Transportation and Warehousing sector incl...
8,51,Information,The Information sector comprises establishment...
9,52,Finance and Insurance,The Finance and Insurance sector comprises est...


In [4]:
descriptions = naics['Descriptions'].tolist()
codes = naics['Sector'].tolist()

In [None]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def embedding_generator_v2(text):
  max_length = 512
  chunks = []
  current_chunk = []
  current_length = 0

  for word in text.split():
      current_chunk.append(word)
      current_length += len(word)

      if current_length >= max_length:
          chunks.append(' '.join(current_chunk))
          current_chunk = []
          current_length = 0

  if current_chunk:
      chunks.append(' '.join(current_chunk))

  all_embeddings = []

  for chunk in chunks:
    tokenized_chunk = tokenizer(chunk, return_tensors="pt")
    with torch.no_grad():
      output = model(**tokenized_chunk)
    embeddings = output.last_hidden_state.mean(dim=1)
    all_embeddings.append(embeddings)

    # Stack the embeddings to get a tensor with shape (num_chunks, embedding_size)
    final_embeddings = torch.stack(all_embeddings)

    # Aggregate embeddings (e.g., average)
    aggregated_embedding = final_embeddings.mean(dim=0)

    return np.array(aggregated_embedding)

In [None]:
bible = {}

for idx,desc in enumerate(descriptions):
    desc_embedding = embedding_generator_v2(desc)
    bible[codes[idx]] = desc_embedding

In [None]:
def closest_sector(verse):
  closest_code = None
  closest_dist = float('inf')
  for codey, vek in bible.items():
    dist = np.linalg.norm(np.array(verse) - np.array(vek))
    # print(codey, dist)
    if dist < closest_dist:
      closest_code = codey
      closest_dist = dist

  return closest_code

In [5]:
corpus = pd.read_csv(data_path+"/Testing_corpus_descriptions.csv")
corpus = corpus.dropna()
corpus

Unnamed: 0,Sector,Description
0,11,Classification of crop disease (yellow rust) f...
1,11,tutorials made for biologists to learn deep le...
2,11,A voting classifier based machine learning sys...
3,11,Online texture triangle visualizer.
4,11,This website provides research support for stu...
...,...,...
89,72,Food or Item Order Management System
90,92,The organized list of awesome @italia projects
91,92,Report.it è un applicativo Android/iOS svilupp...
92,92,🌐 🧑‍🏫 Website for graduate-level course on com...


In [None]:
corpus["BERT Embedding"] = corpus["Description"].apply(embedding_generator_v2)
corpus

Unnamed: 0,Sector,Description,BERT Embedding
0,11,Classification of crop disease (yellow rust) f...,"[[-0.44266263, -0.295407, -0.2782381, -0.07084..."
1,11,tutorials made for biologists to learn deep le...,"[[-0.19462495, -0.12921366, -0.0005651365, 0.2..."
2,11,A voting classifier based machine learning sys...,"[[-0.222521, -0.12153764, 0.037991498, 0.40520..."
3,11,Online texture triangle visualizer.,"[[-0.24195579, -0.43414468, 0.067351244, 0.041..."
4,11,This website provides research support for stu...,"[[-0.22856274, 0.030235507, 0.08255472, 0.1292..."
...,...,...,...
89,72,Food or Item Order Management System,"[[0.06156659, -0.024398966, 0.035033666, 0.075..."
90,92,The organized list of awesome @italia projects,"[[0.3811378, -0.39561918, 0.21778627, 0.122468..."
91,92,Report.it è un applicativo Android/iOS svilupp...,"[[-0.36869088, -0.3951557, 0.6103128, -0.00066..."
92,92,🌐 🧑‍🏫 Website for graduate-level course on com...,"[[-0.20326848, 0.16832393, 0.15740353, 0.05658..."


In [None]:
corpus["Output"] = corpus["BERT Embedding"].apply(closest_sector)
corpus

Unnamed: 0,Sector,Description,BERT Embedding,Output
0,11,Classification of crop disease (yellow rust) f...,"[[-0.44266263, -0.295407, -0.2782381, -0.07084...",21
1,11,tutorials made for biologists to learn deep le...,"[[-0.19462495, -0.12921366, -0.0005651365, 0.2...",61
2,11,A voting classifier based machine learning sys...,"[[-0.222521, -0.12153764, 0.037991498, 0.40520...",42
3,11,Online texture triangle visualizer.,"[[-0.24195579, -0.43414468, 0.067351244, 0.041...",81
4,11,This website provides research support for stu...,"[[-0.22856274, 0.030235507, 0.08255472, 0.1292...",81
...,...,...,...,...
89,72,Food or Item Order Management System,"[[0.06156659, -0.024398966, 0.035033666, 0.075...",72
90,92,The organized list of awesome @italia projects,"[[0.3811378, -0.39561918, 0.21778627, 0.122468...",62
91,92,Report.it è un applicativo Android/iOS svilupp...,"[[-0.36869088, -0.3951557, 0.6103128, -0.00066...",81
92,92,🌐 🧑‍🏫 Website for graduate-level course on com...,"[[-0.20326848, 0.16832393, 0.15740353, 0.05658...",62


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
# Calculate accuracy
accuracy = accuracy_score(corpus['Sector'], corpus['Output'])
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(corpus['Sector'], corpus['Output'], average="macro")
print("Precision:", precision)


# Calculate recall
recall = recall_score(corpus['Sector'], corpus['Output'], average="macro")
print("Recall:", recall)

Accuracy: 0.09574468085106383
Precision: 0.09666666666666665
Recall: 0.09


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
