Classifies each repository into a NAICS sector by comparing the KeyBERT embedding of the repository README with the KeyBERT embedding of each NAICS sector description and choosing the closest sector. Distances between embeddings are calculated as Euclidean distances, and both strict (top-1) and generous (top-3) scoring approaches are explored.

In [None]:
!pip install transformers
!pip install keybert
!pip install scikit-learn

import pandas as pd
import numpy as np
import os
from transformers import BertTokenizer, BertModel
from keybert import KeyBERT
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, log_loss, roc_auc_score


Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the data
# files stored there can be read. force_remount=True asks Colab to mount again
# even if a previous mount is still present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds the input CSV files.
data_path = "/content/drive/My Drive/Clubs/SAAS"
# Also make it the working directory for the rest of the notebook.
os.chdir(data_path)

In [None]:
# Build both CSV locations under data_path, then load each into a DataFrame:
# naics -> sector codes with their descriptions, exam -> labelled README texts.
naics_csv = data_path+"/NAICS descriptions.csv"
exam_csv = data_path+"/Testing corpus.csv"
naics = pd.read_csv(naics_csv)
exam = pd.read_csv(exam_csv)

In [None]:
# Parallel lists taken from the same DataFrame: descriptions[i] is the
# description text belonging to sector codes[i].
codes = naics['Sector'].to_list()
descriptions = naics['Descriptions'].to_list()

In [None]:
# Parallel lists taken from the test corpus: readmes[i] is the README text
# whose labelled sector is correct[i].
readmes = exam['Text'].to_list()
correct = exam['Sector'].to_list()

In [None]:
# Shared KeyBERT instance used for every embedding in this notebook, built on
# the 'all-MiniLM-L6-v2' model (its files are downloaded when this cell runs,
# as the progress output below shows).
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
def embedding_generator(text):
  """Return the KeyBERT document embedding for ``text``.

  ``kw_model.extract_embeddings`` returns a (document, word) embedding pair;
  only the document part is handed back, the word embeddings are discarded.
  """
  document_vectors, _word_vectors = kw_model.extract_embeddings(text)
  return document_vectors

In [None]:
# Lookup table: NAICS sector code -> embedding of that sector's description.
# codes and descriptions are parallel lists, so they are walked in lockstep.
bible = {
    sector_code: embedding_generator(sector_text)
    for sector_code, sector_text in zip(codes, descriptions)
}

In [None]:
def closest_sector(verse):
  """Return the sector code whose description embedding is nearest to ``verse``.

  Args:
    verse: Embedding to classify (anything ``np.array`` accepts) that is
      shape-compatible with the embeddings stored in the global ``bible``.

  Returns:
    The key of ``bible`` whose embedding has the smallest Euclidean (L2) norm
    of the element-wise difference to ``verse``. On ties the sector that
    appears first in ``bible`` wins. ``None`` is returned if ``bible`` is
    empty.
  """
  query = np.array(verse)  # does not change per sector, so convert once
  closest_code = None
  closest_dist = float('inf')
  for codey, vek in bible.items():
    dist = np.linalg.norm(query - np.array(vek))
    # Strict '<' keeps the earliest sector when two distances are equal.
    if dist < closest_dist:
      closest_code = codey
      closest_dist = dist

  return closest_code

In [None]:
def top_3_sectors(text):
  """Return the three sector codes closest to ``text``, nearest first.

  The text is embedded once with ``embedding_generator`` and compared with
  every embedding in the global ``bible`` using the Euclidean (L2) norm of
  the difference.

  Args:
    text: README text to classify.

  Returns:
    A ``(first, second, third)`` tuple of sector codes ordered by increasing
    distance. Sectors at exactly the same distance keep their ``bible``
    order. If ``bible`` holds fewer than three sectors the missing positions
    are filled with ``None``.
  """
  # The text embedding is the same for every sector, so compute it once
  # instead of re-running the model on each loop iteration.
  text_embedding = embedding_generator(text)

  # Keep (distance, code) pairs rather than a dict keyed by distance: two
  # sectors at the same distance would otherwise overwrite each other and the
  # surviving code would be returned twice.
  ranked = sorted(
      ((np.linalg.norm(text_embedding - vek), codey) for codey, vek in bible.items()),
      key=lambda pair: pair[0],  # sort on distance only; stable for ties
  )

  best_codes = [codey for _, codey in ranked[:3]]
  best_codes += [None] * (3 - len(best_codes))  # pad when fewer than 3 sectors
  first, second, third = best_codes
  return first, second, third

In [None]:
def testing():
  score = 0
  max_score = exam.shape[0]*4
  true_labels = []
  first_predicted_labels = []
  generous_predicted_labels = []
  for index, row in exam.iterrows():
    true_label = row['Sector']
    first, second, third = top_3_sectors(row['Text'])
    if true_label == first:
      score += 4
    elif true_label == second:
      score += 2
    elif true_label == third:
      score += 1
    true_labels.append(true_label)
    first_predicted_labels.append(first)
    if true_label == first or true_label==second or true_label==third:
      generous_predicted_labels.append(true_label)
    else:
      generous_predicted_labels.append(first)
  return true_labels, first_predicted_labels, generous_predicted_labels, score, max_score

In [None]:
# Run the evaluation once and keep the label lists and point totals it returns.
true_labels, first_predicted_labels, generous_predicted_labels, score, max_score = testing()

In [None]:
# Points earned under the 4/2/1 weighting for a 1st/2nd/3rd-place hit, out of
# the maximum of 4 points per README.
print(f"The score is {score} out of {max_score}")

The score is 224 out of 368


In [None]:
# Strict scoring: only the single closest sector counts as the prediction.
# zero_division=0 spells out the value scikit-learn already falls back to when
# a per-class score is undefined, so the numbers are unchanged but the
# UndefinedMetricWarning is no longer emitted.
precision = precision_score(true_labels, first_predicted_labels, average="macro", zero_division=0)
recall = recall_score(true_labels, first_predicted_labels, average="macro", zero_division=0)
f1 = f1_score(true_labels, first_predicted_labels, average="macro", zero_division=0)
print("For first_predicted_labels:")
print(f"Accuracy: {accuracy_score(true_labels, first_predicted_labels):.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 score: {f1:.3f}")
# AUC-ROC is not reported: for a multiclass problem roc_auc_score needs
# per-class scores or probabilities, and only hard labels are available here.

For first_predicted_labels:
Accuracy: 0.511
Precision: 0.493
Recall: 0.516
F1 score: 0.493


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Generous scoring: a row counts as correct when the true sector is anywhere
# in the top three. generous_predicted_labels is built using the true label,
# so these figures are an upper bound, not a standalone classifier score.
# zero_division=0 spells out the value scikit-learn already falls back to when
# a per-class score is undefined, so the numbers are unchanged but the
# UndefinedMetricWarning is no longer emitted.
precision = precision_score(true_labels, generous_predicted_labels, average="macro", zero_division=0)
recall = recall_score(true_labels, generous_predicted_labels, average="macro", zero_division=0)
f1 = f1_score(true_labels, generous_predicted_labels, average="macro", zero_division=0)
print("For generous_predicted_labels:")
print(f"Accuracy: {accuracy_score(true_labels, generous_predicted_labels):.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 score: {f1:.3f}")

For generous_predicted_labels:
Accuracy: 0.750
Precision: 0.721
Recall: 0.747
F1 score: 0.723


  _warn_prf(average, modifier, msg_start, len(result))
