Classify each repository into a NAICS sector by comparing the BERT embedding of its README with the BERT embedding of each NAICS sector description and choosing the closest sector. Distances between embeddings are Euclidean.

In [None]:
!pip install transformers
!pip install torch

import pandas as pd
import numpy as np
import os
from transformers import BertTokenizer, BertModel
import torch
from google.colab import drive


Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
# Mount Google Drive at /content/drive so the project files are reachable from
# the Colab file system; force_remount=True is passed so an existing mount is redone.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Project folder on the mounted Drive. The value already ends with "/", and
# later cells append "/<file name>" to it, so the resulting paths contain "//".
data_path = "/content/drive/MyDrive/[Fall 23] DC-Github/Models/"
# Make the project folder the working directory so relative paths resolve there.
os.chdir(data_path)

In [None]:
# Load the NAICS sector table. The displayed frame has the columns
# Sector (code), Definition (sector name) and Descriptions (long text).
naics = pd.read_csv(data_path+"/NAICS descriptions.csv")
naics

Unnamed: 0,Sector,Definition,Descriptions
0,11,"Agriculture, Forestry, Fishing and Hunting","The Agriculture, Forestry, Fishing and Hunting..."
1,21,"Mining, Quarrying, and Oil and Gas Extraction","The Mining, Quarrying, and Oil and Gas Extract..."
2,22,Utilities,The Utilities sector comprises establishments ...
3,23,Construction,The Construction sector comprises establishmen...
4,31-33,Manufacturing,The Manufacturing sector comprises establishme...
5,42,Wholesale Trade,The Wholesale Trade sector comprises establish...
6,44-45,Retail Trade,The Retail Trade sector comprises establishmen...
7,48-49,Transportation and Warehousing,The Transportation and Warehousing sector incl...
8,51,Information,The Information sector comprises establishment...
9,52,Finance and Insurance,The Finance and Insurance sector comprises est...


In [None]:
# Parallel lists read from the same rows of `naics`: the long sector
# description text and the matching sector code.
descriptions, codes = (
    naics[column].tolist() for column in ('Descriptions', 'Sector')
)

In [None]:
from transformers import BertTokenizer, BertModel
# Load the pretrained 'bert-base-uncased' tokenizer and model (fetched on first
# use). Both are module-level names that embedding_generator_v2 reads directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def embedding_generator_v2(text):
  """Embed ``text`` with BERT by mean-pooling over consecutive chunks.

  The text is split on whitespace and words are accumulated into a chunk until
  the summed character length of the chunk's words reaches ``max_length``
  (512). Every chunk is tokenized and run through the module-level
  ``tokenizer`` / ``model``; the token vectors of a chunk are averaged, and the
  per-chunk vectors are then averaged (unweighted) into one embedding for the
  whole text.

  Args:
    text: The string to embed.

  Returns:
    A ``numpy.ndarray`` of shape ``(1, hidden_size)``.

  Raises:
    ValueError: If ``text`` contains no words, so there is nothing to embed.
  """
  max_length = 512
  chunks = []
  current_chunk = []
  current_length = 0

  for word in text.split():
    current_chunk.append(word)
    current_length += len(word)

    if current_length >= max_length:
      chunks.append(' '.join(current_chunk))
      current_chunk = []
      current_length = 0

  # Keep the trailing, partially filled chunk.
  if current_chunk:
    chunks.append(' '.join(current_chunk))

  if not chunks:
    raise ValueError("Cannot embed text that contains no words.")

  all_embeddings = []

  for chunk in chunks:
    # The chunk budget above is measured in characters, not tokens, so truncate
    # defensively: a chunk of very short tokens (or one huge word) could
    # otherwise exceed the number of tokens the model accepts.
    tokenized_chunk = tokenizer(
        chunk, return_tensors="pt", truncation=True, max_length=max_length
    )
    with torch.no_grad():
      output = model(**tokenized_chunk)
    # Mean over the token axis -> one (1, hidden_size) vector per chunk.
    all_embeddings.append(output.last_hidden_state.mean(dim=1))

  # Aggregate only after ALL chunks have been embedded. Doing this (and
  # returning) inside the loop would embed just the first chunk and ignore the
  # rest of the text.
  final_embeddings = torch.stack(all_embeddings)  # (num_chunks, 1, hidden_size)
  aggregated_embedding = final_embeddings.mean(dim=0)  # (1, hidden_size)

  return aggregated_embedding.detach().cpu().numpy()

In [None]:
# Lookup table: NAICS sector code -> BERT embedding of that sector's description.
bible = {
    sector_code: embedding_generator_v2(sector_text)
    for sector_code, sector_text in zip(codes, descriptions)
}

In [None]:
def closest_sector(verse):
  """Return the key of the ``bible`` entry nearest to ``verse``.

  Distance is ``np.linalg.norm`` of the element-wise difference between
  ``verse`` and each stored vector. On a tie the entry encountered first
  wins; ``None`` is returned when no entry has a finite smaller distance
  (for example when ``bible`` is empty).
  """
  best_code, best_dist = None, float('inf')

  for sector_code, sector_vec in bible.items():
    gap = np.linalg.norm(np.array(verse) - np.array(sector_vec))
    if gap < best_dist:
      best_code, best_dist = sector_code, gap

  return best_code

In [None]:
import markdown
from bs4 import BeautifulSoup
def md_to_text(md):
    """Render Markdown ``md`` to HTML and return its text with the tags stripped."""
    rendered = BeautifulSoup(markdown.markdown(md), features='html.parser')
    return rendered.get_text()

import re
import numpy as np
def clean_readme(text):
    """Strip leftover markup from README text.

    Removes, in this order: HTML tags, Markdown image tags (``![...](...)``)
    and Markdown links (``[...](...)``).

    Returns the cleaned string, ``np.nan`` when fewer than 5 characters remain,
    or ``""`` when ``text`` is not a string (e.g. ``None``).
    """
    if not isinstance(text, str):
        return ""

    markup_patterns = (
        r'<.*?>',             # HTML tags
        r'!\[.*?\]\(.*?\)',   # Markdown images; must run before the link pattern
        r'\[.*?\]\(.*?\)',    # Markdown links
    )
    for pattern in markup_patterns:
        text = re.sub(pattern, '', text)

    return text if len(text) >= 5 else np.nan

In [None]:
# Load the labelled test READMEs and reduce each one to plain text.
corpus = pd.read_csv(os.path.join(data_path, "Testing corpus.csv"))
corpus = corpus.dropna()

# Replace newlines with a space (not an empty string) so that words on adjacent
# lines are not glued together, e.g. "farmOS\nfarmOS is" -> "farmOSfarmOS is".
corpus["Text"] = (
    corpus["Text"].apply(md_to_text).str.replace('\n', ' ', regex=False)
)
corpus["Text"] = corpus["Text"].apply(clean_readme)

# clean_readme returns NaN for texts that are too short and "" for non-string
# input; neither can be split or embedded, so drop those rows here.
has_text = corpus["Text"].map(
    lambda value: isinstance(value, str) and bool(value.strip())
).astype(bool)
corpus = corpus[has_text].copy()

# Whitespace-separated word count of each cleaned README.
corpus["Length"] = corpus["Text"].map(lambda value: len(value.split()))
corpus

Unnamed: 0,Sector,Text,Length
0,11,Awesome Agriculture A curated list of awesome ...,849
1,11,farmOSfarmOS is a web-based application for fa...,215
2,11,End Official Support and Maintenance for Tania...,252
3,11,FarmVibes.AI: Multi-Modal GeoSpatial ML Models...,1240
4,11,AgroChainThis repository contains the code for...,1004
...,...,...,...
87,72,The-CookbookThe open source cook bookAlso take...,73
88,92,"""Public Administration Blockchain Certified Do...",261
89,92,"""ServiceAgent:innenThis repository contains th...",797
90,92,Report.itReport.it supports those who have bee...,75


In [None]:
# Embed every cleaned README with embedding_generator_v2 and store the array
# in a new column. This runs the BERT model once per text chunk of each row.
corpus["BERT Embedding"] = corpus["Text"].apply(embedding_generator_v2)
corpus

Unnamed: 0,Sector,Text,Length,BERT Embedding
0,11,Awesome Agriculture A curated list of awesome ...,849,"[[-0.065078385, 0.18178034, 0.4143054, 0.24404..."
1,11,farmOSfarmOS is a web-based application for fa...,215,"[[-0.026097843, 0.124135144, 0.32449847, 0.025..."
2,11,End Official Support and Maintenance for Tania...,252,"[[-0.10966953, -0.126918, 0.4999071, 0.0299372..."
3,11,FarmVibes.AI: Multi-Modal GeoSpatial ML Models...,1240,"[[-0.2538863, -0.024011925, 0.6154866, 0.21072..."
4,11,AgroChainThis repository contains the code for...,1004,"[[-0.097173594, -0.009399759, 0.37161306, 0.15..."
...,...,...,...,...
87,72,The-CookbookThe open source cook bookAlso take...,73,"[[-0.13114196, 0.21494645, 0.41242394, 0.08113..."
88,92,"""Public Administration Blockchain Certified Do...",261,"[[0.12544225, -0.0035895961, 0.49058294, 0.055..."
89,92,"""ServiceAgent:innenThis repository contains th...",797,"[[-0.23381959, 0.057331022, 0.5464762, 0.13621..."
90,92,Report.itReport.it supports those who have bee...,75,"[[-0.18504351, -0.1150376, 0.5753982, -0.03014..."


In [None]:
# Predict a sector for each README: the key in `bible` whose description
# embedding is nearest to the README embedding (see closest_sector).
corpus["Output"] = corpus["BERT Embedding"].apply(closest_sector)
corpus

Unnamed: 0,Sector,Text,Length,BERT Embedding,Output
0,11,Awesome Agriculture A curated list of awesome ...,849,"[[-0.065078385, 0.18178034, 0.4143054, 0.24404...",81
1,11,farmOSfarmOS is a web-based application for fa...,215,"[[-0.026097843, 0.124135144, 0.32449847, 0.025...",61
2,11,End Official Support and Maintenance for Tania...,252,"[[-0.10966953, -0.126918, 0.4999071, 0.0299372...",81
3,11,FarmVibes.AI: Multi-Modal GeoSpatial ML Models...,1240,"[[-0.2538863, -0.024011925, 0.6154866, 0.21072...",42
4,11,AgroChainThis repository contains the code for...,1004,"[[-0.097173594, -0.009399759, 0.37161306, 0.15...",42
...,...,...,...,...,...
87,72,The-CookbookThe open source cook bookAlso take...,73,"[[-0.13114196, 0.21494645, 0.41242394, 0.08113...",81
88,92,"""Public Administration Blockchain Certified Do...",261,"[[0.12544225, -0.0035895961, 0.49058294, 0.055...",81
89,92,"""ServiceAgent:innenThis repository contains th...",797,"[[-0.23381959, 0.057331022, 0.5464762, 0.13621...",42
90,92,Report.itReport.it supports those who have bee...,75,"[[-0.18504351, -0.1150376, 0.5753982, -0.03014...",81


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Compare labels as strings so a sector code parsed as a number in one column
# and as text in the other (e.g. 11 vs "11") still counts as the same label.
y_true = corpus['Sector'].astype(str)
y_pred = corpus['Output'].astype(str)

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

# Calculate macro-averaged precision. Sectors that are never predicted have an
# undefined precision; zero_division=0 scores them as 0 explicitly instead of
# emitting an UndefinedMetricWarning.
precision = precision_score(y_true, y_pred, average="macro", zero_division=0)
print("Precision:", precision)

# Calculate macro-averaged recall, with the same explicit handling for labels
# that have no true samples.
recall = recall_score(y_true, y_pred, average="macro", zero_division=0)
print("Recall:", recall)

Accuracy: 0.13043478260869565
Precision: 0.12555555555555556
Recall: 0.12000000000000002


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
