In [1]:
from time import sleep
from functools import cache
from keybert import KeyBERT
import numpy as np

# Shared KeyBERT instance (sentence-transformer 'all-MiniLM-L6-v2');
# create_embeddings() calls it to turn text into embedding vectors.
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

@cache
def create_embeddings(texts):
    """Return the document embedding of ``texts`` as a NumPy array.

    Results are memoized with ``functools.cache``, so ``texts`` must be
    hashable (this notebook passes a single string) and a repeated call with
    the same text reuses the stored result.

    NOTE: a later cell defines ``create_embeddings`` again (returning a list);
    once that cell has run, it shadows this definition.
    """
    # extract_embeddings returns a (document embeddings, word embeddings) pair;
    # only the vector of the first document is kept.
    document_vectors, _word_vectors = kw_model.extract_embeddings(texts)
    first_vector = document_vectors[0]
    return np.array(first_vector)

In [2]:
import pandas as pd

# Load the sector table and embed each sector's "definition + description" text.
df = pd.read_csv('NAICS descriptions - Sheet1.csv')

embeds = [
    {
        'label': sector_name,
        'definition': sector_definition,
        'embed': create_embeddings(f'{sector_definition} {sector_description}'),
    }
    for sector_name, sector_definition, sector_description in zip(
        df['Sector'], df['Definition'], df['Descriptions']
    )
]

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

@cache
def create_embeddings(texts):
    """Return the document embedding of ``texts`` as a plain Python list.

    This definition replaces the array-returning ``create_embeddings`` from
    the first cell. Results are memoized with ``functools.cache``, so
    ``texts`` must be hashable (this notebook passes a single string).
    """
    # extract_embeddings returns a (document embeddings, word embeddings) pair;
    # only the vector of the first document is kept.
    document_vectors, _word_vectors = kw_model.extract_embeddings(texts)
    first_vector = document_vectors[0]
    return list(first_vector)

def get_similarity(text, *, with_labels=False):
    """Score ``text`` against every reference embedding stored in ``embeds``.

    Args:
        text: Input to compare; it is converted with ``str()`` before being
            embedded by ``create_embeddings``.
        with_labels: When False (the default) only the scores are returned,
            exactly as before. When True, ``(score, label)`` pairs are
            returned so each score can be traced back to its ``embeds`` entry.

    Returns:
        A list of cosine similarities sorted from highest to lowest (or the
        matching ``(score, label)`` pairs). Because of the sort, position
        ``k`` is the k-th best match, not a fixed entry of ``embeds``.
    """
    # Reshape the query once instead of on every loop iteration;
    # cosine_similarity works on 2-D (n_samples, n_features) inputs.
    query = np.array(create_embeddings(str(text))).reshape(1, -1)

    scored = []
    for item in embeds:
        reference = np.array(item['embed']).reshape(1, -1)
        score = cosine_similarity(query, reference)[0][0]
        scored.append((score, item['label']))

    # Sort on the score only, so labels are never compared with each other.
    scored.sort(key=lambda pair: pair[0], reverse=True)

    if with_labels:
        return scored
    return [score for score, _label in scored]

# Example usage: score an arbitrary string against all reference embeddings
# and display the resulting list (sorted from highest to lowest similarity).
similarities = get_similarity("some text")
similarities

[0.1948235,
 0.16777931,
 0.119437814,
 0.11784527,
 0.11614431,
 0.107605174,
 0.10450492,
 0.09588417,
 0.0911116,
 0.08241604,
 0.058494505,
 0.05304778,
 0.04806675,
 0.047230512,
 0.04700174,
 0.046165932,
 0.04087612,
 0.03145825,
 0.018338094,
 -0.009339392]

In [9]:
import pandas as pd
from ast import literal_eval

# Build one feature row per row of combined_df.csv: the raw text, its label,
# and similarity scores / embeddings for the readme, description and topics.
df = pd.read_csv('combined_df.csv')
new_rows = []

# Walk the needed columns in lockstep (one positional pass) instead of doing a
# label lookup df[col][i] per cell, which only works with the default index.
row_fields = zip(
    df['readme'],
    df['description'],
    df['topic (search)'],
    df['topics'],
    df['NAICS Code'],
)

for i, (raw_readme, raw_description, raw_search_topic, raw_topics, label) in enumerate(row_fields):
    # NOTE(review): str() turns a missing cell (NaN) into the text 'nan', which
    # is then embedded like real content -- confirm that this is intended.
    readme = str(raw_readme)
    description = str(raw_description)
    our_topic = str(raw_search_topic)

    topics_text = str(raw_topics)
    try:
        # 'topics' is expected to hold a stringified list of topic names.
        topic = literal_eval(topics_text)
        # Drop the topic that was used as the search term (first occurrence only).
        if our_topic in topic:
            topic.remove(our_topic)
        topic = ' '.join(topic)
    except (ValueError, SyntaxError, TypeError, AttributeError, MemoryError, RecursionError):
        # Not a parseable list of strings: fall back to the raw cell text.
        topic = topics_text
    if not topic:
        # Placeholder so there is always some text to embed.
        topic = 'not sure'

    new_row = {
        'readme': readme,
        'description': description,
        'label': label,
        'similarity_readme': get_similarity(readme),
        'similarity_description': get_similarity(description),
        'similarity_topic': get_similarity(topic),
        'embedding_readme': create_embeddings(readme),
        'embedding_description': create_embeddings(description),
        'embedding_topic': create_embeddings(topic),
    }
    new_rows.append(new_row)

    if i % 10 == 0:
        print(f'Finished {i} rows')

# Create a new DataFrame from the processed rows and persist it.
combined_df = pd.DataFrame(new_rows)
combined_df.to_csv('combined_df_similarity.csv', index=False)

# Reload the file to display the data exactly as it was written to disk.
updated_df = pd.read_csv('combined_df_similarity.csv')
updated_df


Finished 0 rows
Finished 10 rows
Finished 20 rows
Finished 30 rows
Finished 40 rows
Finished 50 rows
Finished 60 rows
Finished 70 rows
Finished 80 rows
Finished 90 rows
Finished 100 rows
Finished 110 rows
Finished 120 rows
Finished 130 rows
Finished 140 rows
Finished 150 rows
Finished 160 rows
Finished 170 rows
Finished 180 rows
Finished 190 rows
Finished 200 rows
Finished 210 rows
Finished 220 rows
Finished 230 rows
Finished 240 rows
Finished 250 rows
Finished 260 rows
Finished 270 rows
Finished 280 rows
Finished 290 rows
Finished 300 rows
Finished 310 rows
Finished 320 rows
Finished 330 rows
Finished 340 rows
Finished 350 rows
Finished 360 rows
Finished 370 rows
Finished 380 rows
Finished 390 rows
Finished 400 rows
Finished 410 rows
Finished 420 rows
Finished 430 rows
Finished 440 rows
Finished 450 rows
Finished 460 rows
Finished 470 rows
Finished 480 rows
Finished 490 rows
Finished 500 rows
Finished 510 rows
Finished 520 rows
Finished 530 rows
Finished 540 rows
Finished 550 rows
Fin

Unnamed: 0,readme,description,label,similarity_readme,similarity_description,similarity_topic,embedding_readme,embedding_description,embedding_topic
0,"<div align=""center"">\n <a href=""https://erp...",Free and Open Source Enterprise Resource Plann...,['44-45'],"[0.12052955, 0.108674966, 0.073804, 0.06869083...","[0.2809217, 0.26929814, 0.19942525, 0.19053647...","[0.3541449, 0.34394047, 0.29954726, 0.28433025...","[-0.060707223, 0.091015, -0.010843867, 0.04487...","[0.02610234, 0.03241634, 0.006426494, 0.048791...","[0.024646971, 0.031236287, -0.04333933, -0.038..."
1,# Shopizer 3.X (for java 1.8 +) (tested with J...,Shopizer java e-commerce software,['44-45'],"[0.21249312, 0.18798687, 0.14135134, 0.0707777...","[0.4488267, 0.44185627, 0.221044, 0.18593454, ...","[0.36261475, 0.35541975, 0.13932334, 0.1278604...","[-0.104230836, 0.09036086, 0.012103572, -0.010...","[-0.062524214, 0.062162645, -0.0022274256, -0....","[-0.0051593306, -0.024232175, 0.0337664, -0.05..."
2,# Forecasting Best Practices \n\nTime series f...,Time Series Forecasting Best Practices & Examples,['44-45'],"[0.16568339, 0.14886661, 0.12105149, 0.1134930...","[0.15020902, 0.12504116, 0.124677494, 0.113965...","[0.10577319, 0.09851004, 0.085823834, 0.084867...","[-0.14719902, 0.010995603, 0.03816145, 0.03593...","[-0.0812891, -0.039686285, 0.026748434, 0.0407...","[-0.10528991, -0.07301311, 0.07933727, 0.05377..."
3,## Retail Store on Blockchain\n\n### About\n\n...,:moneybag: Retail Store that runs on Ethereum,['44-45'],"[0.4094513, 0.3126638, 0.21346328, 0.19123784,...","[0.45975772, 0.34134144, 0.1912406, 0.15142886...","[0.23647232, 0.17545521, 0.13778466, 0.1353202...","[-0.09359249, 0.012356067, -0.10742976, -0.012...","[-0.028405713, 0.06454651, -0.048357155, -0.01...","[-0.07883304, 0.046956103, -0.015917461, -0.02..."
4,# OpenPapyrus\n\nwww.petroglif.ru\n\n# En\n\n#...,"Sophisticated ERP, CRM, Point-Of-Sale, etc. Op...","['44-45', '42']","[0.3330735, 0.32281908, 0.2978766, 0.2922843, ...","[0.31231993, 0.304128, 0.29571977, 0.2730767, ...","[0.38862348, 0.35221565, 0.3496197, 0.32557008...","[-0.014680124, -0.049634546, -0.07810932, -0.0...","[-0.015994947, -0.044863973, -0.083498314, -0....","[-0.036111325, 0.008490675, -0.038374376, -0.0..."
...,...,...,...,...,...,...,...,...,...
1226,# Twitter-Sentiment-Analysis--Canadian-Electio...,Sentiment Analysis is a branch of Natural Lang...,['81'],"[0.16163963, 0.10233914, 0.098673806, 0.092921...","[0.20804363, 0.12965678, 0.107865855, 0.101284...","[0.08486624, 0.05039034, 0.043075796, 0.040792...","[-0.014767096, 0.04164609, 9.1588816e-05, 0.00...","[0.0009896636, 0.04627126, -0.020407116, 0.010...","[-0.10550112, -0.054189634, -0.038963694, 0.01..."
1227,# Sandstorm Specification\n\nSandstorm is an o...,Specification for an Organizational Tool for P...,['81'],"[0.1529609, 0.1518515, 0.15101181, 0.14724407,...","[0.24986243, 0.1873894, 0.18518254, 0.1755682,...","[0.08486624, 0.05039034, 0.043075796, 0.040792...","[-0.041574333, -0.045421753, -0.042214658, -0....","[0.0023887642, -0.025760842, -0.084524415, -0....","[-0.10550112, -0.054189634, -0.038963694, 0.01..."
1228,"{'message': 'Not Found', 'documentation_url': ...",Old Wordpress Theme for a Football Political o...,['81'],"[0.06247566, 0.027515128, 0.02158877, 0.016611...","[0.12736084, 0.054715466, 0.05178021, 0.020437...","[0.08486624, 0.05039034, 0.043075796, 0.040792...","[-0.0012574658, 0.030167898, 0.008204357, 0.02...","[-0.026684655, 0.042238247, -0.069243334, -0.0...","[-0.10550112, -0.054189634, -0.038963694, 0.01..."
1229,# Election Database Schema Design\nPerformance...,Performance of some analytics on real data fro...,['81'],"[0.18357465, 0.16322222, 0.13101378, 0.0801412...","[0.20498925, 0.19824922, 0.16590615, 0.0967155...","[0.13442394, 0.11035281, 0.107117526, 0.106915...","[0.020250533, -0.031431314, -0.04382569, 0.016...","[-0.03256391, -0.06967075, -0.071044855, 0.023...","[0.07205384, -0.023424562, -0.07713283, 0.0764..."


In [10]:
import pandas as pd

# Read the similarity features straight from disk, so this cell does not
# depend on a `df` variable left over from an earlier cell.
df = pd.read_csv('combined_df_similarity.csv')

# Keep only rows without missing values. .copy() makes finished_df an
# independent frame, so later in-place edits cannot touch `df` or be treated
# by pandas as writes to a slice of it.
finished_df = df.dropna().copy()
finished_df.to_csv('finished_df.csv', index=False)

In [11]:
# Inspect the stored value for row label 0: after the CSV round trip the
# similarity list comes back as a string, not a list.
finished_df.loc[0, 'similarity_description']

'[0.2809217, 0.26929814, 0.19942525, 0.19053647, 0.18638256, 0.17618419, 0.1681377, 0.15766785, 0.15634477, 0.15571195, 0.15307021, 0.14751685, 0.1442731, 0.13702013, 0.12868296, 0.12403647, 0.10612276, 0.1027708, 0.09513998, 0.09496352]'

In [12]:
import ast

# Map each sector code to its row position in the sector table; that position
# is the slot the sector occupies in the 0/1 label vector.
sector_codes = pd.read_csv('NAICS descriptions - Sheet1.csv')['Sector']
convert = {code: position for position, code in sector_codes.items()}

# Vector length follows the sector table instead of a hard-coded 20.
n_sectors = len(sector_codes)


def _encode_label(raw_label):
    """Turn a stringified list of sector codes into a 0/1 vector of length n_sectors."""
    vector = [0] * n_sectors
    for code in ast.literal_eval(raw_label):
        # '0' entries are skipped -- presumably a "no sector" placeholder; confirm.
        if code == '0':
            continue
        vector[convert[code]] = 1
    return vector


# Encode the whole column in one pass and rebind finished_df, rather than
# writing list values cell-by-cell into the frame while iterating over it.
finished_df = finished_df.assign(label=finished_df['label'].map(_encode_label))

finished_df.to_csv('keyBert_finished_df.csv', index=False)