In [1]:
import pandas as pd
import numpy as np
import pickle

from sentence_transformers import SentenceTransformer

from tqdm import tqdm

In [2]:
# load data
df_startups = pd.read_json('database_json/startups.json')

In [3]:
# df_startups.head()

In [4]:
# df_startups.info()

In [5]:
df_startups['solutions_products_services']

0       <p>The Acquire App offers an Immersive Experie...
1       <p>They offer two products such as DegrAid pro...
2       <p>Affective Markets calculates the success po...
3       <p>Afresh uses AI to produce recommendations i...
4       <p>AiDock brings the actual revolution to digi...
                              ...                        
9891    <p>What’s your target market? (Choose more tha...
9892                                                     
9893    <p>1Lorem ipsum dolor sit amet, consectetuer a...
9894    <p><strong>Company Description</strong></p><p>...
9895    <p><strong>Company Description</strong></p><p>...
Name: solutions_products_services, Length: 9896, dtype: object

In [6]:
# separate out desired info
df_descriptions = df_startups[['id', 'solutions_products_services', 'company_legal_name', 'account_type']]

In [7]:
df_descriptions.head()

Unnamed: 0,id,solutions_products_services,company_legal_name,account_type
0,197,<p>The Acquire App offers an Immersive Experie...,Acquire App,DEMO
1,198,<p>They offer two products such as DegrAid pro...,Polymateria Limited,DEMO
2,199,<p>Affective Markets calculates the success po...,Affective Markets,DEMO
3,200,<p>Afresh uses AI to produce recommendations i...,Afresh Technologies Inc.,DEMO
4,201,<p>AiDock brings the actual revolution to digi...,Aidock Ltd.,DEMO


In [8]:
# clean - remove html tags
def remove_format_tags(text):
    text = text.replace('<p>', '')
    text = text.replace('</p>', '')
    text = text.replace('<strong>', '')
    text = text.replace('</strong>', '')
    
    return text

df_descriptions['solutions_products_services'] = df_descriptions['solutions_products_services'].apply(remove_format_tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_descriptions['solutions_products_services'] = df_descriptions['solutions_products_services'].apply(remove_format_tags)


In [9]:
# check cleaning worked as expected
df_descriptions.head()

Unnamed: 0,id,solutions_products_services,company_legal_name,account_type
0,197,The Acquire App offers an Immersive Experience...,Acquire App,DEMO
1,198,They offer two products such as DegrAid produc...,Polymateria Limited,DEMO
2,199,Affective Markets calculates the success poten...,Affective Markets,DEMO
3,200,Afresh uses AI to produce recommendations in a...,Afresh Technologies Inc.,DEMO
4,201,AiDock brings the actual revolution to digital...,Aidock Ltd.,DEMO


In [10]:
# load desired model
roberta_model = SentenceTransformer('paraphrase-distilroberta-base-v1');
all_mpnet_v2_model = SentenceTransformer('all-mpnet-base-v2');

In [11]:
# dictionary to store ids, names, and embeddings (from both models)
temp_emb_dict = {'id_': [], 'company_name': [], 'account_type': [], 'roberta_emb': [], 'mpnet2_emb': []}

def create_embeddings(row):
    temp_emb_dict['id_'].append(row[0])
    temp_emb_dict['company_name'].append(row[2])
    temp_emb_dict['account_type'].append(row[3])
    temp_emb_dict['roberta_emb'].append(roberta_model.encode(row[1]))
    temp_emb_dict['mpnet2_emb'].append(all_mpnet_v2_model.encode(row[1]))

In [12]:
# to show progress
tqdm.pandas()

# create embedding dictionaries
df_descriptions.progress_apply(create_embeddings, axis='columns')

100%|██████████| 9896/9896 [34:54<00:00,  4.73it/s]  


0       None
1       None
2       None
3       None
4       None
        ... 
9891    None
9892    None
9893    None
9894    None
9895    None
Length: 9896, dtype: object

In [13]:
df_emb = pd.DataFrame(temp_emb_dict)

In [14]:
df_emb

Unnamed: 0,id_,company_name,account_type,roberta_emb,mpnet2_emb
0,197,Acquire App,DEMO,"[0.014017829, 0.20147103, -0.10118152, -0.0632...","[0.057079125, -0.008974648, -0.037592165, -0.0..."
1,198,Polymateria Limited,DEMO,"[0.035784442, 0.44326508, -0.07529494, 0.04603...","[0.080428354, 0.02010855, -0.02703527, -0.0144..."
2,199,Affective Markets,DEMO,"[-0.06682719, 0.4243962, -0.018392464, -0.1590...","[0.001699026, 0.037716478, -0.058805294, -0.02..."
3,200,Afresh Technologies Inc.,DEMO,"[0.09948933, -0.1278632, 0.01722331, 0.1215965...","[-0.018926082, 0.034481186, -0.054234214, -0.0..."
4,201,Aidock Ltd.,DEMO,"[0.23271781, 0.5445238, 0.039308317, -0.327170...","[-0.01293812, 0.065178424, -0.04608697, -0.010..."
...,...,...,...,...,...
9891,10120,Test qa,INCOMPLETE,"[-0.037475403, 0.13656099, -0.054801382, 0.667...","[0.06226277, -0.008425264, -0.031678557, -0.04..."
9892,10121,Lamb Strong Inc,INCOMPLETE,"[0.40873337, -0.28501305, 1.0132746, -0.123601...","[-0.012503377, 0.061438844, -0.0067345053, 0.0..."
9893,10122,Test qa 551,INCOMPLETE,"[-0.069904484, 0.16863246, 0.11011889, 0.15451...","[-0.017052792, -0.0037189487, -0.015801629, 0...."
9894,10123,Test ut,STANDARD,"[0.20380749, 0.1713905, 0.28696936, -0.6777866...","[-0.040910114, -0.026950652, -0.0412945, 0.071..."


In [15]:
# df_emb.to_pickle('embeddings.pkl')

In [16]:
# test = pd.read_pickle('embeddings.pkl')
# test.head()