In [1]:
import pandas as pd
import spacy
from tqdm.notebook import tqdm
import re
tqdm.pandas()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Only `uuid` and `description` are ever read from the descriptions file (the merge
# step selects exactly these two columns), so skip parsing every other column.
df_descriptions = pd.read_csv(
    'data/company/organization_descriptions.csv',
    usecols=['uuid', 'description'],
)
# The organizations file is loaded in full; unneeded columns are dropped after filtering.
df_organizations = pd.read_csv('data/company/organizations.csv')

In [4]:
# Attach the long description to each organization by uuid. This is a left join,
# so organizations without a matching description row are kept with a missing value.
df_subset = df_descriptions.loc[:, ['uuid', 'description']]
merged_df = df_organizations.merge(df_subset, how='left', on='uuid')

In [5]:
def check_for_tags(category_list, tags=None):
    """Return True when any category in a comma-separated string is a wanted tag.

    Parameters
    ----------
    category_list : str or missing
        Comma-separated category names for one organization. Missing values
        (anything ``pd.isnull`` reports as null) never match.
    tags : collection of str, optional
        Lower-cased tag names to look for. When omitted, the notebook-level
        ``ai_tags`` is used, so that name must be defined before the first
        call that relies on the default.

    Returns
    -------
    bool
        True if at least one category matches, otherwise False.
    """
    if pd.isnull(category_list):
        return False
    wanted = ai_tags if tags is None else tags
    # Strip each entry so "a, b" is matched the same way as "a,b".
    return any(tag.strip().lower() in wanted for tag in category_list.split(','))

In [6]:
# Keep only organizations tagged with at least one of these categories.
# The tags are stored lower-cased because check_for_tags lower-cases each
# category before looking it up.
ai_tags = {
    label.lower()
    for label in (
        'Artificial Intelligence (AI)',
        'Generative AI',
        'Intelligent Systems',
        'Machine Learning',
        'Natural Language Processing',
        'Predictive Analytics',
        'Robotic Process Automation (RPA)',
    )
}
filtered = merged_df.loc[merged_df['category_list'].progress_apply(check_for_tags)]

  0%|          | 0/3487774 [00:00<?, ?it/s]

In [7]:
# Display the column names of the tag-filtered frame.
filtered.columns

Index(['uuid', 'name', 'type', 'permalink', 'cb_url', 'rank', 'created_at',
       'updated_at', 'legal_name', 'roles', 'domain', 'homepage_url',
       'country_code', 'state_code', 'region', 'city', 'address',
       'postal_code', 'status', 'short_description', 'category_list',
       'category_groups_list', 'num_funding_rounds', 'total_funding_usd',
       'total_funding', 'total_funding_currency_code', 'founded_on',
       'last_funding_on', 'closed_on', 'employee_count', 'email', 'phone',
       'facebook_url', 'linkedin_url', 'twitter_url', 'logo_url', 'alias1',
       'alias2', 'alias3', 'primary_role', 'num_exits', 'description'],
      dtype='object')

In [8]:
# Remove columns that the remaining visible steps do not read.
filtered = filtered.drop(
    columns=[
        # record metadata
        'type', 'permalink', 'cb_url', 'rank',
        # funding and exits
        'num_funding_rounds', 'total_funding_usd', 'total_funding',
        'total_funding_currency_code', 'last_funding_on', 'num_exits',
        # size and contact details
        'employee_count', 'email', 'phone',
        # web and social links
        'domain', 'homepage_url', 'facebook_url', 'linkedin_url',
        'twitter_url', 'logo_url',
        # alternative names
        'alias1', 'alias2', 'alias3',
        # location
        'country_code', 'state_code', 'region', 'address', 'postal_code',
    ]
)


# Lemmatization

In [9]:
import multiprocessing

# Leave two cores free for the rest of the machine, but never go below one worker.
core_count = max(1, multiprocessing.cpu_count() - 2)
# Small English spaCy pipeline used for lemmatization.
nlp = spacy.load("en_core_web_sm")

In [10]:
#check that every row has at least one of the descriptions
# Keep rows that have at least one of the two description fields.
# .copy() detaches the result from the frame it was filtered from, so adding a
# column below cannot raise SettingWithCopyWarning when rows were removed.
filtered = filtered[
    filtered['description'].notna() | filtered['short_description'].notna()
].copy()
# Join both fields into one lower-cased text column; a missing field contributes
# an empty string, so the '. ' separator is always present.
filtered['Combined_Text'] = (
    filtered['short_description'].fillna('') + '. ' + filtered['description'].fillna('')
).str.lower()

In [11]:
# The combined text column is the input to lemmatization.
descriptions = filtered.loc[:, 'Combined_Text']

In [12]:
# One list of lemma strings per description, in the same order as `descriptions`.
# Only `lemma_` is read from each token, so the dependency parser and the entity
# recognizer are skipped for this pass; the `nlp` object itself is left unchanged.
# NOTE(review): per spaCy's pipeline design notes the English lemmatizer relies on
# the tagger/attribute ruler rather than the parser or NER — confirm for the
# installed model version.
lemmatized_descriptions = [
    [tok.lemma_ for tok in doc]
    for doc in tqdm(
        nlp.pipe(
            descriptions,
            n_process=core_count,
            batch_size=64,
            disable=["parser", "ner"],
        ),
        total=len(descriptions),
    )
]

  0%|          | 0/54837 [00:00<?, ?it/s]

# Clean tokens

In [13]:
# Copy spaCy's English stop-word collection into a set; clean_tokens uses it
# for membership tests when filtering tokens.
from spacy.lang.en.stop_words import STOP_WORDS
stop_words = set(STOP_WORDS)

In [17]:
# Two-letter tokens that are kept even though other tokens of that length are dropped.
# NOTE(review): 'it' is presumably also in spaCy's stop-word list, in which case the
# stop-word filter removes it before this whitelist is consulted — confirm.
_TWO_CHAR_KEEP = frozenset(['ai', 'ip', 'it', 'pc', 'io', 'ar', 'vr', 'ml', 'qc', 'rf', 'ux', 'ui'])
# A token may contain only ASCII letters and hyphens.
_TOKEN_PATTERN = re.compile(r'[a-zA-Z-]+')


def clean_tokens(tokens, stopwords=None):
    """Filter a token list down to the tokens worth keeping.

    A token is kept when all of the following hold:
    * it is not a stop word;
    * it consists solely of ASCII letters and hyphens and contains at least
      one letter (so "-", "--" and "---" are all rejected);
    * it is longer than two characters, or it is one of the whitelisted
      two-letter tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter; order is preserved.
    stopwords : collection of str, optional
        Stop words to remove. When omitted, the notebook-level ``stop_words``
        is used, so that name must exist before a call relying on the default.

    Returns
    -------
    list of str
    """
    if stopwords is None:
        stopwords = stop_words
    kept = []
    for word in tokens:
        if word in stopwords:
            continue
        # fullmatch (unlike match with a trailing '$') also rejects a token that
        # ends in a newline; the strip check rejects hyphen-only tokens.
        if not _TOKEN_PATTERN.fullmatch(word) or not word.strip('-'):
            continue
        if len(word) > 2 or word in _TWO_CHAR_KEEP:
            kept.append(word)
    return kept

# Some tokens carry a stray hyphen (e.g. "ml-"); only the "ml" part is wanted.
# Hyphenated words are likewise split into their separate parts.
def split_hyphenated_tokens(tokens):
    """Break every hyphen-containing token into its non-empty pieces.

    Tokens without a hyphen pass through unchanged. A token such as "ml-"
    yields just "ml", and a token made only of hyphens yields nothing.
    Order is preserved.
    """
    def pieces_of(token):
        if "-" not in token:
            return [token]
        return [piece for piece in token.split("-") if piece]

    return [piece for token in tokens for piece in pieces_of(token)]

In [18]:
# Split hyphenated lemmas first, then filter, so each part of a hyphenated
# word is judged on its own by clean_tokens.
dehyphenated_tokens = list(map(split_hyphenated_tokens, lemmatized_descriptions))
cleaned_tokens = list(map(clean_tokens, dehyphenated_tokens))

In [19]:
# Store the cleaned token lists as a new column, one list per row.
# No further token removal is done here.
# NOTE(review): the original note read "no need to remove low tokens" —
# presumably low-frequency tokens; confirm.
filtered['Lemmatized_Tokens'] = cleaned_tokens

In [20]:
# Two-word phrases, as (first, second) tuples, that detect_bigrams joins into a
# single "first_second" token.
# Per the original note, this list was copied from the one used for patents.
# NOTE(review): the entries look like lemmas (e.g. 'datum', 'learn'), so they
# presumably need to match the lemmatizer's output — confirm.
bigrams_set = {('time', 'series'),
 ('sensor', 'datum'),
 ('computer', 'implement'),
 ('base', 'station'),
 ('following', 'step'),
 ('autonomous', 'driving'),
 ('information', 'processing'),
 ('driver', 'assistance'),
 ('feature', 'extraction'),
 ('voice', 'command'),
 ('image', 'datum'),
 ('method', 'base'),
 ('augment', 'reality'),
 ('control', 'method'),
 ('self', 'adaptive'),
 ('detection', 'method'),
 ('readable', 'medium'),
 ('loss', 'function'),
 ('follow', 'step'),
 ('pre', 'train'),
 ('recognition', 'method'),
 ('technical', 'field'),
 ('processor', 'configure'),
 ('network', 'model'),
 ('face', 'recognition'),
 ('aerial', 'vehicle'),
 ('language', 'processing'),
 ('attention', 'mechanism'),
 ('disclose', 'method'),
 ('medical', 'image'),
 ('capture', 'image'),
 ('deep', 'learning'),
 ('method', 'system'),
 ('processing', 'unit'),
 ('threshold', 'value'),
 ('processing', 'device'),
 ('control', 'unit'),
 ('electronic', 'device'),
 ('random', 'forest'),
 ('human', 'body'),
 ('base', 'deep'),
 ('video', 'frame'),
 ('motor', 'vehicle'),
 ('artificial', 'intelligence'),
 ('computer', 'program'),
 ('readable', 'storage'),
 ('generative', 'adversarial'),
 ('obstacle', 'avoidance'),
 ('identification', 'method'),
 ('model', 'train'),
 ('license', 'plate'),
 ('prior', 'art'),
 ('large', 'scale'),
 ('particle', 'swarm'),
 ('computer', 'vision'),
 ('belong', 'technical'),
 ('control', 'system'),
 ('transitory', 'computer'),
 ('energy', 'consumption'),
 ('learn', 'model'),
 ('region', 'interest'),
 ('problem', 'solve'),
 ('voice', 'recognition'),
 ('natural', 'language'),
 ('early', 'warning'),
 ('power', 'consumption'),
 ('support', 'vector'),
 ('processing', 'method'),
 ('wireless', 'communication'),
 ('bounding', 'box'),
 ('image', 'processing'),
 ('train', 'neural'),
 ('big', 'datum'),
 ('audio', 'signal'),
 ('user', 'terminal'),
 ('neural', 'network'),
 ('feature', 'point'),
 ('processing', 'apparatus'),
 ('anomaly', 'detection'),
 ('vector', 'machine'),
 ('power', 'supply'),
 ('intelligence', 'ai'),
 ('prediction', 'model'),
 ('content', 'item'),
 ('method', 'far'),
 ('model', 'obtain'),
 ('autonomous', 'vehicle'),
 ('feature', 'map'),
 ('comprise', 'step'),
 ('magnetic', 'resonance'),
 ('speech', 'recognition'),
 ('main', 'body'),
 ('objective', 'function'),
 ('train', 'machine'),
 ('artificial', 'neural'),
 ('recognition', 'result'),
 ('image', 'capture'),
 ('real', 'time'),
 ('nucleic', 'acid'),
 ('mobile', 'terminal'),
 ('face', 'image'),
 ('drawing', 'fig'),
 ('storage', 'medium'),
 ('learning', 'algorithm'),
 ('relate', 'technical'),
 ('non', 'transitory'),
 ('fuel', 'cell'),
 ('program', 'product'),
 ('embodiment', 'invention'),
 ('compute', 'device'),
 ('character', 'string'),
 ('mobile', 'device'),
 ('path', 'planning'),
 ('management', 'system'),
 ('unmanned', 'aerial'),
 ('acquisition', 'unit'),
 ('apparatus', 'method'),
 ('extract', 'feature'),
 ('datum', 'set'),
 ('device', 'storage'),
 ('fuzzy', 'logic'),
 ('short', 'term'),
 ('fault', 'diagnosis'),
 ('genetic', 'algorithm'),
 ('convolutional', 'neural'),
 ('human', 'face'),
 ('improve', 'accuracy'),
 ('equipment', 'storage'),
 ('light', 'source'),
 ('user', 'interface'),
 ('time', 'period'),
 ('target', 'object'),
 ('drawing', 'figure'),
 ('point', 'cloud'),
 ('super', 'resolution'),
 ('deep', 'neural'),
 ('wind', 'turbine'),
 ('decision', 'tree'),
 ('determination', 'unit'),
 ('system', 'method'),
 ('feature', 'vector'),
 ('web', 'page'),
 ('electronic', 'equipment'),
 ('unit', 'configure'),
 ('computer', 'readable'),
 ('recording', 'medium'),
 ('machine', 'learning'),
 ('high', 'resolution'),
 ('key', 'point'),
 ('method', 'device'),
 ('training', 'datum'),
 ('mobile', 'robot'),
 ('learning', 'model'),
 ('air', 'conditioner'),
 ('recurrent', 'neural'),
 ('knowledge', 'graph'),
 ('machine', 'learn'),
 ('reinforcement', 'learning'),
 ('virtual', 'reality'),
 ('control', 'device'),
 ('remote', 'sense'),
 ('blood', 'vessel'),
 ('internet', 'thing'),
 ('system', 'comprise'),
 ('training', 'sample'),
 ('parking', 'space'),
 ('input', 'datum'),
 ('training', 'set')}

In [21]:
def detect_bigrams(text, bigrams_set):
    """Join adjacent tokens that form a known bigram into one "a_b" token.

    The token list is scanned left to right. Whenever a token and its
    successor are in ``bigrams_set`` the joined form is emitted. The successor
    is then not emitted on its own, but it may still start a bigram with the
    token after it, so ["a", "b", "c"] with bigrams (a, b) and (b, c) yields
    ["a_b", "b_c"].

    Parameters
    ----------
    text : list of str
        Tokens of one document, in order.
    bigrams_set : set of (str, str)
        Token pairs to join.

    Returns
    -------
    list of str
        A new list; ``text`` is not modified.
    """
    merged = []
    # True while the current token has already been used as the tail of a bigram.
    consumed = False
    last = len(text) - 1
    for pos, token in enumerate(text):
        if pos < last and (token, text[pos + 1]) in bigrams_set:
            merged.append(f"{token}_{text[pos + 1]}")
            consumed = True
        elif consumed:
            # Tail of the previous bigram that starts no bigram itself: drop it.
            consumed = False
        else:
            merged.append(token)
    return merged

In [23]:
# Join known two-word phrases in every row's token list.
filtered['Bigrams_Tokens'] = (
    filtered['Lemmatized_Tokens']
    .progress_apply(lambda token_list: detect_bigrams(token_list, bigrams_set))
)

  0%|          | 0/54837 [00:00<?, ?it/s]

In [26]:
# Founding year: prefer founded_on, fall back to created_at, and keep the first
# four characters of the value.
# NOTE(review): taking four characters assumes both columns start with a
# four-digit year — confirm against the raw data.
founded_raw = filtered['founded_on'].fillna(filtered['created_at'])
# Without the mask, a row missing both dates would be stored as the literal
# text "nan"; keep it as a missing value instead.
filtered['founded'] = founded_raw.astype(str).str[:4].where(founded_raw.notna())
# The token-list columns are written to the CSV as text, so anything reading
# this file has to parse them back into lists.
filtered.to_csv('data/company/Cleaned_Companies.csv',index=False)