# NLP model train
Train NLP model from company descriptions

## Sections
1. we clean the data using regular expression, and tokenize it into sentences
2. we feed the sentences into the Word2Vec model, train it and save the model

## Input
1. Company short and long description with countries.csv

## Output
1. Trained Word2Vec model: liren_model_better.bin

v1.0: Liren SONG, Oxford, Dec 17 2021

In [1]:
import pandas as pd
import nltk
import re
import json
from gensim.models import Word2Vec
import multiprocessing

In [4]:
# read in data
df = pd.read_csv('Company short and long description with countries.csv').copy(deep=True)
# only keep the USA data for easy processing
# todo: other languages support. Note that the tokenizing process will be different
df_description = df[(df["country_code"] == 'USA')]['description']
df_description.shape[0]

326177

In [5]:
# before tokenizing: long string of all the description appended to one another
description_string = str(list(df_description))
description_string[0:500]

"['CrossRoads Shooting Sports is a full-service facility with a core focus on safe handling, sport shooting, and personal protection. Their key features include a full-service Indoor Shooting Range and Training Center with a Simulation system. The participants have the opportunity to enhance their skills, knowledge, and fluency through interactive events and activities. It also has a full-service retail supporting this core philosophy is also available. It was founded in 2014 and is based in John"

In [6]:
# after tokenizing: split into sentence strings
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
description_sentences = tokenizer.tokenize(description_string)
description_sentences[0:5]

["['CrossRoads Shooting Sports is a full-service facility with a core focus on safe handling, sport shooting, and personal protection.",
 'Their key features include a full-service Indoor Shooting Range and Training Center with a Simulation system.',
 'The participants have the opportunity to enhance their skills, knowledge, and fluency through interactive events and activities.',
 'It also has a full-service retail supporting this core philosophy is also available.',
 'It was founded in 2014 and is based in Johnston, Iowa.']

In [7]:
# clean the sentence tokens
def clean_sentences(text):
    """Make text lowercase, remove punctuation and remove words containing numbers."""
    text = re.sub(r'[^\w]', ' ', text)  # clear all things except underscore and alphanumeric
    text = re.sub(" \d+", " ", text)  # clear all digits
    text = text.lower()  # lower all text

    # replace webpage
    # todo: this part can be further fine tuned
    patterns = ['http', 'www', 'com']
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text

In [8]:
cleaned_sentences = [clean_sentences(line) for line in description_sentences]
cleaned_sentences[0:10]

['  crossroads shooting sports is a full service facility with a core focus on safe handling  sport shooting  and personal protection ',
 'their key features include a full service indoor shooting range and training center with a simulation system ',
 'the participants have the opportunity to enhance their skills  knowledge  and fluency through interactive events and activities ',
 'it also has a full service retail supporting this core philosophy is also available ',
 'it was founded in  and is based in johnston  iowa ',
 '    clinicnote is an electronic medical record used by university therapy teaching programs and private practice therapy clinics ',
 '    men s style lab is an online concierge subscription clothing helping men look their best with great clothes that fit both life and style ',
 'it was founded in  and is headquartered in des moines  iowa ',
 '    everstream is a cleveland based network service provider bringing fiber based ethernet  internet and data center solution

In [15]:
# after second tokenizer: sentences splits into sentences of words
# in the format of [[word, word, .....], [word, word, ...], ....]
sentences_of_words = []
for line in cleaned_sentences:
    # tokenizer
    tokens = re.findall(r'\b\w+\b', line)
    if len(tokens) > 1:
        sentences_of_words.append(tokens)
sentences_of_words[0:100]

[['crossroads',
  'shooting',
  'sports',
  'is',
  'a',
  'full',
  'service',
  'facility',
  'with',
  'a',
  'core',
  'focus',
  'on',
  'safe',
  'handling',
  'sport',
  'shooting',
  'and',
  'personal',
  'protection'],
 ['their',
  'key',
  'features',
  'include',
  'a',
  'full',
  'service',
  'indoor',
  'shooting',
  'range',
  'and',
  'training',
  'center',
  'with',
  'a',
  'simulation',
  'system'],
 ['the',
  'participants',
  'have',
  'the',
  'opportunity',
  'to',
  'enhance',
  'their',
  'skills',
  'knowledge',
  'and',
  'fluency',
  'through',
  'interactive',
  'events',
  'and',
  'activities'],
 ['it',
  'also',
  'has',
  'a',
  'full',
  'service',
  'retail',
  'supporting',
  'this',
  'core',
  'philosophy',
  'is',
  'also',
  'available'],
 ['it',
  'was',
  'founded',
  'in',
  'and',
  'is',
  'based',
  'in',
  'johnston',
  'iowa'],
 ['clinicnote',
  'is',
  'an',
  'electronic',
  'medical',
  'record',
  'used',
  'by',
  'university',
  '

[['crossroads',
  'shooting',
  'sports',
  'is',
  'a',
  'full',
  'service',
  'facility',
  'with',
  'a',
  'core',
  'focus',
  'on',
  'safe',
  'handling',
  'sport',
  'shooting',
  'and',
  'personal',
  'protection'],
 ['their',
  'key',
  'features',
  'include',
  'a',
  'full',
  'service',
  'indoor',
  'shooting',
  'range',
  'and',
  'training',
  'center',
  'with',
  'a',
  'simulation',
  'system'],
 ['the',
  'participants',
  'have',
  'the',
  'opportunity',
  'to',
  'enhance',
  'their',
  'skills',
  'knowledge',
  'and',
  'fluency',
  'through',
  'interactive',
  'events',
  'and',
  'activities'],
 ['it',
  'also',
  'has',
  'a',
  'full',
  'service',
  'retail',
  'supporting',
  'this',
  'core',
  'philosophy',
  'is',
  'also',
  'available'],
 ['it',
  'was',
  'founded',
  'in',
  'and',
  'is',
  'based',
  'in',
  'johnston',
  'iowa'],
 ['clinicnote',
  'is',
  'an',
  'electronic',
  'medical',
  'record',
  'used',
  'by',
  'university',
  '

In [17]:
# dump the tokenized file to json
with open("sentences_of_words.json", 'w') as f:
    # indent=2 is not needed but makes the file human-readable
    json.dump(sentences_of_words, f, indent=2)



In [None]:
# you can check the json file if needed
def check_json_file():
    with open("sentences_of_words.json", 'r') as f:
        sentences_of_words = json.load(f)
    print(sentences_of_words[0:100])

# check_json_file()

In [18]:
model = Word2Vec(
    sentences_of_words,
    min_count=5,  # Ignore words that appear less than this
    vector_size=50,  # Dimensionality of word embeddings
    window=5,  # Context window for words during training
    workers=multiprocessing.cpu_count(), # use all cpus, currently gpu support is not ideal
    epochs=30)  # Number of epochs

In [19]:
# try the model with word google
model.wv.most_similar('google')

[('facebook', 0.8936492204666138),
 ('amazon', 0.8696936964988708),
 ('linkedin', 0.858043909072876),
 ('youtube', 0.8467017412185669),
 ('yahoo', 0.8358187675476074),
 ('twitter', 0.8342317342758179),
 ('bing', 0.829176664352417),
 ('slack', 0.8091875314712524),
 ('gmail', 0.8056207895278931),
 ('dropbox', 0.8017454147338867)]

In [None]:
# save the model into binary file
model.wv.save_word2vec_format('liren_model_better.bin', binary=True)