In [1]:
import pickle
import pandas as pd

with open("../data/all_reviews.pkl","rb") as f:
    reviews = pickle.load(f)
    
job_filter = pd.read_csv("../data/filter_job_titles.csv")

job_filters = job_filter.clean_job_title.tolist()

In [2]:
idx = (reviews.language == "en")
reviews = reviews.loc[idx,:]

idx = (reviews.clean_job_title.isin(job_filters))
reviews = reviews.loc[idx,:].reset_index()

print(reviews.shape)
display(reviews.head())

(45251, 16)


Unnamed: 0,index,company_name,review_title,job_title,employee_status,location,date,review,pros,cons,rating,yes_helpful,no_helpful,industry,clean_job_title,language
0,299,Fluor Corp.,Awesome company,Quality Manager,Former Employee,Afghanistan,2019-10-17,Fluor is a great company with people that care...,,,4.0,0,0,construction,quality manager,en
1,1145,Fluor Corp.,Was a much better company in the 90's,Control Systems Designer,Former Employee,"Sugar Land, TX",2018-06-06,Fluor was the first powerhouse EPC that went h...,Nice Location,They have no work,3.0,2,0,construction,control systems designer,en
2,1375,Fluor Corp.,Enjoyed the Work Environment,Senior Software Developer,Current Employee,"Aliso Viejo, CA",2018-03-08,There is not much that I could add here. I en...,Relaxed Environment,None really,5.0,0,1,construction,software developer,en
3,1874,Fluor Corp.,Good place,System Engineer I,Current Employee,"Piketon, OH",2017-09-15,The place can be pretty boring most of the tim...,,,4.0,0,0,construction,system engineer,en
4,2136,Fluor Corp.,Hurry up and wait,Software Developer I,Current Employee,"Piketon, OH",2017-06-26,"Everyone is in a hurry to get things done, but...",Great healthcare,Located in the middle of no-where,3.0,1,0,construction,software developer,en


Basic Cleaning
* Sentance Tokenize
* spacy nlp tokenize
    * Word & Lemmentize
    * Remove punctuation and misc words
* Spell Check
    * Validation

LDA
* Morph
* Bi-Gram & Tri-Gram

Neural network
* Word Vectors

In [161]:
import spacy

nlp = spacy.load("en_core_web_lg")

In [173]:
%%time
from multiprocessing import Pool
from nltk import sent_tokenize
import spacy
import re

wnl = WordNetLemmatizer()

co_reviews = reviews.review.tolist()

stop_words = ['...','....']

def tokenize_lemma_word(sent):
    doc = nlp(sent)
    #lemmatize
    tokens = [ str(word.lemma_).strip() if word.lemma_ != "-PRON-" else str(word).strip() for word in doc ]
    #exclude words with all punctuation
    tokens = [ word.lower() for word in tokens if not( len(word) < 3 and not word.isalnum() ) and word not in stop_words ]
    return tokens
    

def tokenize_all(corpus):
    token_sents = sent_tokenize(corpus.lower())
    token_words = list(map(tokenize_lemma_word, token_sents))
    return token_words

with Pool() as p:
    co_reviews_tokens = list(p.map(tokenize_all, co_reviews))

CPU times: user 777 ms, sys: 382 ms, total: 1.16 s
Wall time: 2min 50s


Check for spelling

In [174]:
import nltk
import string
from nltk.corpus import wordnet

word_list = nltk.corpus.words.words()
word_list = [ word.lower() for word in word_list ]
word_list = set(word_list)
word_list = word_list.union(set( wordnet.words() ))
print(len(list(word_list)))


company_name = reviews.company_name
company_name = list(set(company_name))

for company in company_name:
    company = company.lower()
    word_list = word_list.union(set( company.split(" ") ))

print(len(list(word_list)))

323592
324071


In [175]:
%%time

from collections import Counter

vocab = Counter()
for review in co_reviews_tokens:
    for sent in review:
        for word in sent:
            if word not in word_list:
                vocab[word]+=1
        

CPU times: user 268 ms, sys: 2.41 ms, total: 270 ms
Wall time: 269 ms


In [176]:
print(len(vocab))
vocab.most_common(100)   

16666


[('tcs', 1445),
 ('onsite', 820),
 ('etc', 724),
 ('coworker', 530),
 ('sql', 459),
 ('alot', 291),
 ('mnc', 235),
 ('troubleshooting', 225),
 ('multi', 210),
 ('networking', 209),
 ('hrs', 135),
 ('401k', 129),
 ('admin', 127),
 ('javascript', 121),
 ('qa', 119),
 ('outsourcing', 112),
 ('.net', 111),
 ('hpe', 109),
 ('org', 97),
 ('managment', 88),
 ('upto', 85),
 ('app', 84),
 ('pl', 82),
 ('oppurtunity', 82),
 ('ui', 80),
 ('emc', 77),
 ('teksystem', 77),
 ('techmahindra', 77),
 ('uhg', 77),
 ('jquery', 76),
 ('sdlc', 75),
 ('oppurtunitie', 75),
 ('worklife', 75),
 ('dept', 74),
 ('techm', 73),
 ('scripting', 72),
 ('symantec', 72),
 ('j2ee', 71),
 ('.....', 71),
 ('wfh', 71),
 ('cts', 71),
 ('etl', 70),
 ('db2', 68),
 ('gdit', 68),
 ('asp.net', 67),
 ('pune', 67),
 ('mvc', 66),
 ('xml', 64),
 ('2008', 64),
 ('and/or', 64),
 ('collegue', 62),
 ('css', 61),
 ('exp', 60),
 ('mentoring', 59),
 ('sme', 59),
 ('carrer', 59),
 ('jsp', 58),
 ('c++', 58),
 ('skillset', 58),
 ('enviroment',

In [177]:
from scipy.spatial.distance import cosine

print( cosine( nlp("managment").vector, nlp("management").vector ))
print( cosine( nlp("micromanagement").vector, nlp("micro management").vector ))


0.2943993806838989
0.840567022562027
