In [1]:
import pandas as pd
import re,string
import nltk
from patsy import dmatrices
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
import matplotlib.pyplot as plt
import warnings
%pylab inline
import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances

Populating the interactive namespace from numpy and matplotlib


In [4]:
ibm_data = pd.read_csv('ibm.csv')

In [15]:
ibm_data.head()

Unnamed: 0,Title,Date Written,Rating,Current/Former,Job Title,Location,Recommendation?,Outlook,Main Text,Pros,Cons,Advice to management,pros_clean
0,"""Offering Manager""",no-date,5.0,Current Employee,Offering Manager,"Westbury, NY",Recommends,Neutral Outlook,I have been working at IBM full-time (More tha...,IBM is good to their employees,Lacks strategy in certain areas of the business,,ibm is good to their employees
1,"""Senior Software Engineer""",no-date,4.0,Former Employee,Anonymous Employee,,,,I worked at IBM full-time,laid back atmosphere with great work life balance,upper management pushes out middle management ...,,laid back atmosphere with great work life balance
2,"""Great Company, excellent management""",no-date,4.0,Former Employee,Anonymous Employee,,Recommends,Positive Outlook,I worked at IBM full-time,Lot of empowerment. I was given really good wo...,"As per the public news, IBM had yearly layoffs...",,lot of empowerment i was given really good wo...
3,"""IBM""","Nov 4, 2018",5.0,Former Employee,Various,"Austin, TX",Recommends,Neutral Outlook,I worked at IBM full-time (More than 10 years),"Interesting work, opportunities to work on div...",Not sure if pay is industry-competitive,Review salary structure to ensure industry com...,interesting work opportunities to work on div...
4,"""Not amenable""",no-date,5.0,Former Employee,Anonymous Employee,,Doesn't Recommend,Neutral Outlook,No opinion of CEO,Tools were always cutting edge,would have been nice to know evaluation criteria,Get to know your employees who actually are in...,tools were always cutting edge


### Step 1: pros/cons data pre-processing

In [18]:
#clean punctuation and white spaces
ibm_data["pros_clean"] = ibm_data.Pros.apply(lambda x:re.sub(r'[^\w\s]', ' ', str(x).lower()))
ibm_data["pros_clean"] = ibm_data.pros_clean.apply(lambda x:re.sub(r'(\\n+)', ' ', x))
ibm_data["pros_clean"] = ibm_data.pros_clean.apply(lambda x:re.sub(r'^b[\'\"\'"\"''\s]', '', x.lower()))
ibm_data["pros_clean"] = ibm_data.pros_clean.apply(lambda x:re.sub(r'\s+', ' ', x))

ibm_data["cons_clean"] = ibm_data.Cons.apply(lambda x:re.sub(r'[^\w\s]', ' ', str(x).lower()))
ibm_data["cons_clean"] = ibm_data.cons_clean.apply(lambda x:re.sub(r'(\\n+)', ' ', x))
ibm_data["cons_clean"] = ibm_data.cons_clean.apply(lambda x:re.sub(r'^b[\'\"\'"\"''\s]', '', x.lower()))
ibm_data["cons_clean"] = ibm_data.cons_clean.apply(lambda x:re.sub(r'\s+', ' ', x))

#tokenizing and removing stop words
stop = set(stopwords.words('english'))
punc = string.punctuation
ibm_data['pros_clean'] = ibm_data.pros_clean.apply(lambda x: [word for word in word_tokenize(x) if word not in stop])
ibm_data['pros_clean'] = ibm_data['pros_clean'].apply(lambda x: [word for word in x if word not in punc])
ibm_data['cons_clean'] = ibm_data.cons_clean.apply(lambda x: [word for word in word_tokenize(x) if word not in stop])
ibm_data['cons_clean'] = ibm_data['cons_clean'].apply(lambda x: [word for word in x if word not in punc])

ibm_data.head()

Unnamed: 0,Title,Date Written,Rating,Current/Former,Job Title,Location,Recommendation?,Outlook,Main Text,Pros,Cons,Advice to management,pros_clean,cons_clean
0,"""Offering Manager""",no-date,5.0,Current Employee,Offering Manager,"Westbury, NY",Recommends,Neutral Outlook,I have been working at IBM full-time (More tha...,IBM is good to their employees,Lacks strategy in certain areas of the business,,"[ibm, good, employees]","[lacks, strategy, certain, areas, business]"
1,"""Senior Software Engineer""",no-date,4.0,Former Employee,Anonymous Employee,,,,I worked at IBM full-time,laid back atmosphere with great work life balance,upper management pushes out middle management ...,,"[laid, back, atmosphere, great, work, life, ba...","[upper, management, pushes, middle, management..."
2,"""Great Company, excellent management""",no-date,4.0,Former Employee,Anonymous Employee,,Recommends,Positive Outlook,I worked at IBM full-time,Lot of empowerment. I was given really good wo...,"As per the public news, IBM had yearly layoffs...",,"[lot, empowerment, given, really, good, work, ...","[per, public, news, ibm, yearly, layoffs, bit,..."
3,"""IBM""","Nov 4, 2018",5.0,Former Employee,Various,"Austin, TX",Recommends,Neutral Outlook,I worked at IBM full-time (More than 10 years),"Interesting work, opportunities to work on div...",Not sure if pay is industry-competitive,Review salary structure to ensure industry com...,"[interesting, work, opportunities, work, diver...","[sure, pay, industry, competitive]"
4,"""Not amenable""",no-date,5.0,Former Employee,Anonymous Employee,,Doesn't Recommend,Neutral Outlook,No opinion of CEO,Tools were always cutting edge,would have been nice to know evaluation criteria,Get to know your employees who actually are in...,"[tools, always, cutting, edge]","[would, nice, know, evaluation, criteria]"


### Step 2: get pros/cons attributes

In [19]:
#get POS
get_pros = ibm_data['pros_clean'].apply(lambda x: [word for word in nltk.pos_tag(x)])
get_cons = ibm_data['cons_clean'].apply(lambda x: [word for word in nltk.pos_tag(x)])

pro_adj = []
con_adj = []

def get_adj(empty_list,pos_data):
    for review in pos_data:
        for word,pos in review:
            if pos == 'JJ' or pos == 'JJR' or pos == 'JJS': # if the POS-tag is adjective
                empty_list.append(word)
get_adj(pro_adj,get_pros)
get_adj(con_adj,get_cons)

In [20]:
from collections import Counter
Counter(pro_adj).most_common(30)

[('good', 2891),
 ('great', 2788),
 ('flexible', 685),
 ('many', 630),
 ('new', 470),
 ('ibm', 425),
 ('smart', 418),
 ('large', 348),
 ('nice', 333),
 ('different', 316),
 ('big', 291),
 ('excellent', 279),
 ('best', 250),
 ('global', 227),
 ('salary', 210),
 ('decent', 182),
 ('strong', 178),
 ('technical', 174),
 ('competitive', 171),
 ('professional', 153),
 ('able', 148),
 ('interesting', 146),
 ('available', 142),
 ('innovative', 135),
 ('high', 135),
 ('hard', 133),
 ('easy', 124),
 ('long', 110),
 ('willing', 108),
 ('right', 96)]

In [21]:
Counter(con_adj).most_common(30)

[('many', 883),
 ('much', 504),
 ('good', 490),
 ('big', 451),
 ('large', 445),
 ('low', 434),
 ('salary', 399),
 ('new', 392),
 ('ibm', 390),
 ('difficult', 385),
 ('constant', 372),
 ('long', 356),
 ('poor', 338),
 ('hard', 331),
 ('little', 304),
 ('slow', 294),
 ('high', 240),
 ('great', 238),
 ('bad', 230),
 ('top', 208),
 ('old', 199),
 ('internal', 182),
 ('due', 176),
 ('upper', 168),
 ('senior', 166),
 ('corporate', 154),
 ('competitive', 152),
 ('bureaucratic', 134),
 ('different', 125),
 ('huge', 123)]

In [22]:
#manually took some attributes from the top frequent pros words
pros_attribute = ['great','good','happy','nice','decent','excellent','best','ethical','strong','flexible','new',
                  'easy','friendly','positive','different','professional','high','solid','corporate',
                  'smart','stable','large']

In [23]:
#manually took some attributes from the top frequent cons words
cons_attribute = ['low','little','hard','difficult','long','poor','limited','bad','slow','terrible','conservative'
                 ,'horrible','different','less','senior','political','bureaucratic']

### Step 3: lemmentize -> get replacement

In [24]:
#create a function that would return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
         return wordnet.ADV
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN
    
wnl = WordNetLemmatizer()

#Lemmentization
def wn_pos(filtered_pos,empty_list):
    for word,pos in filtered_pos:
        empty_list.append(wnl.lemmatize(word,get_wordnet_pos(pos)))
        #print pos
        #print get_wordnet_pos(pos)
    return empty_list

In [25]:
pros_lem = []
cons_lem = []

def get_lem(emptylist, pos_data):
    for review in pos_data:
        for word,pos in review:
            wn_pos(review,emptylist)
get_lem(pros_lem, get_pros)
get_lem(cons_lem,get_cons)

In [26]:
pros_words = nltk.FreqDist(pros_lem)
cons_words = nltk.FreqDist(cons_lem)

rslt_unique_pros = pd.DataFrame.from_dict(pros_words,orient='index').reset_index()
rslt_unique_pros.columns = ['word','frequency']

rslt_unique_cons = pd.DataFrame.from_dict(cons_words,orient='index').reset_index()
rslt_unique_cons.columns = ['word','frequency']

In [27]:
replacement = {'work_life_balance':['time','life','balance','sabbatical','sabbaticals','focus','hour','day','health','flexible','week'
                                    ,'vacation','schedule','overtime'],
                'culture_value':['people','culture','team','care','value','product','coworkers','atmosphere','competitive'
                                ,'family','collaboration','respect','community','colleague','supportive','vision','diversity'],
                'career_oppotunity':['opportunity','learn','industry','career','license','training','train','growth','grow'
                                    ,'level','position','development','advancement','advance','study','build','skill','resource'
                                    ,'education','potential'],
                'company_benefit':['company','benefit','pay','financial','financially','provide','salary','bonus','offer'
                                  ,'401k','package','stock','compensation','invest','investment','money','performance','reward'
                                  ,'retirement','promote','insurance'],
                'senior_management':['place','environment','management','help','manager','experience','match'
                                                ,'plan','office','support','location','leadership','treat','helpful','senior'
                                                ,'manage','leader','communication']}

In [28]:
def getKeysByValue(dictOfElements, valueToFind):
    for k,v  in dictOfElements.items():
        if valueToFind in v:
            return(k)
    return  valueToFind

def replace_attributes(s):
    return([getKeysByValue(replacement,y) for y in s])

In [29]:
ibm_data['pros_replace'] = ibm_data['pros_clean'].map(replace_attributes)
ibm_data['cons_replace'] = ibm_data['cons_clean'].map(replace_attributes)

### Step 4: Lift Score

In [30]:
def ratio(x,y):
    if x==0:
        return float(y)
    if y==0:
        return float(x)
    return(float(x)*float(y))

def get_lift(a,b,tokenized_data):
    '''Function to calculate lift scores given any two words from a list of tokenized words'''
    if (a==b):
        return 1
    p_a = len([i for i in tokenized_data if a in i])
    p_b = len([i for i in tokenized_data if (b in i)])
    p_a_b = len([i for i in tokenized_data if a in i if b in i])
    n = len(tokenized_data)
    return float(float(n)*float(p_a_b)/ratio(p_a,p_b))

In [31]:
pros_list = replacement.keys()
lift_score = [get_lift(x,y,ibm_data.pros_replace) for x in pros_attribute for y in pros_list]
formatted_lift_score = [ round(elem,2) for elem in lift_score ]

In [32]:
pd.DataFrame(reshape(formatted_lift_score,(len(pros_attribute),len(pros_list))),index =pros_attribute , columns = pros_list )

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
great,1.43,1.15,1.36,0.99,1.22
good,1.08,1.3,1.22,1.33,0.91
happy,1.46,0.82,1.47,1.56,1.14
nice,1.55,1.01,1.53,1.05,0.65
decent,0.95,2.12,0.92,1.42,0.86
excellent,1.31,1.29,1.28,1.35,1.4
best,1.29,1.11,1.35,0.83,1.51
ethical,1.69,1.9,1.49,0.26,1.76
strong,1.45,1.08,1.23,0.72,1.18
flexible,0.0,0.0,0.0,0.0,0.0


In [33]:
cons_list = replacement.keys()
lift_score = [get_lift(x,y,ibm_data.cons_replace) for x in cons_attribute for y in cons_list]
formatted_lift_score = [ round(elem,2) for elem in lift_score ]

In [34]:
pd.DataFrame(reshape(formatted_lift_score,(len(cons_attribute),len(cons_list))),index =cons_attribute , columns = cons_list )

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
low,1.1,2.17,1.06,0.89,1.34
little,1.11,1.23,1.3,1.69,1.73
hard,1.29,1.28,0.98,0.99,1.18
difficult,1.13,1.06,0.92,1.23,1.12
long,1.31,0.95,1.09,2.12,1.26
poor,1.31,1.41,2.27,0.96,1.61
limited,0.82,1.19,1.11,1.22,2.99
bad,1.17,1.09,1.61,1.05,0.66
slow,0.65,0.91,0.81,0.66,1.48
terrible,2.15,1.59,2.19,1.09,1.6


### Lift for mission Statement

    for Pros

In [35]:
ibm_values = ['inclusive', 'diversity', 'women', 'think', 'client', 'innovation']

In [36]:
mission_lift = [get_lift(x,y,ibm_data.pros_replace) for x in ibm_values for y in pros_list]
formatted_lift_score = [ round(elem,2) for elem in mission_lift ]

In [37]:
pd.DataFrame(reshape(formatted_lift_score,(len(ibm_values),len(pros_list))),index =ibm_values , columns = pros_list )

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
inclusive,1.6,0.9,2.02,0.0,1.05
diversity,0.0,0.0,0.0,0.0,0.0
women,1.21,1.37,1.25,1.35,0.87
think,0.67,1.03,1.05,1.02,0.44
client,1.4,0.85,1.45,1.24,1.54
innovation,1.22,1.11,1.02,0.99,1.09


    for Cons

In [38]:
mission_lift = [get_lift(x,y,ibm_data.cons_replace) for x in ibm_values for y in cons_list]
formatted_lift_score = [ round(elem,2) for elem in mission_lift ]

In [39]:
pd.DataFrame(reshape(formatted_lift_score,(len(ibm_values),len(cons_list))),index =ibm_values , columns = cons_list )

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
inclusive,0.0,0.0,0.0,0.0,0.0
diversity,0.0,0.0,0.0,0.0,0.0
women,1.34,2.21,3.58,0.61,2.85
think,0.65,0.81,0.71,0.89,0.77
client,1.74,1.1,1.49,1.45,1.55
innovation,1.46,1.23,1.06,1.35,1.34
