In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Ignore every warning category for the rest of the session.
# NOTE(review): this also hides any deprecation warnings emitted by the
# pandas / seaborn calls in later cells.
import warnings; 
warnings.simplefilter('ignore')

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold

import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# Load the raw reviews; malformed rows are reported and skipped instead of
# aborting the load. `error_bad_lines` was deprecated in pandas 1.3 and removed
# in 2.0 in favour of `on_bad_lines`, so try the current keyword first and fall
# back to the legacy one on installations that do not know it yet.
REVIEWS_CSV = "../input/employee-reviews/employee_reviews.csv"
try:
    df_review = pd.read_csv(REVIEWS_CSV, sep=',', on_bad_lines='warn')
except TypeError:  # pandas < 1.3: unexpected keyword argument 'on_bad_lines'
    df_review = pd.read_csv(REVIEWS_CSV, sep=',', error_bad_lines=False)

## Data Cleaning

In [None]:
# Turn every cell holding the literal string 'none' into a missing value (NaN).
df_review = df_review.replace('none', np.nan)

In [None]:
# The column holds one date per review, so give it a singular name.
df_review = df_review.rename(columns={'dates': 'date'})

In [None]:
# Independent (deep) copy for the ratings analysis; df_review is reused later
# for the text analysis.
df_copy = df_review.copy(deep=True)

In [None]:
# Keep only the rows that have a value in every column.
df = df_copy.dropna(axis=0, how='any')

In [None]:
# Preview the first row of the frame left after dropping incomplete rows.
df.head(1)

In [None]:
# Column dtypes and non-null counts of the frame left after dropping incomplete rows.
df.info()

In [None]:
# Parse the review dates into datetime64 values. `Series.astype` has no
# `inplace` option (current pandas raises TypeError on the stray keyword) and
# pandas 2.x rejects the unit-less `np.datetime64` dtype, so use the dedicated
# parser. Unparseable strings still raise, as they did with `astype`.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Cast every star-rating column to float64. The first assignment used to write
# its result to a new, misspelled 'overallratings' column, so 'overall-ratings'
# itself was never converted, unlike its five sibling columns, even though the
# indicator cell below compares it numerically.
RATING_COLUMNS = [
    'overall-ratings',
    'work-balance-stars',
    'culture-values-stars',
    'carrer-opportunities-stars',
    'comp-benefit-stars',
    'senior-mangemnet-stars',
]
for rating_col in RATING_COLUMNS:
    df[rating_col] = df[rating_col].astype(np.float64)

# Legacy alias: kept only so that code relying on the misspelled column still works.
df['overallratings'] = df['overall-ratings']

In [None]:
# Binary (1/0) indicator columns derived from the raw fields.

# 1 when the job title contains the marker 'Current', otherwise 0.
df['is_current_employee'] = df['job-title'].map(lambda title: int('Current' in title))

# A rating is flagged as "high" when it is strictly greater than 3.
high_rating_flags = [
    ('is_high_Overall', 'overall-ratings'),
    ('is_high_worbalance', 'work-balance-stars'),
    ('is_high_culturevalue', 'culture-values-stars'),
    ('is_high_careeropp', 'carrer-opportunities-stars'),
    ('is_high_compbenefit', 'comp-benefit-stars'),
    ('is_high_srmngmt', 'senior-mangemnet-stars'),
]
for flag_col, rating_col in high_rating_flags:
    df[flag_col] = df[rating_col].map(lambda stars: int(stars > 3))

In [None]:
# Box plot of the overall rating per company, split by the is_current_employee
# flag. `factorplot` was renamed to `catplot` in seaborn 0.9 and has since been
# removed; the arguments are unchanged.
sns.catplot(x='overall-ratings', y='company', hue='is_current_employee',
            data=df, kind='box', aspect=2)

#### Conclusion: The graph shows that the overall rating is best for Google & Facebook and worst for Netflix. Both current and former employees have given good ratings to these two companies. Looking only at the reviews given by current employees of these two companies, almost all ratings fall between 3 and 5, and most of them between 4 and 5. As the outlier dots show, very few employees have given a rating of 1 or 2.

In [None]:
# Box plot of the work-balance rating per company, split by the
# is_current_employee flag. `factorplot` was renamed to `catplot` in seaborn 0.9
# and has since been removed; the arguments are unchanged.
sns.catplot(x='work-balance-stars', y='company', hue='is_current_employee',
            data=df, kind='box', aspect=2)

#### Conclusion: The graph shows that work-life balance is best at Google and worst at Amazon, so candidates who are looking for a job and value a good work-life balance should consider options other than Amazon.

In [None]:
# Box plot of the culture-values rating per company, split by the
# is_current_employee flag. `factorplot` was renamed to `catplot` in seaborn 0.9
# and has since been removed; the arguments are unchanged.
sns.catplot(x='culture-values-stars', y='company', hue='is_current_employee',
            data=df, kind='box', aspect=2)

#### Conclusion: The graph shows that cultural values are best at Google and Facebook and worst at Amazon and Netflix, so candidates who are looking for a job and value a good workplace culture should consider options other than Amazon & Netflix.

In [None]:
# Box plot of the career-opportunities rating per company, split by the
# is_current_employee flag. `factorplot` was renamed to `catplot` in seaborn 0.9
# and has since been removed; the arguments are unchanged.
sns.catplot(x='carrer-opportunities-stars', y='company', hue='is_current_employee',
            data=df, kind='box', aspect=2)

#### Conclusion: The graph shows that career opportunities are best at Facebook & Microsoft and worst at Netflix, so candidates who are looking for a job and want great career opportunities should consider options other than Netflix.

In [None]:
# Box plot of the compensation/benefits rating per company, split by the
# is_current_employee flag. `factorplot` was renamed to `catplot` in seaborn 0.9
# and has since been removed; the arguments are unchanged.
sns.catplot(x='comp-benefit-stars', y='company', hue='is_current_employee',
            data=df, kind='box', aspect=2)

#### Conclusion: The graph shows that the compensation benefits are the best at Google & Facebook and worst for Apple.

In [None]:
# Box plot of the senior-management rating per company, split by the
# is_current_employee flag. `factorplot` was renamed to `catplot` in seaborn 0.9
# and has since been removed; the arguments are unchanged.
sns.catplot(x='senior-mangemnet-stars', y='company', hue='is_current_employee',
            data=df, kind='box', aspect=2)

#### Conclusion: The graph shows that the senior management is the best at Facebook and worst at Amazon followed by Netflix.

## Apart from worklife balance, culture values of organization, career opportunities, senior management, compensation benefits, what is it that the employees find appealing?

In [None]:
import re
# Natural Language Tool Kit 
import nltk  
# Fetch the NLTK data used by the cells below: stop words (clean), WordNet
# (lemmatizer), Punkt (word_tokenize) and the perceptron tagger (pos_tag).
# The tagger download used to be commented out, so `pos_tag` raised LookupError
# on any machine where that model was not already installed.
# NOTE(review): 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the ids
# newer NLTK releases look for; an id unknown to the installed index is only
# reported by `nltk.download` (it returns False), it does not raise.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)
# to remove stopword 
from nltk.corpus import stopwords 

# for Stemming propose  
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [None]:
# Concatenate the three free-text sections into one review string.
# Missing sections are treated as empty text: with plain `+`, a single NaN
# (e.g. a review without advice-to-mgmt) turned the whole result into NaN, and
# the dropna on 'review' that follows then discarded the pros/cons text too.
review_text = (
    df_review["pros"].fillna('') + ' '
    + df_review["cons"].fillna('') + ' '
    + df_review["advice-to-mgmt"].fillna('')
)
# Rows with no text in any section stay NaN so the subsequent dropna still
# removes them; rows with all three sections are unchanged.
df_review["review"] = review_text.where(review_text.str.strip() != '')

In [None]:
# Discard the rows whose combined review text is missing.
df_review = df_review.dropna(subset=['review'])

###### Defining method to remove non-alphabetic characters, convert the text to lowercase and remove stopwords

In [None]:
# English stop words. `sw` stays a module-level list as before; it is looked up
# through `nltk.corpus` instead of the bare name `stopwords` because a later
# cell rebinds `stopwords` to the word-cloud stop-word set, after which
# `stopwords.words(...)` fails if this cell is run again.
sw = nltk.corpus.stopwords.words('english')
# Frozen copy for O(1) membership tests instead of scanning the list per token.
_SW_LOOKUP = frozenset(sw)


def clean(text):
    """Normalise raw review text for word counting.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased, split on whitespace, and English stop words are dropped.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # str.split() never yields empty strings, so no separate length filter is needed.
    return ' '.join(token for token in tokens if token not in _SW_LOOKUP)

###### Defining method to map a POS tag to the corresponding WordNet part of speech

In [None]:
def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into a WordNet part-of-speech constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag falls back to
    wordnet.NOUN.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tag: treat the word as a noun.
    return wordnet.NOUN

###### Defining method to lemmatize text

In [None]:
# Shared lemmatizer instance. The unused duplicate `sw` assignment that stood
# here was dropped: nothing in this cell reads it (the clean cell defines its
# own), and it made the cell fail on re-run once `stopwords` has been rebound
# by the word-cloud cell.
lemmatizer = WordNetLemmatizer()


def lemmatize(text):
    """Lemmatize every token of `text` using its part-of-speech tag.

    The text is tokenized with nltk.word_tokenize, tagged with pos_tag, and
    each token is lemmatized with the WordNet POS that get_wordnet_pos derives
    from its tag.

    Parameters
    ----------
    text : str
        Text to lemmatize.

    Returns
    -------
    str
        The lemmas joined by single spaces.
    """
    tokens = nltk.word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged_tokens]
    return ' '.join(lemmas)

###### Sample clean and lemmatize review of first row

In [None]:
# Sanity check: cleaned text of the first review.
clean(df_review.iloc[0].review)

In [None]:
# Sanity check: lemmatized text of the first review. Note this runs on the raw
# review string, not on the output of clean().
lemmatize(df_review.iloc[0].review)

###### Cleaning and lemmatizing review column of dataframe 

In [None]:
# Cleaned version of every review (see clean()).
df_review['review_clean'] = df_review['review'].apply(clean)

In [None]:
# Lemmatized version of every cleaned review (see lemmatize()).
df_review['review_lemmatize'] = df_review['review_clean'].apply(lemmatize)

In [None]:
# Column dtypes and non-null counts, including the new review text columns.
df_review.info()

#### Importing Wordcloud package to draw wordcloud

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Words excluded from the word clouds: WordCloud's built-in list, a hand-picked
# list of extra words, and the company names themselves.
# NOTE(review): this rebinds `stopwords`, which up to here referred to the
# nltk.corpus stop-word reader imported earlier; cells that call
# `stopwords.words(...)` cannot be re-run after this one without re-importing it.
extras = ["great","team","work","company","place","good","people","employee","none","make","one","go",
          "day","call","new","come","think","happen","within","look","store","retail","feel",
          "life","sometime","environment","move","keep","still","review","group","year","role",
          "want","try","office","create","look","even","level","many","thing","much","even",
          "hour","year","always","every","things","project","product","need","time","give",
          "take","never"]
companies = list(df_review.company.unique())
stopwords = set(STOPWORDS).union(extras, companies)

###### defining method to generate wordclouds for each company

In [None]:
def wordclouds(df_review,companies):
    """Draw one word cloud per company from its cleaned review text.

    For each entry of `companies`, the `review_clean` values of the matching
    rows of `df_review` are joined into a single string, rendered with
    WordCloud (using the module-level `stopwords`, collocations disabled) and
    shown as a matplotlib figure titled with the upper-cased company name.
    """
    for name in companies:
        is_company = df_review["company"] == name
        corpus = " ".join(map(str, df_review.loc[is_company, "review_clean"]))
        # Build the cloud for this company's text.
        cloud = WordCloud(stopwords = stopwords, collocations = False).generate(corpus)
        # Render it without axes, one figure per company.
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis("off")
        plt.title(name.upper())
        plt.show()

###### calling wordclouds method. Prints wordcloud for each company 

In [None]:
# Render one word cloud per company found in df_review.
wordclouds(df_review,companies)


## Model training


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

### We take most used 2000 words 
# Bag-of-words vectorizer capped at 2000 features.
cv = CountVectorizer(max_features=2000)

# NOTE(review): `result` is not defined in any cell visible in this notebook,
# so unless it comes from elsewhere this line raises NameError on a fresh
# kernel. Confirm which text collection was meant to be vectorized.
# The sparse count matrix is converted to a dense array.
X = cv.fit_transform(result).toarray()
X

In [None]:
# Target vector: the values of the second column of `comment`.
# NOTE(review): `comment` is not defined in any cell visible in this notebook,
# so unless it comes from elsewhere this line raises NameError on a fresh
# kernel. Confirm which frame / label column was intended.
y = comment.iloc[:,1].values
y

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

# Hold out 20% of the samples for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Fit a Gaussian naive Bayes classifier and predict the held-out samples.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Confusion matrix of true vs. predicted labels, displayed as the cell output.
cm = confusion_matrix(y_test, y_pred)
cm