In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression

# Data Preprocessing

## Get Data from package

In [3]:
from WorkforceSentimentMonitoring.data import get_data

In [6]:
from WorkforceSentimentMonitoring.preprocessing import preprocessing

In [None]:
# Read only the first 100 rows of the training CSV (relative path).
df = pd.read_csv("../raw_data/train.csv", nrows=100)

In [None]:
# Remove the listed columns before building the text feature.
unused_columns = ['ID', 'Place', 'location', 'date', 'status', 'job_title']
df = df.drop(unused_columns, axis=1)

In [None]:
# Preview the first rows after the column drop.
df.head()

In [None]:
# Join the three review fields into one string per row. A missing value in
# any of the three makes the joined text NaN for that row.
text = df['summary'] + ' ' + df['positives'] + ' ' + df['negatives']

# Attach the joined text, then drop every row that contains a NaN in any
# column, and finally drop exact duplicate rows.
df = df.assign(text=text).dropna(axis=0).drop_duplicates()

df.head()

# Model Training

## Naive Bayes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Turn every review text into a TF-IDF row and materialise the result as a
# dense array.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['text'])
X = tfidf_matrix.toarray()

# Target: score_1 cast to unsigned 8-bit integers.
y = df['score_1'].astype('uint8')

In [None]:
# Fit a multinomial Naive Bayes classifier and report its score on the same
# rows it was trained on (this is not a held-out estimate).
model = MultinomialNB().fit(X, y)
model.score(X, y)

In [None]:
# Reload the first 100 rows of the raw file and drop the listed columns in
# one step.
df = pd.read_csv("../raw_data/train.csv", nrows=100).drop(
    columns=['ID', 'Place', 'location', 'date', 'status', 'job_title', 'score_6']
)

In [None]:
# drop rows with na values in score cols
# score_cols holds the five sub-scores followed by the overall rating.
score_cols = [f'score_{i}' for i in range(1, 6)] + ['overall']

# Keep only rows where every score column is present, then remove duplicates.
df = df.dropna(axis=0, subset=score_cols).drop_duplicates()

In [None]:
# cast score cols as integers
# Cast the score columns by name rather than by dtype. Selecting every float
# column would also pick up non-score columns that happen to be float (for
# example a text column that is entirely NaN) and would skip any score column
# that was already parsed as an integer, leaving mixed dtypes.
# Rows with missing scores were removed by the dropna on score_cols, so the
# cast cannot hit a NaN.
df[score_cols] = df[score_cols].astype('uint8')

In [None]:
# Preview the frame after cleaning and casting.
df.head()

In [None]:
# Shape of the summary column, i.e. how many rows remain.
df.summary.shape

In [None]:
feature_cols = ['summary', 'positives', 'negatives', 'advice_to_mgmt']

# combine all text columns
# Missing values are replaced with empty strings before the cast: converting
# NaN straight to str would inject the literal token "nan" into the combined
# text (and into anything derived from it, such as its length).
df['text_combined'] = (
    df[feature_cols]
    .fillna('')
    .astype('U')
    .agg(' '.join, axis=1)
)

# From here on the combined text is treated as one more text feature.
feature_cols.append('text_combined')

In [None]:
# Dense TF-IDF matrix of the combined review text, with score_1 as target.
vectorizer = TfidfVectorizer()
combined_tfidf = vectorizer.fit_transform(df['text_combined'])
X = combined_tfidf.toarray()
y = df['score_1']

In [None]:
# Train Naive Bayes on the combined-text features.
model = MultinomialNB().fit(X, y)
# append predictions to df
# (these are predictions for the same rows the model was fitted on)
training_predictions = model.predict(X)
df['text_combined_score_1'] = training_predictions

In [None]:
# Score of the fitted model on the rows it was trained on (no held-out data).
model.score(X,y)

In [None]:
# iterate over features and append results to df as new cols

# result_scores[feature][score] -> score of the Naive Bayes model trained on
# that text column for that target, measured on the training rows themselves.
result_scores = {}

for feature in feature_cols:
    # The TF-IDF matrix depends only on the text column, so it is built once
    # per feature instead of refitting the vectorizer for every target.
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df[feature].astype('U'))

    scores_dic = {}

    for score in score_cols:
        y = df[score]
        model = MultinomialNB()
        model.fit(X, y)
        # Store the model's predictions as a new '<feature>_<score>' column.
        df[f'{feature}_{score}'] = model.predict(X)
        scores_dic[f'{score}'] = model.score(X, y)

    result_scores[f'{feature}'] = scores_dic

In [None]:
# Preview the frame with the per-feature prediction columns added.
df.head()

In [None]:
# Nested dict of scores, keyed by feature and then by score column.
result_scores

In [None]:
# iterate over features and append results to df as new cols

# Each TF-IDF matrix depends only on its text column, so fit them all once up
# front instead of refitting a vectorizer for every (score, feature) pair.
tfidf_by_feature = {
    feature: TfidfVectorizer().fit_transform(df[feature].astype('U'))
    for feature in feature_cols
}

# scores_dic[score][feature] -> score of the Naive Bayes model for that
# target and text column, measured on the training rows themselves.
scores_dic = {}
for score in score_cols:
    y = df[score]

    # Use a dedicated name for the inner dict: rebinding `result_scores` here
    # would replace the feature-keyed dict built by the per-feature loop with
    # the inner dict of whichever score happens to be processed last.
    feature_scores = {}
    for feature, X in tfidf_by_feature.items():
        model = MultinomialNB()
        model.fit(X, y)
        df[f'{feature}_{score}'] = model.predict(X)
        feature_scores[f'{feature}'] = model.score(X, y)

    scores_dic[f'{score}'] = feature_scores

In [None]:
# One row per score column, one column per text feature.
scores_df = pd.DataFrame.from_dict(scores_dic, orient='index')

In [None]:
# visualise prediction capability of every text slice

# scores_df has one row per target score and one column per text feature, so
# the chart shows one group of bars per target and one bar per feature.
fig, ax = plt.subplots(figsize=(8, 7))
scores_df.plot(kind='bar', ax=ax)

# Label the figure so it can be read on its own; the values come from
# model.score on the training rows, hence "training accuracy".
ax.set_xlabel('target score')
ax.set_ylabel('training accuracy')
ax.set_title('Naive Bayes accuracy per text feature and target');

# Feature Engineering

In [None]:
# Create a function to get the subjectivity
def getSubjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

# Create a function to get the polarity
def getPolarity(text):
    """Return the polarity component of TextBlob's sentiment for `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

for feature in feature_cols:
    # Analyse each text once and read both scores from the same result,
    # instead of building two separate TextBlob objects per text.
    sentiments = [TextBlob(text).sentiment for text in df[feature].astype('U')]
    df[f'subjectivity_{feature}'] = [s.subjectivity for s in sentiments]
    df[f'polarity_{feature}'] = [s.polarity for s in sentiments]

In [None]:
# Create a function to get the total length of the reviews
def get_lengths(df):
    '''Return a copy of `df` with one character-count column per text column.

    Adds `summary_length`, `positives_length`, `negatives_length`,
    `advice_length` and `combined_length`, computed from the `summary`,
    `positives`, `negatives`, `advice_to_mgmt` and `text_combined` columns.
    Values that are not strings (e.g. NaN) count as length 0.

    The input frame is left untouched; a KeyError is raised if one of the
    source columns is missing.
    '''
    def text_length(value):
        return len(value) if isinstance(value, str) else 0

    # output column -> source column
    length_sources = {
        'summary_length': 'summary',
        'positives_length': 'positives',  # was misspelled 'postives_length'
        'negatives_length': 'negatives',
        'advice_length': 'advice_to_mgmt',
        'combined_length': 'text_combined',
    }
    new_columns = {
        target: df[source].apply(text_length)
        for target, source in length_sources.items()
    }
    # assign() returns a new frame, so the caller's frame is not mutated.
    return df.assign(**new_columns)

# Rebind df to the frame returned by get_lengths so the length columns are
# available to the cells below.
df = get_lengths(df)

In [None]:
# scale new features
# Every column whose name contains 'length' is min-max scaled. MinMaxScaler
# rescales each column independently, so fitting all of them in one call gives
# the same values as fitting one scaler per column.
length_cols = [name for name in df.columns if 'length' in name]
df[length_cols] = MinMaxScaler().fit_transform(df[length_cols])

In [None]:
# select X
# Take every column from position 11 onward as the feature matrix.
# NOTE(review): the offset 11 presumably skips the 4 raw text columns, the 6
# score columns and text_combined — confirm against the CSV layout.
# .copy() gives X its own data: later cells assign new columns into X, and
# doing that on a plain slice of df triggers pandas' SettingWithCopyWarning.
X = df.iloc[:, 11:].copy()

In [None]:
# scale score features
# Predicted-score columns are the ones whose name contains 'score' but not
# 'reg'; each is min-max scaled on its own.
pred_scores_cols = [
    name for name in X.columns
    if 'score' in name and 'reg' not in name
]

for name in pred_scores_cols:
    X[name] = MinMaxScaler().fit_transform(X[name].to_frame())

In [None]:
# linear regression with just the predictions for each model
# Every regression is fitted on the same fixed feature set. Fitting on X while
# appending reg_* columns to it inside the loop feeds each regression the
# outputs of the previous ones; those outputs are linear combinations of the
# features already present, so they only add collinear columns and make the
# fit depend on the order of score_cols. Excluding existing reg_* columns also
# keeps this cell idempotent when it is re-run.
base_features = X.drop(columns=[f'reg_{col}' for col in score_cols], errors='ignore')
for col in score_cols:
    model = LinearRegression()
    model.fit(base_features, df[col])
    reg_output = model.predict(base_features).reshape(-1, 1)
    # Min-max scale the regression output before adding it as a feature.
    X[f'reg_{col}'] = MinMaxScaler().fit_transform(reg_output).ravel()
# classification with scores as targets
# predictions shares X's index so its rows can be matched back to df (rows
# were removed earlier, so df's index is not a plain 0..n-1 range).
predictions = pd.DataFrame(index=X.index)
# pred_scores[target] -> score of the classifier on the training rows.
pred_scores = {}
for target in score_cols:
    model = LogisticRegression(max_iter=1000)
    y = df[target]
    model.fit(X, y)
    predictions[target] = model.predict(X)
    pred_scores[target] = model.score(X, y)
    

In [None]:
# Predicted class per target, one column per score column.
predictions

In [None]:
# Classifier score per target, measured on the training rows.
pred_scores

In [None]:
# Put the target names on the x axis: plotting against range(len(...)) leaves
# the bars at bare integer positions with no indication of which target each
# one belongs to.
fig, ax = plt.subplots()
ax.bar(list(pred_scores.keys()), list(pred_scores.values()), align='center')
ax.set_xlabel('target score')
ax.set_ylabel('training accuracy')
ax.set_title('Logistic regression accuracy per target');

# comment for commit