In [9]:
import pandas as pd
import json
import re
import numpy as np

from sklearn.externals import joblib
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import string
import spacy

In [10]:
# Render up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

In [68]:
# Sample emails used to exercise the feature-extraction cells below.
# Each message is a single string literal; the trailing backslashes are line
# continuations inside the literal, so the physical line breaks here are NOT
# part of the text (only the explicit \n escapes are).
test_string1 = "Hi all -- \n\n Here is today's demo group schedule! It's a little different than last week. \
You'll each spend 2 hrs in a room watching demos as a group. You'll spend the other 2 hours out at the desks (\
fun fact: this room is called Ocean; now you know) in small groups of 4-5, where you can deep \
dive each other's demos / projects and give feedback. Let me know if anything is weird on the schedule -- your \
name should be on there once during each time block: 2 hrs in a large group, and 2 hrs in a small group.\
\n\n I'll post a copy of the schedule on the doors to Boa and Party as well! \
See you then. Remember to eat lunch. \n\n Katie"

test_string2 = "Hi Fellows, \n\n The start of the session is two weeks away! Things are about to get real. \n\n \
We created a handbook that previews the learning environment you'll experience (and help create!) as an Insight Fellow. \
It includes advice from members of our alumni community that we encourage you to review as you prepare for the first week of the program. \n\n \
We're excited to meet you all in person! \n\n Thanks, \n Katie"

# The emails are processed in this order by the loops that iterate over test_emails.
test_emails = [test_string1, test_string2]

# Sender and recipients of each message, index-aligned with test_emails:
# one sender string and one list of recipient names per email.
# Both sample emails are signed by Katie, so the sender is listed once per
# email to keep the per-email lists the same length.
senders_list = ['Katie Hawkes', 'Katie Hawkes']
receiver_list = [['Ben Regner', 'April Swagman', 'DS-SV'], ['DS-SV']]

In [12]:
# Load the English spaCy pipeline used for part-of-speech tagging.
# The 'en' shortcut link only exists in spaCy 2.x; spaCy 3 removed shortcut
# links and raises OSError for it, so fall back to the full package name.
# NOTE(review): confirm that 'en_core_web_sm' is the model the 'en' shortcut
# pointed to when the stored cluster vectors were built; a different model
# could tag some tokens differently.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

In [69]:
# Per-email surface statistics. Every list is index-aligned with test_emails.
caps = [sum(map(str.isupper, email)) for email in test_emails]
total_msg_lens = [len(email) for email in test_emails]
periods = [email.count('.') for email in test_emails]
commas = [email.count(',') for email in test_emails]
ex_marks = [email.count('!') for email in test_emails]
q_marks = [email.count('?') for email in test_emails]

# spaCy parses and NLTK word tokens, computed on the text with its original
# casing (no lowercasing is applied).
pos = [nlp(email) for email in test_emails]
word_tokens = [word_tokenize(email) for email in test_emails]

In [70]:
# Number of non-punctuation tokens in each email.
# `punct` is a plain string, so `in` is a substring test: a token is dropped
# only when it occurs verbatim inside string.punctuation (e.g. '.' or ',').
# A multi-character token such as '--' does not, and is therefore counted.
punct = string.punctuation
word_counts = [
    sum(1 for token in tokens if token not in punct)
    for tokens in word_tokens
]

In [71]:
# Assemble the per-email surface statistics into one table (one row per email).
#
# The column order is pinned explicitly because this frame is later converted
# to a plain array and compared position-by-position with stored cluster
# vectors. Without `columns=` the order depends on the pandas/Python version
# (older pandas sorts dict keys, newer pandas keeps insertion order). The
# alphabetical order used here matches the table output recorded in this
# notebook.
text_feature_columns = [
    'caps', 'commas', 'ex_marks', 'msg_len', 'periods', 'q_marks', 'word_count',
]
text_features = pd.DataFrame(
    {
        'caps':caps,
        'msg_len':total_msg_lens,
        'periods':periods,
        'commas':commas,
        'ex_marks':ex_marks,
        'q_marks':q_marks,
        'word_count':word_counts
    },
    columns=text_feature_columns,
)

Add POS information.

In [72]:
from collections import defaultdict


def _tally_pos_tags(doc):
    """Return a mapping of coarse POS tag (token.pos_) to its count in `doc`."""
    tally = defaultdict(int)
    for token in doc:
        tally[token.pos_] += 1
    return tally


# One tag -> count mapping per parsed email, in the same order as `pos`.
pos_counts = [_tally_pos_tags(doc) for doc in pos]

In [73]:
# One row per email, one column per lower-cased POS tag; a tag that does not
# occur in an email gets a count of 0.
# Columns are sorted so the layout is deterministic instead of depending on the
# pandas version or on the order in which tags were first seen. Alphabetical
# order matches the table output recorded in this notebook.
# NOTE(review): the set of columns depends on which tags occur in the emails
# being processed, so it can differ from the tag set of another corpus.
df_pos_counts = pd.DataFrame(pos_counts).fillna(0)
df_pos_counts.columns = df_pos_counts.columns.str.lower()
df_pos_counts = df_pos_counts.sort_index(axis=1)

In [84]:
# Inspect which POS tag columns were produced for these emails.
df_pos_counts.columns

Unnamed: 0,adj,adp,adv,cconj,det,intj,noun,num,part,pron,propn,punct,space,sym,verb
0,11,16,8,3,15,1,30,6,3,8,4,17,3,2.0,25
1,5,10,1,1,8,1,15,1,3,8,4,10,5,0.0,15


In [74]:
# Append the POS count columns to the right of the surface statistics.
# NOTE: text_features is rebuilt from itself, so running this cell a second
# time without re-creating text_features appends the POS columns again.
text_features = pd.concat([text_features, df_pos_counts], axis = 1)

Get number of sentences and average length of sentence in message.

In [75]:
# Translation table that deletes every character of string.punctuation.
punct_remover = str.maketrans('', '', string.punctuation)

num_sentences = []
sentence_len_mean = []
for email in test_emails:
    sentences = sent_tokenize(email)
    num_sentences.append(len(sentences))

    # Sentence length = number of pieces obtained by splitting the
    # punctuation-free, stripped sentence on single spaces.
    words_per_sentence = [
        len(sentence.translate(punct_remover).strip().split(' '))
        for sentence in sentences
    ]
    # An email without any sentence gets a mean length of 0.
    sentence_len_mean.append(np.mean(words_per_sentence) if words_per_sentence else 0)

text_features['num_sentences'] = num_sentences
text_features['len_sentence_mean'] = sentence_len_mean

Count how many recipients each email has.

In [76]:
# Number of addressees of each email, in the same order as receiver_list.
num_recipients = list(map(len, receiver_list))

text_features['num_recipients'] = num_recipients

Guess the gender of senders and recipients (if an email has multiple recipients, use 'group' as the gender, because any gender effects will likely be washed out).

In [78]:
import gender_guesser.detector as gender

# Case-insensitive first-name -> gender lookup.
d = gender.Detector(case_sensitive = False)


def _first_name(full_name):
    """Return the part of a 'First Last' style name before the first space."""
    return full_name.split(' ', 1)[0]


# Gender guessed from each sender's first name.
send_gender = [d.get_gender(_first_name(sender)) for sender in senders_list]

# Gender per email on the recipient side:
#   - several recipients -> 'group'
#   - exactly one        -> gender guessed from that recipient's first name
#                           (the lookup is by first name, so the full name
#                           must not be passed, same as for senders)
#   - none               -> 'unknown' (the value gender_guesser itself uses for
#                           names it cannot classify) instead of an IndexError
# The loop result is stored under its own name so the `gender` module alias
# imported above is not overwritten by a string.
receive_gender = []
for receiver in receiver_list:
    if len(receiver) > 1:
        receive_gender.append('group')
    elif receiver:
        recipient_gender = d.get_gender(_first_name(receiver[0]))
        receive_gender.append(recipient_gender)
    else:
        receive_gender.append('unknown')

Get sentiment scores

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [79]:
analyzer = SentimentIntensityAnalyzer()

# Score names returned by VADER's polarity_scores(). They are listed explicitly
# (in the alphabetical order shown in this notebook's recorded output) so that
# an email with no sentences can still be given a full row of NaNs, and so the
# column order of `sentiments` does not depend on the pandas version.
sentiment_keys = ['compound', 'neg', 'neu', 'pos']

# One dict per email holding the mean of each score over the email's sentences.
sentiments_list = []
for email in test_emails:
    sentence_sentiments = [
        analyzer.polarity_scores(sentence) for sentence in sent_tokenize(email)
    ]

    if sentence_sentiments:
        msg_sentiments = {
            key: sum(scores[key] for scores in sentence_sentiments) / len(sentence_sentiments)
            for key in sentiment_keys
        }
    else:
        # No sentence was found: there is nothing to average.
        msg_sentiments = {key: np.nan for key in sentiment_keys}

    sentiments_list.append(msg_sentiments)

sentiments = pd.DataFrame(sentiments_list, columns=sentiment_keys)

In [80]:
# Append the mean sentiment scores to the right of the existing features.
# NOTE: text_features is rebuilt from itself, so running this cell a second
# time without re-creating text_features appends the sentiment columns again.
text_features = pd.concat([text_features, sentiments], axis = 1)

# Preview the assembled feature table.
text_features.head()

Unnamed: 0,caps,commas,ex_marks,msg_len,periods,q_marks,word_count,adj,adp,adv,cconj,det,intj,noun,num,part,pron,propn,punct,space,sym,verb,num_sentences,len_sentence_mean,num_recipients,compound,neg,neu,pos
0,13,2,2,629,6,0,131,11,16,8,3,15,1,30,6,3,8,4,17,3,2.0,25,9,14.111111,3,0.007389,0.015222,0.953556,0.031222
1,11,2,3,412,3,0,72,5,10,1,1,8,1,15,1,3,8,4,10,5,0.0,15,7,10.285714,1,0.280386,0.0,0.792143,0.207857


In [83]:
# (rows, columns) of the feature table: one row per email, one column per feature.
# NOTE(review): the recorded result has 29 columns, while the cosine-similarity
# cell further down reports 30-dimensional cluster vectors - one feature column
# appears to be missing here; confirm against the feature list used to build
# the cluster vectors.
text_features.shape

(2, 29)

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

In [81]:
# Load the stored cluster vectors (used below as a mapping of cluster id ->
# vector). The file is opened in a `with` block so the handle is always closed.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('12-cluster_vectors.p', 'rb') as cluster_file:
    cluster_vectors = pickle.load(cluster_file)

# Feature table as a 2-D array, one row per email. `.values` replaces
# DataFrame.as_matrix(), which was deprecated and then removed in pandas 1.0,
# and returns the same array.
new_vectors = text_features.values

In [82]:
# For every email vector, pick the cluster whose stored vector has the highest
# cosine similarity; the result is a list of (cluster id, similarity) tuples.
# NOTE(review): the recorded run of this cell failed because the cluster
# vectors have 30 features while the rows of new_vectors have 29. The feature
# columns must match the layout the cluster vectors were built with before
# this cell can succeed.
cluster_assignments = []
for new in new_vectors:
    # Start below any possible similarity (cosine similarity lies in [-1, 1]),
    # so a cluster is still selected when every similarity is zero or negative.
    best_score = -np.inf
    cluster_num = None
    for cluster, vector in cluster_vectors.items():
        # cosine_similarity expects 2-D inputs; [0][0] extracts the single value.
        score = cosine_similarity(vector.reshape(1, -1), new.reshape(1, -1))[0][0]
        if score > best_score:
            best_score = score
            cluster_num = cluster
    cluster_assignments.append((cluster_num, best_score))

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 30 while Y.shape[1] == 29

In [None]:
# Show the (cluster id, best similarity) tuple assigned to each email.
cluster_assignments

In [50]:
# Human-readable description of each cluster, listed in cluster-id order
# starting at -1.
_cluster_descriptions = (
    'Work may be a little stressful',
    'Direct and to the point',
    'Business as usual',
    'Casual and personal',
    'Company announcements',
    'Meetings and interviews',
)

# Mapping of cluster id (-1 through 4) to its description.
cluster_labels = dict(enumerate(_cluster_descriptions, start=-1))