In [1]:
# python 3
import pandas as pd
import nltk
import re

from nltk.corpus import stopwords
from nltk import ngrams

pd.set_option('display.max_colwidth', -1)
pd.set_option("display.max_rows",3000)

In [2]:
# get majority labels and all labels

def clean_label(df_agent1, df_agent2, df_agent3):
    label_column = df_agent1.filter(regex='Label').columns
    label1, label2, label3, label4, label5 = label_column[0], \
                                             label_column[1], \
                                             label_column[2], \
                                             label_column[3], \
                                             label_column[4]
    
    df_agent1[label_column] = df_agent1[label_column].fillna('null_value')
    df_agent2[label_column] = df_agent2[label_column].fillna('null_value')
    df_agent3[label_column] = df_agent3[label_column].fillna('null_value')
    
    df_agent1['agent1_labels'] = df_agent1.apply(lambda x: set([x[label1],
                                 x[label2],
                                 x[label3],
                                 x[label4],                         
                                 x[label5]]),axis=1)
    
    df_agent1['agent2_labels'] = df_agent2.apply(lambda x: set([x[label1],
                                 x[label2],
                                 x[label3],
                                 x[label4],                         
                                 x[label5]]),axis=1)
    
    df_agent1['agent3_labels'] = df_agent3.apply(lambda x: set([x[label1],
                                 x[label2],
                                 x[label3],
                                 x[label4],                         
                                 x[label5]]),axis=1)
    
    df_agent1['majority_vote'] = df_agent1.apply(lambda x: x['agent1_labels'].intersection(x['agent2_labels'])\
                                         .union(x['agent2_labels'].intersection(x['agent3_labels']))\
                                         .union(x['agent1_labels'].intersection(x['agent3_labels']))\
                               , axis=1)
    
    df_agent1['union_vote'] = df_agent1.apply(lambda x: x['agent1_labels'].union(x['agent2_labels'])\
                                         .union(x['agent3_labels'])\
                               , axis=1)
    
    # trick to associate new variable with column method operations
    mv = df_agent1['majority_vote']
    mv.apply(lambda x: x.discard('null_value'));
    
    uv = df_agent1['union_vote']
    uv.apply(lambda x: x.discard('null_value'));
    return df_agent1.drop(label_column, axis=1)

In [3]:
# get data

one_ic = pd.read_csv('data/1-ic.csv')
one_gc = pd.read_csv('data/1-gc.csv')
one_sc = pd.read_csv('data/1-sc.csv')

two_jm = pd.read_csv('data/2-jm.csv')
two_mg = pd.read_csv('data/2-mg.csv')
two_nb = pd.read_csv('data/2-nb.csv')

three_rs = pd.read_csv('data/3-rs.csv')
three_rt = pd.read_csv('data/3-rt.csv')
three_sj = pd.read_csv('data/3-sj.csv')

tph_batch1 = pd.read_csv('data/tph_batch1.csv')

In [4]:
tph_batch1.rename({'sampled_bid_id': 'bid_id'}, axis='columns', inplace=True)

In [5]:
# clean labels

df1 = clean_label(one_ic, one_gc, one_sc)
df2 = clean_label(two_jm, two_mg, two_nb)
df3 = clean_label(three_rs, three_rt, three_sj)

In [6]:
# merge data

df = df1.append(df2).append(df3)

In [7]:
df = df.merge(tph_batch1[['bid_id','message_timestamp','message']], \
              how='left', on = ['bid_id','message_timestamp'])

In [8]:
#df['is_hire_majority'] = df.apply(lambda x: ('Hire' in x['majority_vote'])*1, axis=1)
#df['is_hire_any'] = df.apply(lambda x: ('Hire' in x['union_vote'])*1, axis=1)

In [9]:
one_gc['Label 1'].unique()

array(['Scheduling - Meeting or Job', 'Price', 'Location', 'Job Details',
       'Confirmation - Meeting', 'Contact Information', 'Payment',
       'null_value', 'Follow-up', 'Confirmation - Contact', 'Hire',
       'Rejection', 'Considering'], dtype=object)

In [10]:
df['is_contact_info'] = df.apply(lambda x: ('Contact Information' in x['majority_vote'])*1, axis=1)
df['is_scheduling'] = df.apply(lambda x: ('Scheduling - Meeting or Job' in x['majority_vote'])*1, axis=1)
df['is_price'] = df.apply(lambda x: ('Price' in x['majority_vote'])*1, axis=1)
df['is_payment'] = df.apply(lambda x: ('Payment' in x['majority_vote'])*1, axis=1)
df['is_generic'] = df.apply(lambda x: ('Generic Answer' in x['majority_vote'])*1, axis=1)
df['is_considering'] = df.apply(lambda x: ('Considering' in x['majority_vote'])*1, axis=1)
df['is_follow_up'] = df.apply(lambda x: ('Follow-up' in x['majority_vote'])*1, axis=1)
df['is_rejection'] = df.apply(lambda x: ('Rejection' in x['majority_vote'])*1, axis=1)
df['is_details'] = df.apply(lambda x: ('Job Details' in x['majority_vote'])*1, axis=1)
df['is_contact'] = df.apply(lambda x: ('Confirmation - Contact' in x['majority_vote'])*1, axis=1)
df['is_meeting'] = df.apply(lambda x: ('Confirmation - Meeting' in x['majority_vote'])*1, axis=1)

In [11]:
# helper function to dislay frequency of words from a blob of text 
def get_info(txt):
    words = nltk.tokenize.word_tokenize(txt)

    bigrams = nltk.bigrams(words)
    trigrams = nltk.trigrams(words)

    word_dist = nltk.FreqDist(words)
    top_N = 200
    custom_stopwords = set((u'.', u',', u'?', u'!', u')', u':', u'\'s', u'('))

    words_except_stop_dist = nltk.FreqDist(w for w in words if w not in custom_stopwords) 

    print('Unigram frequencies:')
    rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')
    print(rslt)
    print('=' * 60)

    # Bigram Frequencies
    print('Bigram frequencies:')
    bigrams_freq = nltk.FreqDist(bigrams)
    rslt = pd.DataFrame(bigrams_freq.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')

    print(rslt)
    print('=' * 60)

    # Trigram Frequencies
    print('Trigram frequencies:')
    trigrams_freq = nltk.FreqDist(trigrams)
    rslt = pd.DataFrame(trigrams_freq.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')

    print(rslt)
    print('=' * 60)


In [12]:
# helper function to dislay precision and recall information from a term and a specific column we evaluate against 
# added df_set options: "all", "customers", and "pro" to evaluate only in that group

def display_info(term, col, df_set='all'):
    pat1 = re.compile("^.*"+term+".*$" , flags = re.DOTALL) # equivalent to str.contains # re.DOTALL applies REGEX to muliple line
    
    if df_set == 'all':
        working_df = df
    elif df_set == 'customers':
        working_df = df[df['message_sender'] == 'Customer']
    else:
        working_df = df[df['message_sender'] == 'Pro']
    predicted = working_df['message'].str.lower().str.match(pat1).astype(int)
    precision = working_df.loc[predicted[predicted==1].index, col].sum()/\
            working_df.loc[predicted[predicted==1].index, col].count()

    recall = working_df.loc[predicted[predicted==1].index, col].sum()/\
            working_df[col].sum()    
        
    false_positive_cases = working_df.loc[(working_df['message'].str.lower().str.match(pat1)) \
                              & (working_df[col] == 0)][['bid_id', 'message','majority_vote', 'union_vote']]
    
    # print((false_positive_cases.to_string()))
    
    num_terms = working_df.loc[predicted[predicted==1].index, col].sum()
    return [term, 
            "{0:.0f}%".format(precision*100), 
            "{0:.0f}%".format(recall*100), 
            num_terms]

In [13]:
df.columns

Index(['bid_id', 'bid_id_header', 'message_timestamp', 'message_sender',
       'pii_cleaned_message', 'Note', 'category', 'agent1_labels',
       'agent2_labels', 'agent3_labels', 'majority_vote', 'union_vote',
       'message', 'is_contact_info', 'is_scheduling', 'is_price', 'is_payment',
       'is_generic', 'is_considering', 'is_follow_up', 'is_rejection',
       'is_details', 'is_contact', 'is_meeting'],
      dtype='object')

In [14]:
# helper function to dislay precision and recall information from a term and a specific column we evaluate against 
# added df_set options: "all", "customers", and "pro" to evaluate only in that group

def display_info_multiple(key_words, col, df_set = 'all'):
    regex_pattern = "^.*("
    for i,w in enumerate(key_words):
        if i != len(key_words) -1:
            regex_pattern += w +'|'
        else:
            regex_pattern += w 

    regex_pattern += ").*$"
    
    if df_set == 'all':
        working_df = df
    elif df_set == 'customers':
        working_df = df[df['message_sender'] == 'Customer']
    else:
        working_df = df[df['message_sender'] == 'Pro']
    
    pat1 = re.compile(regex_pattern , flags = re.DOTALL) # equivalent to str.contains
    predicted = working_df['message'].str.lower().str.match(pat1).astype(int)
    precision = working_df.loc[predicted[predicted==1].index, col].sum()/\
            working_df.loc[predicted[predicted==1].index, col].count()

    recall = working_df.loc[predicted[predicted==1].index, col].sum()/\
            working_df[col].sum()    
        
    false_positive_cases = working_df.loc[(df['message'].str.lower().str.match(pat1)) \
                              & (working_df[col] == 0)][['bid_id', 'message','majority_vote', 'union_vote']]
    
    num_terms = working_df.loc[predicted[predicted==1].index, col].sum()
    return [regex_pattern, 
            "{0:.0f}%".format(precision*100), 
            "{0:.0f}%".format(recall*100), 
            num_terms
           ]

In [15]:
df.columns

Index(['bid_id', 'bid_id_header', 'message_timestamp', 'message_sender',
       'pii_cleaned_message', 'Note', 'category', 'agent1_labels',
       'agent2_labels', 'agent3_labels', 'majority_vote', 'union_vote',
       'message', 'is_contact_info', 'is_scheduling', 'is_price', 'is_payment',
       'is_generic', 'is_considering', 'is_follow_up', 'is_rejection',
       'is_details', 'is_contact', 'is_meeting'],
      dtype='object')

In [16]:
# test function
display_info('available', 'is_scheduling')

['available', '91%', '13%', 671]

In [17]:
# test function
display_info_multiple(['available','weekday'], 'is_scheduling')

['^.*(available|weekday).*$', '91%', '13%', 693]

## Scheduling

In [18]:
df[(df.is_scheduling==1) & (df.is_generic == 1)].shape

(88, 24)

In [19]:
df[(df.is_scheduling==1) & (df.is_generic == 0)].shape

(5094, 24)

In [20]:
df[(df.is_scheduling==1)].shape

(5182, 24)

In [21]:
get_info(df[df.is_scheduling == 1].message.str.lower().str.cat(sep=' '))

Unigram frequencies:
              Frequency
Word                   
you           5457     
i             5224     
to            3996     
the           3374     
and           2593     
a             2449     
for           2435     
can           1780     
we            1682     
is            1577     
be            1528     
have          1386     
me            1352     
if            1307     
at            1283     
that          1248     
in            1234     
would         1232     
will          1209     
on            1187     
your          1172     
do            1171     
it            990      
my            989      
are           980      
time          977      
of            942      
or            905      
this          861      
tomorrow      853      
with          776      
hi            760      
so            740      
work          729      
available     712      
am            702      
call          693      
what          691      
know          686  

In [22]:
# TO DO: Tina to investigate more word/phrases here. Put everything you tried in here.
# TO DO: add month, date, time pattern (e.g. "digit am" etc)

schedule_keywords = \
["book", "appointment","schedule","what time",'available', 'availability',\
 'works for you','morning','afternoon','evening', 'tomorrow', 'next week',\
 'this morning','this afternoon','this evening',
 'tomorrow morning', 'tomorrow afternoon', 'tomorrow evening',
 'monday','tuesday','wednesday','thursday','friday','saturday','sunday','weekday','weekend',\
 'me know when','a call', 'give you a', 'works for you', 'work for you', 'forward to hearing',\
 'time for you',
 'today',' am ', ' pm ', 'can you make',\
 's your next availability', 'when i can call you', \
 'are you available on my date', 'when can i give you a call', ' give you a call', \
 'see you at', 'would like to book', 'book you',
'7am','8am', '9am', '10am', '11am', '12pm', '1pm', '2pm', 
'3pm', '4pm', '5pm', '6pm', '7pm', '8pm', '9pm', '10pm',
'7 am','8 am', '9 am', '10 am', '11 am', '12 pm', '1 pm', '2 pm', 
'3 pm', '4 pm', '5 pm', '6 pm', '7 pm', '8 pm', '9 pm', '10 pm',
'at 7', 'at 8', 'at 9', 'at 10', 'at 11', 'at 12', 
'at 2', 'at 3', 'at 4', 'at 5', 'at 6', 
'@7', '@8', '@9', '@10', '@11', '@12', 
'@2', '@3', '@4', '@5', '@6', 
'7 :', '8 :', '9 :', '10 :', '11 :', '12 :', 
'1 :', '2 :', '3 :', '4 :', '5 :', '6 :',
'1:', '2:', '3:', '4:', '5:', '6:', '7:', 
'8:', '9:', '10:', '11:', '12:', 
'13:', '14:', '15:', '16:', '17:',
'18:', '19:', '20:', '21:', '22:',
'monday','tuesday','wednesday','thursday','friday','saturday','sunday',
' mon ', ' tues ', ' wed ', ' thurs ', ' fri ', ' sat ', ' sun ',
'january', 'february', 'march', 'april', 'june', 
'july', 'august', 'september', 'october', 'november', 'december',
' jan ', ' feb ', ' mar ', ' apr ', ' jul ', ' aug ', ' sept ',
' oct ', ' nov ', ' dec ']

In [23]:
scheduling_terms = pd.DataFrame(columns=('Term', 'Precision', 'Recall', 'Count'))
for i, term in enumerate(schedule_keywords):
    scheduling_terms.loc[i] = display_info(term, 'is_scheduling')

  from ipykernel import kernelapp as app


### Create a function that 
- gets all the unigrams, bigrams, trigrams and fourgrams from scheduling and put that into scheduling terms

In [24]:
scheduling_terms.sort_values(by=['Precision', 'Recall'], ascending=False)

Unnamed: 0,Term,Precision,Recall,Count
39,when i can call you,nan%,0%,0
40,are you available on my date,nan%,0%,0
93,@11,nan%,0%,0
94,@12,nan%,0%,0
96,@3,nan%,0%,0
99,@6,nan%,0%,0
100,7 :,nan%,0%,0
101,8 :,nan%,0%,0
104,11 :,nan%,0%,0
105,12 :,nan%,0%,0


### Scheduling Regex

In [25]:
# TO DO: Tina to finalize the set of words to use by picking from the scheduling_terms list

scheduling_regex_words = ['avail', 
                          'weekday', "weekend work", 'what time', 
                          'your address', 'would you like', 'what\'s the best',
                          'work for you','works for you', 'what works', 
                          'schedul','tomorrow',
                          'can you make', 'give you a call', 
                          'when i can call you', 'an appointment',
                          'reschedule', 'would you like', 'to set up', 'this week', 'next week',
                          'time for you',
                          'can you make', 'me know when', 'how about',
                          'give you a call', 'good for you', 'a good time',
                          'best for you', 'see u', 'your phone', 'see you at',
                          '([6-9]|[1][0-1])((am)|(a.m)|( a.m))',
                          '([1-9]|[1][0-2])((pm)|(p.m)|( p.m))',
                          '([7-9]|[1][0-1])( am)',
                          '([1-9]|[10])( pm)',
                          '(at )([2-9]|[1][0-2])',
                          '([1-9]|[1-2][0-2])(( :)|(:))',
                          'monday','tuesday','wednesday','thursday','friday','saturday','sunday',
                          ' mon ', ' tues ', ' wed ', ' thurs ', ' fri ', ' sat ', ' sun ',
                          'january', 'february', 'march', 'april', 'june', 
                          'july', 'august', 'september', 'october', 'november', 'december',
                          ' jan ', ' feb ', ' mar ', ' apr ', ' jul ', ' aug ', ' sept ',
                          ' oct ', ' nov ', ' dec ']

display_info_multiple(scheduling_regex_words, 'is_scheduling')

["^.*(avail|weekday|weekend work|what time|your address|would you like|what's the best|work for you|works for you|what works|schedul|tomorrow|can you make|give you a call|when i can call you|an appointment|reschedule|would you like|to set up|this week|next week|time for you|can you make|me know when|how about|give you a call|good for you|a good time|best for you|see u|your phone|see you at|([6-9]|[1][0-1])((am)|(a.m)|( a.m))|([1-9]|[1][0-2])((pm)|(p.m)|( p.m))|([7-9]|[1][0-1])( am)|([1-9]|[10])( pm)|(at )([2-9]|[1][0-2])|([1-9]|[1-2][0-2])(( :)|(:))|monday|tuesday|wednesday|thursday|friday|saturday|sunday| mon | tues | wed | thurs | fri | sat | sun |january|february|march|april|june|july|august|september|october|november|december| jan | feb | mar | apr | jul | aug | sept | oct | nov | dec ).*$",
 '80%',
 '77%',
 3979]

Currently the best is 83%, 71%.

In [26]:
display_info_multiple(scheduling_regex_words, 'is_scheduling')

["^.*(avail|weekday|weekend work|what time|your address|would you like|what's the best|work for you|works for you|what works|schedul|tomorrow|can you make|give you a call|when i can call you|an appointment|reschedule|would you like|to set up|this week|next week|time for you|can you make|me know when|how about|give you a call|good for you|a good time|best for you|see u|your phone|see you at|([6-9]|[1][0-1])((am)|(a.m)|( a.m))|([1-9]|[1][0-2])((pm)|(p.m)|( p.m))|([7-9]|[1][0-1])( am)|([1-9]|[10])( pm)|(at )([2-9]|[1][0-2])|([1-9]|[1-2][0-2])(( :)|(:))|monday|tuesday|wednesday|thursday|friday|saturday|sunday| mon | tues | wed | thurs | fri | sat | sun |january|february|march|april|june|july|august|september|october|november|december| jan | feb | mar | apr | jul | aug | sept | oct | nov | dec ).*$",
 '80%',
 '77%',
 3979]

In [27]:
# helper function to dislay frequency of words from a blob of text 
def get_info(txt, top_N=200):
    words = nltk.tokenize.word_tokenize(txt)

    bigrams = nltk.bigrams(words)
    trigrams = nltk.trigrams(words)
    fourgrams = ngrams(words,4)

    word_dist = nltk.FreqDist(words)
    top_N = top_N
    custom_stopwords = set((u'.', u',', u'?', u'!', u')', u':', u'\'s', u'('))

    words_except_stop_dist = nltk.FreqDist(w for w in words if w not in custom_stopwords) 

    print('Unigram frequencies:')
    rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')
    print(rslt)
    print('=' * 60)

    # Bigram Frequencies
    print('Bigram frequencies:')
    bigrams_freq = nltk.FreqDist(bigrams)
    rslt = pd.DataFrame(bigrams_freq.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')

    print(rslt)
    print('=' * 60)

    # Trigram Frequencies
    print('Trigram frequencies:')
    trigrams_freq = nltk.FreqDist(trigrams)
    rslt = pd.DataFrame(trigrams_freq.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')

    print(rslt)
    print('=' * 60)
    
    # Fourgram Frequencies
    print('Fourgram frequencies:')
    fourgrams_freq = nltk.FreqDist(fourgrams)
    rslt = pd.DataFrame(fourgrams_freq.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')

    print(rslt)
    print('=' * 60)

In [28]:
# A function that checks all the top unigrams, bigrams, and trigrams
# didn't work well!

def create_lst_to_check(topN, txt):
    lst_words_to_check = []
    for word in unigrams_freq.most_common(topN):
        lst_words_to_check.append(word[0])
    for word in bigrams_freq.most_common(topN):
        lst_words_to_check.append(word[0][0] + ' ' + word[0][1])
    for word in trigrams_freq.most_common(topN):
        lst_words_to_check.append (word[0][0] + ' ' + word[0][1] + ' ' + word[0][2])
    return lst_words_to_check

increases recall but decreases precision
- appointment
- morning, afternoon, evening (figure out where it's used)

In [29]:
payment_regex_words = ['the deposit','the contract and','do you accept',
                       'you can pay', 'check or cash', 'do i pay',
                       'debit', 'do you accept credit', 'do you take cash',
                       'debit', 'credit card', 'the check', 'paypal', 'venmo', 'a deposit',
                       'cash', 'payment', 'invoice'
                    ]

In [30]:
# Example of further investigation
# the left over cases, i.e. 1 - recall cases
pat1 = '^.*(availability|available|weekday|what time|weekday|work for you|works for you|schedul|tomorrow|afternoon|monday|tuesday|wednesday|thursday|friday|saturday|sunday|can you make|when can i give you a call).*$'
df[(df.is_scheduling == 1) & (~df['message'].str.lower().str.match(pat1))].message.head(100)

0      Greetings..\nAre you available for resume writing?                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
3      April. Can likely do est this wk & Work maybe the next. let me know your phone # & address.  Tx. Wade                                      

In [31]:
## Example of further investigations
## investigate the regex related to word "book"
pat1 = re.compile('^.*(book).*$' , flags = re.DOTALL) # equivalent to str.contains
pat2 = re.compile('^.*(availability|available|weekday|what time|weekday|work for you|works for you|schedul|tomorrow|afternoonmonday|tuesday|wednesday|thursday|friday|saturday|sunday).*$' , flags = re.DOTALL) # equivalent to str.contains

# df[(df.is_scheduling == 1) & (df['message'].str.lower().str.match(pat1))\
#   & (~df['message'].str.lower().str.match(pat2))].message.head(100)

get_info(df[(df.is_scheduling == 1) & (df['message'].str.lower().str.match(pat1))\
   & (~df['message'].str.lower().str.match(pat2))].message.str.lower().str.cat(sep=' '))

Unigram frequencies:
               Frequency
Word                    
i              90       
the            85       
you            85       
to             75       
for            42       
and            40       
your           32       
book           32       
is             29       
a              28       
we             25       
booked         24       
if             24       
so             23       
in             22       
do             22       
me             21       
as             20       
on             19       
that           18       
my             18       
be             17       
would          17       
it             17       
have           17       
$              17       
will           16       
can            16       
are            15       
let            15       
at             14       
need           13       
thank          13       
'm             13       
date           12       
with           12       
this           12       
abou

## Payment

In [32]:
df[(df.is_payment==1) & (df.is_generic == 1)].shape

(8, 24)

In [33]:
df[(df.is_payment==1)].shape

(437, 24)

In [34]:
get_info(df[df.is_payment == 1].message.str.lower().str.cat(sep=' '))

Unigram frequencies:
           Frequency
Word                
the        758      
you        753      
i          726      
to         626      
and        490      
a          340      
for        328      
is         248      
can        246      
your       232      
we         231      
of         228      
do         222      
will       219      
that       201      
be         198      
in         187      
me         187      
have       184      
it         178      
if         172      
or         169      
on         154      
my         148      
with       144      
would      131      
check      117      
so         116      
payment    113      
at         113      
as         112      
send       106      
deposit    99       
cash       96       
$          91       
pay        90       
are        86       
get        86       
need       81       
like       79       
this       78       
know       76       
work       73       
also       71       
thanks     70

In [35]:
# TO DO: Tina to investigate more word/phrases here. Put everything you tried in here.
# Hint: use the investigation examples to get inspirations 

payment_keywords = \
["pay", "payment", "credit card", "cash", "check", "debit", "invoice", "payment", "deposit", "paypal", \
 "the deposit", "send you", "the contract", "a deposit", "the check", \
 "do you accept", "i will need", "you can pay", "will send you",\
 "send me your", "can send you", "a check", "check or cash",\
 "do i pay", "send you an", "send you the", "deposit to hold the",\
 "do you accept credit", "do you take cash", "check or credit card", "payment do you accept", \
 "at your earliest convenience", "hold the date", "send you an invoice", "& quote"]

In [36]:
payment_terms = pd.DataFrame(columns=('Term', 'Precision', 'Recall', 'Count'))
for i, term in enumerate(payment_keywords):
    payment_terms.loc[i] = display_info(term, 'is_payment')


  from ipykernel import kernelapp as app


In [37]:
payment_terms.sort_values(by=['Precision', 'Recall'], ascending=False)

Unnamed: 0,Term,Precision,Recall,Count
34,& quote,nan%,0%,0
2,credit card,95%,10%,42
9,paypal,91%,7%,31
14,the check,91%,5%,20
23,do i pay,90%,2%,9
21,a check,89%,9%,41
3,cash,87%,22%,97
10,the deposit,85%,7%,29
1,payment,82%,24%,104
7,payment,82%,24%,104


In [38]:
# TO DO: Tina to finalize the set of words to use by picking from the payment_terms list

payment_regex_words = ['debit','credit card','the check','paypal', 'venmo',\
                        'the deposit', 'do i pay', 'a deposit'
                    ]

In [39]:
# TO DO: Tina to investigate more word/phrases here. Put everything you tried in here.
# Hint: use the investigation examples to get inspirations 

payment_keywords = \
["pay", "payment", "credit card", "cash", "check", "debit", "invoice", "payment", "deposit", "paypal", \
 "the deposit", "send you", "the contract", "a deposit", "the check", \
 "do you accept", "i will need", "you can pay", "will send you",\
 "send me your", "can send you", "a check", "check or cash",\
 "do i pay", "send you an", "send you the", "deposit to hold the",\
 "do you accept credit", "do you take cash", "check or credit card", "payment do you accept", \
 "at your earliest convenience", "hold the date", "send you an invoice"]

# cannot imrpove recall without decreasing precision

### Payment Regex

In [40]:
payment_regex_words = ['a check', 'the check', "cheque",
                       'debit card', 'check or cash', 'cash',
                       'credit card', 'paypal', 'venmo', 
                       'deposit', 'the contract and',
                       'you can pay', 'do i pay', 'can i pay', 'should i pay',
                       'do we pay', 'can we pay', 'should we pay',
                       'pay me', 'pay you', 'pay us',
                       'do you take', 'do you accept',
                        'payment', 'invoice']

display_info_multiple(payment_regex_words, 'is_payment')

['^.*(a check|the check|cheque|debit card|check or cash|cash|credit card|paypal|venmo|deposit|the contract and|you can pay|do i pay|can i pay|should i pay|do we pay|can we pay|should we pay|pay me|pay you|pay us|do you take|do you accept|payment|invoice).*$',
 '81%',
 '84%',
 368]

### Other Categories
- Rejection, Considering, Follow-up

## Rejection

In [41]:
df[(df.is_rejection==1) & (df.is_generic == 1)].shape

(1, 24)

In [42]:
df[(df.is_rejection==1)].shape

(321, 24)

In [43]:
get_info(df[df.is_rejection == 1].message.str.lower().str.cat(sep=' '), 200)

Unigram frequencies:
               Frequency
Word                    
i              530      
to             259      
you            259      
for            254      
the            220      
a              154      
and            150      
have           132      
we             103      
your           100      
but            100      
thank          95       
thanks         91       
that           91       
in             91       
sorry          90       
it             88       
do             82       
n't            81       
so             79       
of             79       
this           79       
not            77       
'm             75       
will           75       
is             74       
with           73       
me             71       
my             66       
be             64       
time           58       
on             56       
hi             55       
am             55       
can            48       
work           45       
was            44       
are 

In [44]:
df[df.is_rejection == 1]

Unnamed: 0,bid_id,bid_id_header,message_timestamp,message_sender,pii_cleaned_message,Note,category,agent1_labels,agent2_labels,agent3_labels,...,is_scheduling,is_price,is_payment,is_generic,is_considering,is_follow_up,is_rejection,is_details,is_contact,is_meeting
91,53737054,,2017-10-15 23:10:42.213031 UTC,Pro,"HI again, thanks for getting back to me so quickly. Unfortunately the rest of Oct and Nov, we are fully booked already, that's why I wanted to confirm the date. I wish you luck with finding someone to cater your special event, let me know if you need any help.",,Wedding and Event Catering,"{null_value, Scheduling - Meeting or Job, Rejection}","{null_value, Scheduling - Meeting or Job, Rejection}","{null_value, Scheduling - Meeting or Job, Rejection}",...,1,0,0,0,0,0,1,0,0,0
103,54064853,,2017-07-27 22:16:14.750218 UTC,Customer,"Dear [PERSON_NAME]: I have forwarded your information to a friend of mine who is looking for piano lessons. I thank you for you offer, but I do not think harpsichord lessons on a piano would work because the touch and action are so different. Sincerely yours, [PERSON_NAME]",,Piano Lessons,"{null_value, Rejection, Job Details}","{null_value, Rejection, Job Details}","{null_value, Rejection, Job Details}",...,0,0,0,0,0,0,1,1,0,0
166,56770358,,2017-05-11 00:45:20.840965 UTC,Customer,"Hi!! Thank you so much for all the info as well as providing discounts and such! However, I was crunching #s for our budget and I was able to find someone that fit our budget and is also located closer to town. Thanks again for all the info!",,Wedding and Event Makeup,"{null_value, Rejection}","{null_value, Rejection}","{null_value, Rejection}",...,0,0,0,0,0,0,1,0,0,0
220,57589831,,2017-05-01 15:52:49.528142 UTC,Customer,No and no thank you all together,,Photo Booth Rental,"{null_value, Rejection}","{null_value, Rejection}","{null_value, Rejection}",...,0,0,0,0,0,0,1,0,0,0
225,57592064,,2017-05-18 18:28:38.119675 UTC,Pro,"Unfortunately, I'm not available on that date. So sorry! But I think you have been in contact with the Top Tier band leader, [PERSON_NAME]. He will have access to other singers that can fill in. Hopefully, you guys can book the date! They are an amazing band!",,Music Entertainment,"{null_value, Rejection}","{null_value, Rejection}","{null_value, Rejection, Job Details}",...,0,0,0,0,0,0,1,0,0,0
455,57881717,,2017-07-27 15:03:35.908843 UTC,Customer,"No, we have a videographer. Thanks for your quote.",,Wedding Videography,"{null_value, Rejection}","{null_value, Price, Rejection}","{null_value, Price, Rejection, Job Details}",...,0,1,0,0,0,0,1,0,0,0
507,57971140,,2017-06-19 12:58:36.272885 UTC,Customer,[PERSON_NAME] - I've selected another videographer for my event. Thank you so much for your time. I will definitely keep you in mind for any future projects. [PERSON_NAME],,Wedding and Event Videography,"{null_value, Rejection}","{null_value, Rejection}","{null_value, Rejection, Job Details}",...,0,0,0,0,0,0,1,0,0,0
726,58226376,,2017-05-08 20:16:26.459148 UTC,Customer,Wow--no thanks.,,Skylight Installation or Repair,"{null_value, Rejection}","{null_value, Rejection}","{null_value, Rejection}",...,0,0,0,0,0,0,1,0,0,0
773,58271575,,2017-06-02 21:01:20.50324 UTC,Pro,"Hi [PERSON_NAME], I'm sorry I wasn't a good fit for you. Do you have any feedback to help me win future customers?",,Fence and Gate Installation,"{null_value, Rejection}","{null_value, Rejection}","{null_value, Rejection}",...,0,0,0,0,0,0,1,0,0,0
819,58333887,,2017-05-09 23:34:02.20173 UTC,Customer,"No that doesn't work for me, its too early",,Wedding and Event Makeup,"{null_value, Rejection}","{null_value, Scheduling - Meeting or Job}","{null_value, Scheduling - Meeting or Job, Rejection}",...,1,0,0,0,0,0,1,0,0,0


rejections are often reschedules... which is why they are hard to detect in terms of precision.

In [45]:
# TO DO: Tina to investigate more word/phrases here. Put everything you tried in here.
# Hint: use the investigation examples to get inspirations 

rejection_keywords = \
["unfortunately i'm", "hired", "different route", "cancel", "so sorry",
 "need to cancel", "have to cancel",
 "i have decided", "we have decided", 'thank you for your time',
 'keep you in mind for', 'already hired', 'hired someone else', 
 'found someone else', 'a good fit for', 'i\'m sorry but',
 'i am not available', 'keep your contact', 'to waste your time',
 'offer, but i', 'good fit for you', 'hired an', 'wasted your time',
 'change of plans', 'as of now', 'no thank you', 'selected another',
 'no thanks', 'look elsewhere', 'do not contact me', 'sorry', 'but i appreciate',
 'holding off', 'thanks anyway', 'went with another', 'but thank you',
 'reschedule', 'too far', 'found someone']
 
rejection_terms = pd.DataFrame(columns=('Term', 'Precision', 'Recall', 'Count'))
for i, term in enumerate(rejection_keywords):
    rejection_terms.loc[i] = display_info(term, 'is_rejection')
    
rejection_terms.sort_values(by=['Precision', 'Recall'], ascending=False)

  from ipykernel import kernelapp as app


Unnamed: 0,Term,Precision,Recall,Count
13,found someone else,nan%,0%,0
5,need to cancel,83%,2%,5
35,but thank you,77%,3%,10
11,already hired,75%,2%,6
22,wasted your time,67%,1%,2
7,i have decided,60%,1%,3
16,i am not available,60%,1%,3
25,no thank you,60%,1%,3
8,we have decided,57%,1%,4
23,change of plans,50%,1%,2


### Rejection Regex

In [46]:
# TO DO: Tina to investigate more word/phrases here. Put everything you tried in here.
# Hint: use the investigation examples to get inspirations 

rejection_keywords = ["(i'm sorry).*(i don't think)"]
 
rejection_terms = pd.DataFrame(columns=('Term', 'Precision', 'Recall', 'Count'))
for i, term in enumerate(rejection_keywords):
    rejection_terms.loc[i] = display_info(term, 'is_rejection')
    
rejection_terms.sort_values(by=['Precision', 'Recall'], ascending=False)

Unnamed: 0,Term,Precision,Recall,Count
0,(i'm sorry).*(i don't think),100%,0%,1


In [47]:
rejection_regex_words = ['different route', 'have to cancel',
                         'keep you in mind for', 'hired someone else', 'thanks anyway'
                         'keep your contact', 'to waste your time', 'offer, but i',
                         'different direction', 'look elsewhere',
                         'selected another', 'no thanks', 'do not contact me',
                         'went with another', 'holding off', "(i'm sorry).*(i don't think)",
                         "(can't make it).*(sorry)", "(sorry).*(good luck)"]

display_info_multiple(rejection_regex_words, 'is_rejection')

["^.*(different route|have to cancel|keep you in mind for|hired someone else|thanks anywaykeep your contact|to waste your time|offer, but i|different direction|look elsewhere|selected another|no thanks|do not contact me|went with another|holding off|(i'm sorry).*(i don't think)|(can't make it).*(sorry)|(sorry).*(good luck)).*$",
 '100%',
 '17%',
 53]

## Considering

In [48]:
df[(df.is_considering==1) & (df.is_generic == 1)].shape

(0, 24)

In [49]:
df[(df.is_considering==1)].shape

(227, 24)

In [50]:
# checking that only customers are considering
df[(df.is_considering==1) & (df.message_sender == 'Customer')].shape

(226, 24)

In [51]:
df[df.is_considering == 1]

Unnamed: 0,bid_id,bid_id_header,message_timestamp,message_sender,pii_cleaned_message,Note,category,agent1_labels,agent2_labels,agent3_labels,...,is_scheduling,is_price,is_payment,is_generic,is_considering,is_follow_up,is_rejection,is_details,is_contact,is_meeting
82,48887274,,2017-07-02 16:44:43.484491 UTC,Customer,Hi [PERSON_NAME]. I am on vacation this week but I will get back to you shortly. Thanks for your patience.\n[PERSON_NAME],,Apartment Cleaning,"{null_value, Scheduling - Meeting or Job, Considering}","{null_value, Considering, Scheduling - Meeting or Job}","{null_value, Scheduling - Meeting or Job, Considering}",...,1,0,0,0,1,0,0,0,0,0
159,56770358,56770358.0,2017-05-02 17:50:21.714192 UTC,Customer,"Hi!! I was looking into booking you guys for the 3+ booking price. Do you travel to [LOCATION], or will that be too far?",,Wedding and Event Makeup,"{null_value, Considering, Price, Location}","{null_value, Price, Location}","{null_value, Considering, Price, Location}",...,0,1,0,0,1,0,0,0,0,0
215,57565517,,2017-05-02 17:48:46.704659 UTC,Customer,"Ok. This looks good. I'm going to run this by my husband this evening, and we can hopefuly get this locked down tomorrow! Thank You!",,Wedding and Event Catering,"{null_value, Considering}","{null_value, Scheduling - Meeting or Job, Considering}","{null_value, Scheduling - Meeting or Job, Considering}",...,1,0,0,0,1,0,0,0,0,0
288,57752819,,2017-05-02 01:53:41.380248 UTC,Customer,Ok I will do some thinking and get back to you,,Dog Training,"{null_value, Considering}","{null_value, Considering}","{null_value, Considering}",...,0,0,0,0,1,0,0,0,0,0
521,58023109,,2017-05-05 23:11:15.970708 UTC,Customer,"[PERSON_NAME], I enjoyed talking with you just now and I see your initial proposal of $3,000 monthly, (retainer $2500/remainder to cover gas/fees 2x month). I will get back to you within a week.",,Commercial Photography,"{null_value, Scheduling - Meeting or Job, Price, Confirmation - Contact}","{null_value, Considering, Price, Confirmation - Contact}","{null_value, Price, Confirmation - Contact, Job Details, Considering}",...,0,1,0,0,1,0,0,0,1,0
556,58082706,,2017-05-06 00:47:16.008468 UTC,Customer,Not yet just looking,,House Cleaning,"{null_value, Considering}",{null_value},"{null_value, Considering}",...,0,0,0,0,1,0,0,0,0,0
604,58099307,,2017-05-06 15:18:02.947286 UTC,Customer,"Hi, The piano was mine as a child and I gave to my son & family. So I need to check with them on day/time. I will get back to you asap. Thank you, [PERSON_NAME]",,Piano Tuning,"{null_value, Scheduling - Meeting or Job, Considering, Job Details}","{null_value, Scheduling - Meeting or Job, Job Details}","{null_value, Scheduling - Meeting or Job, Considering, Job Details}",...,1,0,0,0,1,0,0,1,0,0
625,58135749,,2017-05-07 03:39:07.189024 UTC,Customer,Ok let me think about it. My issue is that I'm worried you'll find a lot of grammar and syntax errors so you could say 10 hours but if it turns into more then that and I can't pay... That's my concern I'm on a budget,,Editing,"{null_value, Price, Job Details}","{null_value, Considering, Job Details, Price}","{null_value, Considering, Job Details, Price}",...,0,1,0,0,1,0,0,1,0,0
692,58191035,,2017-05-08 03:07:09.798284 UTC,Customer,Coffee sounds great I need to look at my schedule and funds and I will get back to you .,,Nutritionist,"{null_value, Considering, Scheduling - Meeting or Job}","{null_value, Scheduling - Meeting or Job, Considering}","{null_value, Scheduling - Meeting or Job, Considering}",...,1,0,0,0,1,0,0,0,0,0
750,58259169,,2017-05-16 02:51:56.631359 UTC,Customer,"[PERSON_NAME], gracias [PERSON_NAME]' ..no le olvidé, estas días estoy muy ocupado, y tratando de arreglar mi horario para hacer una sita ..yo le voy avisar bastante pronto ..gracias ..[PERSON_NAME]","I did not forget, these days I'm very busy, and trying to arrange my schedule to make a visit .. I'll let you know soon enough .. thank you",Spanish Lessons,"{null_value, Considering}",{null_value},"{null_value, Considering}",...,0,0,0,0,1,0,0,0,0,0


In [52]:
df[df.is_considering == 1]['pii_cleaned_message']

82       Hi [PERSON_NAME].   I am on vacation this week but I will get back to you shortly.   Thanks for your patience.\n[PERSON_NAME]                                                                                                                                                                                                                                                                                                                                                       
159      Hi!!  I was looking into booking you guys for the 3+ booking price.  Do you travel to [LOCATION], or will that be too far?                                                                                                                                                                                                                                                                                                                                                          
215      Ok. This looks good. I'm going to r

In [53]:
get_info(df[df.is_considering == 1].message.str.lower().str.cat(sep=' '), 200)

Unigram frequencies:
             Frequency
Word                  
i            362      
you          274      
to           234      
the          159      
will         152      
and          136      
a            119      
for          111      
my           85       
thank        84       
back         82       
in           82       
get          73       
we           72       
let          71       
with         63       
thanks       60       
your         58       
have         56       
know         56       
be           55       
me           55       
of           54       
'll          54       
that         45       
this         44       
'm           44       
do           43       
so           42       
it           42       
if           41       
on           38       
am           35       
as           34       
but          33       
is           33       
not          30       
just         29       
are          28       
out          28       
or           

## Considering Analysis
('94%', '13%')

### Low Recall / (Non-covered examples) / maybe wrongly classified?

Hi!!  I was looking into booking you guys for the 3+ booking price.  Do you travel to [LOCATION], or will that be too far? 
-> just asking for clarification, labelling 

Ok. This looks good. I'm going to run this by my husband this evening, and we can hopefuly get this locked down tomorrow! Thank You!
-> another way to say discuss with my husband 
-> I can include it, but will it overfit / how do we come up with comprehensive examples of wording?

not yet just looking 
-> shouldn't really be considered as considering?
-> labelled as non-intent...

2637     I'm sorry I have to cancel this morning, my baby is sick. I will def be in touch to reschedule once he's feeling better. So sorry.                                                 
-> sometimes it should be labelled as rescheduling

3480     I will confirm by the 15th of July if that is ok. Is there trip charges?
-> but that's the only thing, if it just says will confirm, then the precision is very low

5042     Hi! I’m currently traveling but will reach out as soon as I get back. Thanks!
-> reach out, but that is low precision

7014     hello i have bad news. due to recent financial struggles i wont be able to take singing lessons in the summer the way i had planned. but if you will still be using this in the fall i will make an effort to contact you again 
-> should be rejection?

7502     Thanks [PERSON_NAME],  still researching.                    

7804     I am out of town until Tuesday will call you then.
-> can also be considered scheduling instead of considering

### Low Precision (False Positives)

- "will get back to you" is common, but it also has a lot of false positives in majority vote

high recall, but low precision sometimes
-> have less categories?

3005   60940048  Okie!! I will get back to you in few. What days are you available                                   {Scheduling - Meeting or Job}                                    {Scheduling - Meeting or Job}   
-> can be considering, just like some can be classified as scheduling


4407   62465248  I will see the board today and see what they say about deposit we run 32 tournies a year and no course has ever asked us for a deposit but I will get back to you tomorrow. 
{Payment}                                                        {Considering, Scheduling - Meeting or Job, Payment}            
4409   62465248  Meeting tomorrow night will get back to you on Friday about deposit and when we can meet             
{Scheduling - Meeting or Job, Payment}                           {Follow-up, Scheduling - Meeting or Job, Payment}              
5087   55258636  Sounds good to me. The McDonalds will work just fine. Ok great, I'll bring this info to my fiancé and I will get back to you tonight. Most likely after 7pm. Thank you Stan                                             

{Confirmation - Meeting, Scheduling - Meeting or Job, Location}  {Confirmation - Meeting, Scheduling - Meeting or Job, Location}

7817   60876584  Yes, I haven't gotten a chance to really look at as I've been very busy this weekend. I will get back to you later tonight.                                                                                           {Job Details}                                                    {Considering, Job Details}                                     
10642  58104674  Hi Tony,\nYour menu sounds great. My husband and I will get back to you with some proposed dates in June. We are looking forward to it. Best, -Elena                                                                     {Scheduling - Meeting or Job, Job Details}                       {Considering, Scheduling - Meeting or Job, Job Details}

11953  59534698  Great, we will get back to you shortly thank you!   
{}                                                               {Considering, Scheduling - Meeting or Job}                     
13153  61059158  Hi Dan  this is darlene  I  have some personal issues to deal with, I will be putting my training on hold so I can deal with these issues. I will get back to you as soon as i can\nThank you                             
{Rejection, Scheduling - Meeting or Job, Job Details}          

14093  62213167  This all sounds very good. So it sounds like it'd be easier for you to just go ahead and send them to me on the Google drive or Dropbox after the fact. Of course after you've touched them up. and I would you want me to bring you a drive or do you have them and you would just send it to me in the mail? The crazy thing is that you're going to probably want to stay after you finish taking pictures because everybody's a lot of fun. And you're going to want to eat which we would definitely invite you to do because we wouldn't want you to not eat because there's going to be stuff to eat that's actually good. And I don't know if you drink but you can have a cocktail as well. Let me just let my dad know that I am going to have a photographer come and I will get back to you before tomorrow. Then if all goes as planned which I believe it will I'll make the deposit of the $50.  
{Price, Job Details}                                             {Considering, Price, Job Details}                              
14464  62613908  Thank you very much for your response. I am not quite ready for this recording but it shouldn't be too long yet, and I am gathering information. This was very helpful and when I am ready, I will get back to you. Thanks again.\nKelly
{Rejection}                                                      {Considering, Rejection, Job Details}                          
14474  62615439  that's great! I have to head out right now so i will get back to you alright? :)                     {}                                                               {Considering, Scheduling - Meeting or Job}   

In [54]:
# TO DO: Tina to investigate more word/phrases here. Put everything you tried in here.
# Hint: use the investigation examples to get inspirations 

considering_keywords = \
['get back to you', 'will let you know', 'i\'ll get back to you', 'will be in touch',
 'and let you know', 'getting back to me', 'back to you soon', 'as soon as i',
 'talk to my husband', 'i\'ll think about', 'make a decision', 'in touch with you',
 'i\'ll dicuss', 'for following up', 'with my fiancé', 'i\'ll talk to',
 'still in the process', 'let me discuss with', 'a decision', 'let you know',
 'will contact you', 'be in touch', 'will let you', 'i will keep', 'my wife',
 'my husband', '(mak).*(final decision)', 'final decision', 'will get back to you',
 'just looking', 'will confirm', 'show my husband', 'think about it']
 
considering_terms = pd.DataFrame(columns=('Term', 'Precision', 'Recall', 'Count'))
for i, term in enumerate(considering_keywords):
    considering_terms.loc[i] = display_info(term, 'is_considering', 'customers')
    
considering_terms.sort_values(by=['Precision', 'Recall'], ascending=False)

  from ipykernel import kernelapp as app


Unnamed: 0,Term,Precision,Recall,Count
12,i'll dicuss,nan%,0%,0
2,i'll get back to you,93%,6%,13
32,think about it,89%,4%,8
9,i'll think about,80%,2%,4
26,(mak).*(final decision),75%,1%,3
27,final decision,71%,2%,5
29,just looking,7%,0%,1
14,with my fiancé,67%,2%,4
17,let me discuss with,67%,1%,2
1,will let you know,65%,9%,20


In [55]:
# TO DO: Tina to investigate more word/phrases here. Put everything you tried in here.
# Hint: use the investigation examples to get inspirations 

considering_keywords = \
['get back to you', 'will let you know', 'i\'ll get back to you', 'will be in touch',
 'and let you know', 'getting back to me', 'back to you soon', 'as soon as i',
 'talk to my husband', 'i\'ll think about', 'make a decision', 'in touch with you',
 'i\'ll dicuss', 'for following up', 'with my fiancé', 'i\'ll talk to',
 'still in the process', 'let me discuss with', 'a decision', 'let you know',
 'will contact you', 'be in touch', 'will let you', 'i will keep', 'my wife',
 'my husband', 'final decision', '[(review)|(discuss)|(check)|(speak)].*(with my husband)', 'with my wife',
 'still deciding', 'get back in touch', 'be in touch', 'will confirm by', 'show my husband',
 'keep your info', 'give you an answer', 'i\'ll let you know', 'will reach out', 'make a decision soon']
 
considering_terms = pd.DataFrame(columns=('Term', 'Precision', 'Recall', 'Count'))
for i, term in enumerate(considering_keywords):
    considering_terms.loc[i] = display_info(term, 'is_considering', 'customers')
    
considering_terms.sort_values(by=['Precision', 'Recall'], ascending=False)

  from ipykernel import kernelapp as app


Unnamed: 0,Term,Precision,Recall,Count
12,i'll dicuss,nan%,0%,0
2,i'll get back to you,93%,6%,13
9,i'll think about,80%,2%,4
26,final decision,71%,2%,5
14,with my fiancé,67%,2%,4
17,let me discuss with,67%,1%,2
30,get back in touch,67%,1%,2
1,will let you know,65%,9%,20
4,and let you know,62%,4%,8
22,will let you,61%,9%,20


### Considering Regex

In [56]:
considering_regex_words = \
['talk to my husband', 'i\'ll talk to', 'still in the process',
 'i\'ll get back to you', 'i\'ll think about', 'let me discuss with my wife',
 'let me discuss with my husband', 'let me discuss with my fiancé', 'still deciding',
 'show my husband', 'will confirm by', 'let me think about it']

# worried about overfitting

display_info_multiple(considering_regex_words, 'is_considering', 'customers')

["^.*(talk to my husband|i'll talk to|still in the process|i'll get back to you|i'll think about|let me discuss with my wife|let me discuss with my husband|let me discuss with my fiancé|still deciding|show my husband|will confirm by|let me think about it).*$",
 '94%',
 '13%',
 30]

"will get back to you"... 
- sometimes it should be considering
- get back about something else, like payment / deposit

The main issue is with high precision.

In [57]:
bid_level_messages = df.groupby('bid_id')['message'].apply(lambda x: " ".join(x)).reset_index()

In [58]:
bid_level_messages.head()

Unnamed: 0,bid_id,message
0,18314236,Hello Brian. You helped us a few years back and we are in need of an electrician again! Are you able to install a new electrical hook up for a new spa we have purchased? It requires a dedicated hook up for a 240v spa and will be outdoors. We are in Bay Park area just off I-5. Please let me know... Thanks!
1,27365009,"Sounds good! What's next? Hi Cayatana, nice to &quotesee&quote you back! Since it's been two years since we last spoke, perhaps you can tell me if you are still looking for the same thing. How would you like to work together...remotely or in-person? What is your schedule like? Once we determine our first session, you would send payment before hand to my PayPal account before we proceed. I will also send you some forms to complete prior to our session, as well. Best, Amy Hi! I’m currently traveling but will reach out as soon as I get back. Thanks! Sounds good and safe travels! Best, Amy"
2,29404730,Greetings..\nAre you available for resume writing?
3,30265067,What's your next availability? I would like to get an estimate for siding repair April. Can likely do est this wk & Work maybe the next. let me know your phone # & address. Tx. Wade 4 Cameroons Pl. Durham 919-358-2996
4,31521550,Can you get the door too if I give you photos and measurement


In [59]:
bid_level_messages[bid_level_messages['bid_id'] == 60940048]

Unnamed: 0,bid_id,message
954,60940048,"Hello! Its a two bedroom apartment and the 3rd room is the living room. We are on the 1st floor. We are moving this weekend so the apartment will be empty. It is an apartment complex. Parking is accessible but it is kind of little bit far. The only thing is we do need an actual receipt in the end. We are in new hempshire Londonderry We need to get our hoses from the truck to the apartment how far away is the parking? We can use a window or sliding door. 50 - 60 ft Our price would be $+\n135.00. Our price would be 135.00. Okie!! I will get back to you in few. What days are you available I have Monday AM (tomorrow) Also Tues afternoon arrival time 12-1 and Thursday AM and Sat AM Thursday just booked so we have tomorrow AM, Tues afternoon or Sat AM."


In [60]:
display_info('will get back to you', 'is_considering', 'customers')

['will get back to you', '63%', '8%', 19]

## Follow-up

In [61]:
df[(df.is_follow_up==1) & (df.is_generic == 1)].shape

(0, 24)

In [62]:
df[(df.is_follow_up==1)].shape

(734, 24)

In [63]:
# checking that only customers are considering
df[(df.is_follow_up==1) & (df.message_sender == 'Pro')].shape

(618, 24)

In [64]:
df[(df.is_follow_up==1)]['pii_cleaned_message']

17       Are you here                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

## Followup Analysis

('85%', '31%')

- time might be a better indicator
- higher precision but lots of cases that are not strictlyl followup 

### Low Recall / (Non-covered examples)

Are you here 
-> not really specific to followup

40       Hi [PERSON_NAME]/[PERSON_NAME],\n\nCan someone please call me? Your phone number does not have voicemail and I would like to schedule service again. We are supposed to be on your bimonthly plan. \n\nThanks,\n[PERSON_NAME] \n[PHONE_NUMBER]  
-> schedule again

120      I am back in the USA now. Next step is to  chat on the phone to hear a bit more from you and get to know each other a little. 
-> can be similar to scheduling

134      I know it has been a while since I was looking at getting a resume done. Are you still in the business? I would like to talk to you if so.\nThanks,\n[PERSON_NAME]  
- "it has been a while", "will still like to talk to you"

#### Scheduling
442      Hi [PERSON_NAME],\n\nDo you have any other questions or concerns regarding our quote?\n\n[PERSON_NAME],   

443      Hi [PERSON_NAME],\n\nIs there a good time to schedule a chat?\n\n[PERSON_NAME], 

445      [PERSON_NAME],\n\nHow was your Memorial Day weekend?\n\nIs there a good time to schedule a call regarding your wedding day services?\n\n[PERSON_NAME],  

450      Happy Friday [PERSON_NAME],\n\nDo you have questions or concerns? We look forward to speaking with you regarding your wedding in August.\n\nCheers, 

658      the team hasn't arrived yet

659      do you know what their status is?                            

689      Hey [PERSON_NAME] how are you doing

### Low Precision (False Positives)

- "following up" is common, but it also has a lot of false positives in majority vote


In [65]:
df[(df.is_follow_up==1)]

Unnamed: 0,bid_id,bid_id_header,message_timestamp,message_sender,pii_cleaned_message,Note,category,agent1_labels,agent2_labels,agent3_labels,...,is_scheduling,is_price,is_payment,is_generic,is_considering,is_follow_up,is_rejection,is_details,is_contact,is_meeting
17,37749231,,2017-09-13 15:32:42.731062 UTC,Customer,Are you here,,House Cleaning,"{null_value, Follow-up}","{null_value, Follow-up}","{null_value, Follow-up}",...,0,0,0,0,0,1,0,0,0,0
40,43225206,,2017-08-18 16:05:18.639235 UTC,Customer,"Hi [PERSON_NAME]/[PERSON_NAME],\n\nCan someone please call me? Your phone number does not have voicemail and I would like to schedule service again. We are supposed to be on your bimonthly plan. \n\nThanks,\n[PERSON_NAME] \n[PHONE_NUMBER]",,Pest Control Services,"{Contact Information, null_value, Scheduling - Meeting or Job, Confirmation - Contact, Follow-up}","{Contact Information, null_value, Scheduling - Meeting or Job, Confirmation - Contact, Follow-up}","{Contact Information, null_value, Scheduling - Meeting or Job, Confirmation - Contact, Follow-up}",...,1,0,0,0,0,1,0,0,1,0
120,54443807,,2017-06-02 16:07:53.254243 UTC,Pro,I am back in the USA now. Next step is to chat on the phone to hear a bit more from you and get to know each other a little.,,Wedding and Event Makeup,"{null_value, Contact Information}","{null_value, Job Details, Follow-up}","{null_value, Job Details, Follow-up}",...,0,0,0,0,0,1,0,1,0,0
134,55138226,,2018-02-27 17:03:36.695679 UTC,Customer,"I know it has been a while since I was looking at getting a resume done. Are you still in the business? I would like to talk to you if so.\nThanks,\n[PERSON_NAME]",,Resume Writing,"{null_value, Scheduling - Meeting or Job}","{null_value, Follow-up}","{null_value, Job Details, Follow-up}",...,0,0,0,0,0,1,0,0,0,0
165,56770358,,2017-05-07 15:17:36.355296 UTC,Pro,[PERSON_NAME]!\nJust sending a follow up message to see if you have any other questions for us?,,Wedding and Event Makeup,"{null_value, Follow-up}","{null_value, Follow-up}","{null_value, Follow-up}",...,0,0,0,0,0,1,0,0,0,0
183,57273156,57273156.0,2017-05-01 01:15:32.881611 UTC,Pro,[PERSON_NAME] haven't heard back from you would you like me to put your pallets house on our schedule? I emailed you the contract and spokes you briefly haven't heard back from you. Please touch base with me,,Roof Repair or Maintenance,"{null_value, Scheduling - Meeting or Job, Confirmation - Contact, Job Details, Follow-up}","{null_value, Confirmation - Contact, Follow-up}","{null_value, Confirmation - Contact, Follow-up}",...,0,0,0,0,0,1,0,0,1,0
192,57337527,,2017-04-27 07:14:37.531335 UTC,Pro,Good morning jus checking to see if u found someone to fix ur toilet,,Toilet Installation or Replacement,"{null_value, Job Details, Follow-up}","{null_value, Job Details, Follow-up}","{null_value, Job Details, Follow-up}",...,0,0,0,0,0,1,0,1,0,0
200,57562369,,2017-05-01 02:23:17.495089 UTC,Pro,"Hi [PERSON_NAME], Thank you for viewing my quote here at [PERSON_NAME][LOCATION] we take great care of our customer properties. We treat every property as if it was our own, delicately and with care. Let us bring your home back to life with a great wash.",,Pressure Washing,"{null_value, Location, Job Details}","{null_value, Price, Follow-up, Location}","{null_value, Price, Follow-up, Location}",...,0,1,0,0,0,1,0,0,0,0
216,57565517,,2017-05-09 12:02:52.736607 UTC,Pro,"Hi Faith...Hope you're well:)\n\nI'm circling back around with you on the June 24th Crab Boil, last communication you were going to make a decision after discussing with your husband. Did you two decide on the direction you are going in?\n\nThank You,\n[PERSON_NAME]",,Wedding and Event Catering,"{null_value, Job Details, Follow-up}","{null_value, Scheduling - Meeting or Job, Job Details, Follow-up}","{null_value, Scheduling - Meeting or Job, Job Details, Follow-up}",...,1,0,0,0,0,1,0,1,0,0
292,57752819,,2018-03-10 16:06:15.395479 UTC,Customer,"Hey I used you to train my dog however we have never finished with his training , We had three trainings and not four. I like to get him fully trained and use you or find someone else",,Dog Training,"{null_value, Job Details, Hire}","{null_value, Job Details, Follow-up}","{null_value, Job Details, Follow-up}",...,0,0,0,0,0,1,0,1,0,0


In [66]:
get_info(df[df.is_follow_up == 1].message.str.lower().str.cat(sep=' '), 200)

Unigram frequencies:
             Frequency
Word                  
you          1359     
to           1026     
i            727      
a            501      
if           456      
and          431      
the          431      
your         364      
for          336      
have         327      
are          298      
in           276      
we           272      
hi           264      
with         261      
me           257      
still        243      
just         225      
can          218      
know         202      
of           173      
see          169      
would        163      
is           162      
up           161      
any          161      
on           160      
let          160      
my           154      
wanted       149      
please       145      
at           133      
or           132      
our          129      
this         123      
do           118      
be           113      
hello        112      
thanks       107      
questions    103      
so           

In [67]:
# TO DO: Tina to investigate more word/phrases here. Put everything you tried in here.
# Hint: use the investigation examples to get inspirations 

follow_up_keywords = \
['wanted to follow up', 'wanted to check in', 'if you are still', 'you to see if',
 'follow up with you', 'are you still interested', 'still interested', 'just wanted to check',
 'feel free to contact', 'if you were still', 'were still interested in',
 'hope all is well', 'you still looking to', 'still looking to hire',
 '[(haven\'t)|(hadn\'t)] heard back', 'follow up', 'are still interested', 'i wanted to check',
 'wanted to reach out', 'if you\'re still', 'checking to see', 'just checking in',
 'following up', 'still need anything', 'wanted to touch base', 'just wanted to',
 'openings available', 'hear back from you', '(just).*(follow).*(up)',
 '(want).*(follow).*(up)', 'are following up', 'am following up',
 '(have you decided).*(yet)', 'have you decided', 'checking in',
 '(want).*(to).*(verify)', '(haven\'t heard).*(from you)', 'still need anything',
 'following up to see', 'did you decide', 'wanted to reach out to see',
 'how are things going', 'still need', 'are you still looking', 'are you still available',
 'it has been a while', 'i would still like to talk']
 
follow_up_terms = pd.DataFrame(columns=('Term', 'Precision', 'Recall', 'Count'))
for i, term in enumerate(follow_up_keywords):
    follow_up_terms.loc[i] = display_info(term, 'is_follow_up', 'all')
    
follow_up_terms.sort_values(by=['Precision', 'Recall'], ascending=False)

  from ipykernel import kernelapp as app


Unnamed: 0,Term,Precision,Recall,Count
46,i would still like to talk,nan%,0%,0
0,wanted to follow up,94%,6%,47
1,wanted to check in,93%,4%,26
36,(haven't heard).*(from you),92%,2%,12
5,are you still interested,91%,3%,21
17,i wanted to check,91%,1%,10
3,you to see if,89%,3%,24
14,[(haven't)|(hadn't)] heard back,88%,2%,14
26,openings available,88%,1%,7
9,if you were still,87%,3%,20


### Followup Regex

In [68]:
follow_up_regex_words = \
['were still interested in', 'still need anything', 'wanted to follow up',
 'wanted to check in', 'are you still interested', 'haven\'t heard back',
 'i wanted to check', 'you to see if', 'openings available', 'if you were still',
 'just checking in', 'follow up with you', 'wanted to reach out', 'checking to see',
 '(just).*(follow).*(up)', 'have you decided', '(want).*(to).*(verify)',
 'haven\'t heard from you', 'following up to see', 'did you decide', 'wanted to reach out to see',
 '[(haven\'t)|(hadn\'t)] heard back]']

display_info_multiple(follow_up_regex_words, 'is_follow_up')

["^.*(were still interested in|still need anything|wanted to follow up|wanted to check in|are you still interested|haven't heard back|i wanted to check|you to see if|openings available|if you were still|just checking in|follow up with you|wanted to reach out|checking to see|(just).*(follow).*(up)|have you decided|(want).*(to).*(verify)|haven't heard from you|following up to see|did you decide|wanted to reach out to see|[(haven't)|(hadn't)] heard back]).*$",
 '85%',
 '31%',
 229]