In [1]:
# python 3
import pandas as pd
import nltk
import re

from nltk.corpus import stopwords
from nltk import ngrams

pd.set_option('display.max_colwidth', -1)
pd.set_option("display.max_rows",3000)

In [2]:
# get majority labels and all labels

def clean_label(df_agent1, df_agent2, df_agent3):
    label_column = df_agent1.filter(regex='Label').columns
    label1, label2, label3, label4, label5 = label_column[0], \
                                             label_column[1], \
                                             label_column[2], \
                                             label_column[3], \
                                             label_column[4]
    
    df_agent1[label_column] = df_agent1[label_column].fillna('null_value')
    df_agent2[label_column] = df_agent2[label_column].fillna('null_value')
    df_agent3[label_column] = df_agent3[label_column].fillna('null_value')
    
    df_agent1['agent1_labels'] = df_agent1.apply(lambda x: set([x[label1],
                                 x[label2],
                                 x[label3],
                                 x[label4],                         
                                 x[label5]]),axis=1)
    
    df_agent1['agent2_labels'] = df_agent2.apply(lambda x: set([x[label1],
                                 x[label2],
                                 x[label3],
                                 x[label4],                         
                                 x[label5]]),axis=1)
    
    df_agent1['agent3_labels'] = df_agent3.apply(lambda x: set([x[label1],
                                 x[label2],
                                 x[label3],
                                 x[label4],                         
                                 x[label5]]),axis=1)
    
    df_agent1['majority_vote'] = df_agent1.apply(lambda x: x['agent1_labels'].intersection(x['agent2_labels'])\
                                         .union(x['agent2_labels'].intersection(x['agent3_labels']))\
                                         .union(x['agent1_labels'].intersection(x['agent3_labels']))\
                               , axis=1)
    
    df_agent1['union_vote'] = df_agent1.apply(lambda x: x['agent1_labels'].union(x['agent2_labels'])\
                                         .union(x['agent3_labels'])\
                               , axis=1)
    
    # trick to associate new variable with column method operations
    mv = df_agent1['majority_vote']
    mv.apply(lambda x: x.discard('null_value'));
    
    uv = df_agent1['union_vote']
    uv.apply(lambda x: x.discard('null_value'));
    return df_agent1.drop(label_column, axis=1)

In [3]:
# get data

one_ic = pd.read_csv('data/1-ic.csv')
one_gc = pd.read_csv('data/1-gc.csv')
one_sc = pd.read_csv('data/1-sc.csv')

two_jm = pd.read_csv('data/2-jm.csv')
two_mg = pd.read_csv('data/2-mg.csv')
two_nb = pd.read_csv('data/2-nb.csv')

three_rs = pd.read_csv('data/3-rs.csv')
three_rt = pd.read_csv('data/3-rt.csv')
three_sj = pd.read_csv('data/3-sj.csv')

tph_batch1 = pd.read_csv('data/tph_batch1.csv')

In [4]:
tph_batch1.rename({'sampled_bid_id': 'bid_id'}, axis='columns', inplace=True)

In [5]:
# clean labels

df1 = clean_label(one_ic, one_gc, one_sc)
df2 = clean_label(two_jm, two_mg, two_nb)
df3 = clean_label(three_rs, three_rt, three_sj)

In [6]:
# merge data

df = df1.append(df2).append(df3)

In [7]:
df = df.merge(tph_batch1[['bid_id','message_timestamp','message']], \
              how='left', on = ['bid_id','message_timestamp'])

In [8]:
#df['is_hire_majority'] = df.apply(lambda x: ('Hire' in x['majority_vote'])*1, axis=1)
#df['is_hire_any'] = df.apply(lambda x: ('Hire' in x['union_vote'])*1, axis=1)

In [9]:
one_gc['Label 1'].unique()

array(['Scheduling - Meeting or Job', 'Price', 'Location', 'Job Details',
       'Confirmation - Meeting', 'Contact Information', 'Payment',
       'null_value', 'Follow-up', 'Confirmation - Contact', 'Hire',
       'Rejection', 'Considering'], dtype=object)

In [10]:
df['is_contact_info'] = df.apply(lambda x: ('Contact Information' in x['majority_vote'])*1, axis=1)
df['is_scheduling'] = df.apply(lambda x: ('Scheduling - Meeting or Job' in x['majority_vote'])*1, axis=1)
df['is_price'] = df.apply(lambda x: ('Price' in x['majority_vote'])*1, axis=1)
df['is_payment'] = df.apply(lambda x: ('Payment' in x['majority_vote'])*1, axis=1)
df['is_generic'] = df.apply(lambda x: ('Generic Answer' in x['majority_vote'])*1, axis=1)
df['is_considering'] = df.apply(lambda x: ('Considering' in x['majority_vote'])*1, axis=1)
df['is_follow_up'] = df.apply(lambda x: ('Follow-up' in x['majority_vote'])*1, axis=1)
df['is_rejection'] = df.apply(lambda x: ('Rejection' in x['majority_vote'])*1, axis=1)
df['is_details'] = df.apply(lambda x: ('Job Details' in x['majority_vote'])*1, axis=1)
df['is_contact'] = df.apply(lambda x: ('Confirmation - Contact' in x['majority_vote'])*1, axis=1)
df['is_meeting'] = df.apply(lambda x: ('Confirmation - Meeting' in x['majority_vote'])*1, axis=1)

In [11]:
# helper function to dislay frequency of words from a blob of text 
def get_info(txt):
    words = nltk.tokenize.word_tokenize(txt)

    bigrams = nltk.bigrams(words)
    trigrams = nltk.trigrams(words)

    word_dist = nltk.FreqDist(words)
    top_N = 200
    custom_stopwords = set((u'.', u',', u'?', u'!', u')', u':', u'\'s', u'('))

    words_except_stop_dist = nltk.FreqDist(w for w in words if w not in custom_stopwords) 

    print('Unigram frequencies:')
    rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')
    print(rslt)
    print('=' * 60)

    # Bigram Frequencies
    print('Bigram frequencies:')
    bigrams_freq = nltk.FreqDist(bigrams)
    rslt = pd.DataFrame(bigrams_freq.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')

    print(rslt)
    print('=' * 60)

    # Trigram Frequencies
    print('Trigram frequencies:')
    trigrams_freq = nltk.FreqDist(trigrams)
    rslt = pd.DataFrame(trigrams_freq.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')

    print(rslt)
    print('=' * 60)


In [12]:
# helper function to dislay precision and recall information from a term and a specific column we evaluate against 

def display_info(term, col):
    pat1 = re.compile("^.*"+term+".*$" , flags = re.DOTALL) # equivalent to str.contains # re.DOTALL applies REGEX to muliple line
    predicted = df['message'].str.lower().str.match(pat1).astype(int)
    precision = df.loc[predicted[predicted==1].index, col].sum()/\
            df.loc[predicted[predicted==1].index, col].count()

    recall = df.loc[predicted[predicted==1].index, col].sum()/\
            df[col].sum()    
        
    false_positive_cases = df.loc[(df['message'].str.lower().str.match(pat1)) \
                              & (df.is_scheduling == 0)]['message']
    
    num_terms = df.loc[predicted[predicted==1].index, col].sum()
    return [term, 
            "{0:.0f}%".format(precision*100), 
            "{0:.0f}%".format(recall*100), 
            num_terms]

In [13]:
# helper function to dislay precision and recall information from a group of terms and a specific column we evaluate against 

def display_info_multiple(key_words, col):
    regex_pattern = "^.*("
    for i,w in enumerate(key_words):
        if i != len(key_words) -1:
            regex_pattern += w +'|'
        else:
            regex_pattern += w 

    regex_pattern += ").*$"
    
    pat1 = re.compile(regex_pattern , flags = re.DOTALL) # equivalent to str.contains
    predicted = df['message'].str.lower().str.match(pat1).astype(int)
    precision = df.loc[predicted[predicted==1].index, col].sum()/\
            df.loc[predicted[predicted==1].index, col].count()

    recall = df.loc[predicted[predicted==1].index, col].sum()/\
            df[col].sum()    
        
    false_positive_cases = df.loc[(df['message'].str.lower().str.match(pat1)) \
                              & (df.is_scheduling == 0)]['message']
    
    num_terms = df.loc[predicted[predicted==1].index, col].sum()
    return [regex_pattern, 
            "{0:.0f}%".format(precision*100), 
            "{0:.0f}%".format(recall*100), 
            num_terms
           ]

In [14]:
# test function
display_info('available', 'is_scheduling')

['available', '91%', '13%', 671]

In [15]:
# test function
display_info_multiple(['available','weekday'], 'is_scheduling')

['^.*(available|weekday).*$', '91%', '13%', 693]

## Scheduling

In [16]:
df[(df.is_scheduling==1) & (df.is_generic == 1)].shape

(88, 24)

In [17]:
df[(df.is_scheduling==1) & (df.is_generic == 0)].shape

(5094, 24)

Remove generic cases because we can't predict them...

In [18]:
df[(df.is_scheduling==1)].shape

(5182, 24)

In [19]:
get_info(df[df.is_scheduling == 1].message.str.lower().str.cat(sep=' '))

Unigram frequencies:
              Frequency
Word                   
you           5457     
i             5224     
to            3996     
the           3374     
and           2593     
a             2449     
for           2435     
can           1780     
we            1682     
is            1577     
be            1528     
have          1386     
me            1352     
if            1307     
at            1283     
that          1248     
in            1234     
would         1232     
will          1209     
on            1187     
your          1172     
do            1171     
it            990      
my            989      
are           980      
time          977      
of            942      
or            905      
this          861      
tomorrow      853      
with          776      
hi            760      
so            740      
work          729      
available     712      
am            702      
call          693      
what          691      
know          686  

                        Frequency
Word                             
(let, me, know)         424      
(., thank, you)         217      
(for, you, ?)           188      
(., i, 'm)              159      
(would, like, to)       153      
(., i, will)            144      
(give, you, a)          144      
(work, for, you)        141      
(are, you, available)   134      
(i, will, be)           134      
(., let, me)            132      
(., i, am)              132      
(be, able, to)          127      
(do, you, have)         125      
(me, know, if)          122      
(thank, you, for)       121      
(for, you, .)           119      
(., if, you)            116      
(please, let, me)       112      
(., i, can)             111      
(works, for, you)       106      
(you, a, call)          105      
(would, you, like)      104      
(., i, have)            99       
(give, me, a)           91       
(i, have, a)            91       
(., do, you)            90       
(i, can, do)  

In [20]:
# TO DO: Tina to investigate more word/phrases here. Put everything you tried in here.
# TO DO: add month, date, time pattern (e.g. "digit am" etc)
# things to try:
# - month, date, time pattern

schedule_keywords = \
["book", "appointment","schedule","what time",'available', 'availability',\
 'works for you','morning','afternoon','evening', 'tomorrow', 'next week',\
 'this morning','this afternoon','this evening',
 'tomorrow morning', 'tomorrow afternoon', 'tomorrow evening',
 'monday','tuesday','wednesday','thursday','friday','saturday','sunday','weekday','weekend',\
 'me know when','a call', 'give you a', 'works for you', 'work for you', 'forward to hearing',\
 'time for you',
 'today',' am ', ' pm ', 'can you make',\
 's your next availability', 'when i can call you', \
 'are you available on my date', 'when can i give you a call', ' give you a call', \
 'see you at', 'would like to book', 'book you',
'7am','8am', '9am', '10am', '11am', '12pm', '1pm', '2pm', 
'3pm', '4pm', '5pm', '6pm', '7pm', '8pm', '9pm', '10pm',
'7 am','8 am', '9 am', '10 am', '11 am', '12 pm', '1 pm', '2 pm', 
'3 pm', '4 pm', '5 pm', '6 pm', '7 pm', '8 pm', '9 pm', '10 pm',
'at 7', 'at 8', 'at 9', 'at 10', 'at 11', 'at 12', 
'at 2', 'at 3', 'at 4', 'at 5', 'at 6', 
'@7', '@8', '@9', '@10', '@11', '@12', 
'@2', '@3', '@4', '@5', '@6', 
'7 :', '8 :', '9 :', '10 :', '11 :', '12 :', 
'1 :', '2 :', '3 :', '4 :', '5 :', '6 :',
'1:', '2:', '3:', '4:', '5:', '6:', '7:', 
'8:', '9:', '10:', '11:', '12:', 
'13:', '14:', '15:', '16:', '17:',
'18:', '19:', '20:', '21:', '22:',
'monday','tuesday','wednesday','thursday','friday','saturday','sunday',
' mon ', ' tues ', ' wed ', ' thurs ', ' fri ', ' sat ', ' sun ',
'january', 'february', 'march', 'april', 'june', 
'july', 'august', 'september', 'october', 'november', 'december',
' jan ', ' feb ', ' mar ', ' apr ', ' jul ', ' aug ', ' sept ',
' oct ', ' nov ', ' dec ']

In [21]:
scheduling_terms = pd.DataFrame(columns=('Term', 'Precision', 'Recall', 'Count'))
for i, term in enumerate(schedule_keywords):
    scheduling_terms.loc[i] = display_info(term, 'is_scheduling')

  import sys


### Create a function that 
- gets all the unigrams, bigrams, trigrams and fourgrams from scheduling and put that into scheduling terms

In [22]:
scheduling_terms.sort_values(by=['Precision', 'Recall'], ascending=False)

Unnamed: 0,Term,Precision,Recall,Count
39,when i can call you,nan%,0%,0
40,are you available on my date,nan%,0%,0
93,@11,nan%,0%,0
94,@12,nan%,0%,0
96,@3,nan%,0%,0
99,@6,nan%,0%,0
100,7 :,nan%,0%,0
101,8 :,nan%,0%,0
104,11 :,nan%,0%,0
105,12 :,nan%,0%,0


In [23]:
# TO DO: add month, date, time pattern (e.g. "digit am" etc)

### Scheduling Regex

In [24]:
# TO DO: Tina to finalize the set of words to use by picking from the scheduling_terms list

scheduling_regex_words = ['avail', 
                          'weekday', "weekend work", 'what time', 
                          'your address', 'would you like', 'what\'s the best',
                          'work for you','works for you', 'what works', 
                          'schedul','tomorrow',
                          'can you make', 'give you a call', 
                          'when i can call you', 'an appointment',
                          'reschedule', 'would you like', 'to set up', 'this week', 'next week',
                          'time for you',
                          'can you make', 'me know when', 'how about',
                          'give you a call', 'good for you', 'a good time',
                          'best for you', 'see u', 'your phone', 'see you at',
                          '([6-9]|[1][0-1])((am)|(a.m)|( a.m))',
                          '([1-9]|[1][0-2])((pm)|(p.m)|( p.m))',
                          '([7-9]|[1][0-1])( am)',
                          '([1-9]|[10])( pm)',
                          '(at )([2-9]|[1][0-2])',
                          '([1-9]|[1-2][0-2])(( :)|(:))',
                          'monday','tuesday','wednesday','thursday','friday','saturday','sunday',
                          ' mon ', ' tues ', ' wed ', ' thurs ', ' fri ', ' sat ', ' sun ',
                          'january', 'february', 'march', 'april', 'june', 
                          'july', 'august', 'september', 'october', 'november', 'december',
                          ' jan ', ' feb ', ' mar ', ' apr ', ' jul ', ' aug ', ' sept ',
                          ' oct ', ' nov ', ' dec ']

display_info_multiple(scheduling_regex_words, 'is_scheduling')

["^.*(avail|weekday|weekend work|what time|your address|would you like|what's the best|work for you|works for you|what works|schedul|tomorrow|can you make|give you a call|when i can call you|an appointment|reschedule|would you like|to set up|this week|next week|time for you|can you make|me know when|how about|give you a call|good for you|a good time|best for you|see u|your phone|see you at|([6-9]|[1][0-1])((am)|(a.m)|( a.m))|([1-9]|[1][0-2])((pm)|(p.m)|( p.m))|([7-9]|[1][0-1])( am)|([1-9]|[10])( pm)|(at )([2-9]|[1][0-2])|([1-9]|[1-2][0-2])(( :)|(:))|monday|tuesday|wednesday|thursday|friday|saturday|sunday| mon | tues | wed | thurs | fri | sat | sun |january|february|march|april|june|july|august|september|october|november|december| jan | feb | mar | apr | jul | aug | sept | oct | nov | dec ).*$",
 '80%',
 '77%',
 3979]

Currently the best is 83%, 71%.

In [25]:
display_info_multiple(scheduling_regex_words, 'is_scheduling')

["^.*(avail|weekday|weekend work|what time|your address|would you like|what's the best|work for you|works for you|what works|schedul|tomorrow|can you make|give you a call|when i can call you|an appointment|reschedule|would you like|to set up|this week|next week|time for you|can you make|me know when|how about|give you a call|good for you|a good time|best for you|see u|your phone|see you at|([6-9]|[1][0-1])((am)|(a.m)|( a.m))|([1-9]|[1][0-2])((pm)|(p.m)|( p.m))|([7-9]|[1][0-1])( am)|([1-9]|[10])( pm)|(at )([2-9]|[1][0-2])|([1-9]|[1-2][0-2])(( :)|(:))|monday|tuesday|wednesday|thursday|friday|saturday|sunday| mon | tues | wed | thurs | fri | sat | sun |january|february|march|april|june|july|august|september|october|november|december| jan | feb | mar | apr | jul | aug | sept | oct | nov | dec ).*$",
 '80%',
 '77%',
 3979]

In [26]:
# helper function to put all the unigrams, bigrams, and trigrams into a list
def get_info(txt):
    words = nltk.tokenize.word_tokenize(txt)
    bigrams = nltk.bigrams(words)
    trigrams = nltk.trigrams(words)
    word_dist = nltk.FreqDist(words)
    top_N = 200
    custom_stopwords = set((u'.', u',', u'?', u'!', u')', u':', u'\'s', u'('))

    words_except_stop_dist = nltk.FreqDist(w for w in words if w not in custom_stopwords)
    
    rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')

    bigrams_freq = nltk.FreqDist(bigrams)
    
    trigrams_freq = nltk.FreqDist(trigrams)
    rslt = pd.DataFrame(trigrams_freq.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')

In [27]:
def create_lst_to_check(topN, txt):
    lst_words_to_check = []
    for word in unigrams_freq.most_common(topN):
        lst_words_to_check.append(word[0])
    for word in bigrams_freq.most_common(topN):
        lst_words_to_check.append(word[0][0] + ' ' + word[0][1])
    for word in trigrams_freq.most_common(topN):
        lst_words_to_check.append (word[0][0] + ' ' + word[0][1] + ' ' + word[0][2])
    return lst_words_to_check

increases recall but decreases precision
- appointment
- morning, afternoon, evening (figure out where it's used)

need to get to 90% precision and 80% recall.
- at least 80% on both.

https://www.oreilly.com/library/view/regular-expressions-cookbook/9781449327453/ch06s07.html

In [28]:
### Current Status for Scheduling
# - Precision = 81%
# - Recall = 73%

payment_regex_words = ['the deposit','the contract and','do you accept',
                       'you can pay', 'check or cash', 'do i pay',
                       'debit', 'do you accept credit', 'do you take cash',
                       'debit', 'credit card', 'the check', 'paypal', 'venmo', 'a deposit',
                       'cash', 'payment', 'invoice'
                    ]

In [29]:
# Example of further investigation
# the left over cases, i.e. 1 - recall cases
pat1 = '^.*(availability|available|weekday|what time|weekday|work for you|works for you|schedul|tomorrow|afternoon|monday|tuesday|wednesday|thursday|friday|saturday|sunday|can you make|when can i give you a call).*$'
df[(df.is_scheduling == 1) & (~df['message'].str.lower().str.match(pat1))].message.head(100)

0      Greetings..\nAre you available for resume writing?                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
3      April. Can likely do est this wk & Work maybe the next. let me know your phone # & address.  Tx. Wade                                      

In [30]:
## Example of further investigations
## investigate the regex related to word "book"
pat1 = re.compile('^.*(book).*$' , flags = re.DOTALL) # equivalent to str.contains
pat2 = re.compile('^.*(availability|available|weekday|what time|weekday|work for you|works for you|schedul|tomorrow|afternoonmonday|tuesday|wednesday|thursday|friday|saturday|sunday).*$' , flags = re.DOTALL) # equivalent to str.contains

# df[(df.is_scheduling == 1) & (df['message'].str.lower().str.match(pat1))\
#   & (~df['message'].str.lower().str.match(pat2))].message.head(100)

get_info(df[(df.is_scheduling == 1) & (df['message'].str.lower().str.match(pat1))\
   & (~df['message'].str.lower().str.match(pat2))].message.str.lower().str.cat(sep=' '))

## Payment

In [31]:
df[(df.is_payment==1) & (df.is_generic == 1)].shape

(8, 24)

In [32]:
df[(df.is_payment==1)].shape

(437, 24)

In [33]:
get_info(df[df.is_payment == 1].message.str.lower().str.cat(sep=' '))

In [34]:
# TO DO: Tina to investigate more word/phrases here. Put everything you tried in here.
# Hint: use the investigation examples to get inspirations 

payment_keywords = \
["pay", "payment", "credit card", "cash", "check", "debit", "invoice", "payment", "deposit", "paypal", \
 "the deposit", "send you", "the contract", "a deposit", "the check", \
 "do you accept", "i will need", "you can pay", "will send you",\
 "send me your", "can send you", "a check", "check or cash",\
 "do i pay", "send you an", "send you the", "deposit to hold the",\
 "do you accept credit", "do you take cash", "check or credit card", "payment do you accept", \
 "at your earliest convenience", "hold the date", "send you an invoice", "& quote"]


In [35]:
payment_terms = pd.DataFrame(columns=('Term', 'Precision', 'Recall', 'Count'))
for i, term in enumerate(payment_keywords):
    payment_terms.loc[i] = display_info(term, 'is_payment')


  import sys


In [36]:
payment_terms.sort_values(by=['Precision', 'Recall'], ascending=False)

Unnamed: 0,Term,Precision,Recall,Count
34,& quote,nan%,0%,0
2,credit card,95%,10%,42
9,paypal,91%,7%,31
14,the check,91%,5%,20
23,do i pay,90%,2%,9
21,a check,89%,9%,41
3,cash,87%,22%,97
10,the deposit,85%,7%,29
1,payment,82%,24%,104
7,payment,82%,24%,104


In [37]:
# TO DO: Tina to finalize the set of words to use by picking from the payment_terms list

payment_regex_words = ['debit','credit card','the check','paypal', 'venmo',\
                        'the deposit', 'do i pay', 'a deposit'
                    ]

In [38]:
# TO DO: Tina to investigate more word/phrases here. Put everything you tried in here.
# Hint: use the investigation examples to get inspirations 

payment_keywords = \
["pay", "payment", "credit card", "cash", "check", "debit", "invoice", "payment", "deposit", "paypal", \
 "the deposit", "send you", "the contract", "a deposit", "the check", \
 "do you accept", "i will need", "you can pay", "will send you",\
 "send me your", "can send you", "a check", "check or cash",\
 "do i pay", "send you an", "send you the", "deposit to hold the",\
 "do you accept credit", "do you take cash", "check or credit card", "payment do you accept", \
 "at your earliest convenience", "hold the date", "send you an invoice"]

# cannot imrpove recall without decreasing precision

### Payment Regex

In [39]:
payment_regex_words = ['a check', 'the check', "cheque",
                       'debit card', 'check or cash', 'cash',
                       'credit card', 'paypal', 'venmo', 
                       'deposit', 'the contract and',
                       'you can pay', 'do i pay', 'can i pay', 'should i pay',
                       'do we pay', 'can we pay', 'should we pay',
                       'pay me', 'pay you', 'pay us',
                       'do you take', 'do you accept',
                        'payment', 'invoice']

display_info_multiple(payment_regex_words, 'is_payment')

['^.*(a check|the check|cheque|debit card|check or cash|cash|credit card|paypal|venmo|deposit|the contract and|you can pay|do i pay|can i pay|should i pay|do we pay|can we pay|should we pay|pay me|pay you|pay us|do you take|do you accept|payment|invoice).*$',
 '81%',
 '84%',
 368]

In [40]:
# Current Status
# - Precision = 81%
# - Recall = 84%

payment_regex_words = ['a check', 'the check', "cheque",
                       'debit card', 'check or cash', 'cash',
                       'credit card', 'paypal', 'venmo', 
                       'deposit', 'the contract and','do you accept',
                       'you can pay', 'do i pay', 'can i pay', 'should i pay',
                       'pay me', 'pay you', 'pay us',
                       'do you take', 'do you accept',
                        'payment', 'invoice', 
                        '& quote'
                    ]

In [41]:
display_info_multiple(payment_regex_words, 'is_payment')

['^.*(a check|the check|cheque|debit card|check or cash|cash|credit card|paypal|venmo|deposit|the contract and|do you accept|you can pay|do i pay|can i pay|should i pay|pay me|pay you|pay us|do you take|do you accept|payment|invoice|& quote).*$',
 '81%',
 '84%',
 366]

### Other Categories
-  Considering, Follow-up, Rejection

## Rejection

In [42]:
df[(df.is_rejection==1) & (df.is_generic == 1)].shape

(1, 24)

In [43]:
df[(df.is_rejection==1)].shape

(321, 24)

In [44]:
get_info(df[df.is_rejection == 1].message.str.lower().str.cat(sep=' '))

In [45]:
# TO DO: Tina to investigate more word/phrases here. Put everything you tried in here.
# Hint: use the investigation examples to get inspirations 

rejection_keywords = \
["pay", "payment", "credit card", "cash", "check", "debit", "invoice", "payment", "deposit", "paypal", \
 "the deposit", "send you", "the contract", "a deposit", "the check", \
 "do you accept", "i will need", "you can pay", "will send you",\
 "send me your", "can send you", "a check", "check or cash",\
 "do i pay", "send you an", "send you the", "deposit to hold the",\
 "do you accept credit", "do you take cash", "check or credit card", "payment do you accept", \
 "at your earliest convenience", "hold the date", "send you an invoice", "& quote"]


In [46]:
payment_terms = pd.DataFrame(columns=('Term', 'Precision', 'Recall', 'Count'))
for i, term in enumerate(payment_keywords):
    payment_terms.loc[i] = display_info(term, 'is_payment')

  import sys


In [47]:
payment_terms.sort_values(by=['Precision', 'Recall'], ascending=False)

Unnamed: 0,Term,Precision,Recall,Count
34,& quote,nan%,0%,0
2,credit card,95%,10%,42
9,paypal,91%,7%,31
14,the check,91%,5%,20
23,do i pay,90%,2%,9
21,a check,89%,9%,41
3,cash,87%,22%,97
10,the deposit,85%,7%,29
1,payment,82%,24%,104
7,payment,82%,24%,104


In [48]:
# TO DO: Tina to finalize the set of words to use by picking from the payment_terms list

payment_regex_words = ['debit','credit card','the check','paypal', 'venmo',\
                        'the deposit', 'do i pay', 'a deposit'
                    ]

In [49]:
# TO DO: Tina to investigate more word/phrases here. Put everything you tried in here.
# Hint: use the investigation examples to get inspirations 

payment_keywords = \
["pay", "payment", "credit card", "cash", "check", "debit", "invoice", "payment", "deposit", "paypal", \
 "the deposit", "send you", "the contract", "a deposit", "the check", \
 "do you accept", "i will need", "you can pay", "will send you",\
 "send me your", "can send you", "a check", "check or cash",\
 "do i pay", "send you an", "send you the", "deposit to hold the",\
 "do you accept credit", "do you take cash", "check or credit card", "payment do you accept", \
 "at your earliest convenience", "hold the date", "send you an invoice"]

# cannot imrpove recall without decreasing precision

### Payment Regex

In [50]:
payment_regex_words = ['a check', 'the check', "cheque",
                       'debit card', 'check or cash', 'cash',
                       'credit card', 'paypal', 'venmo', 
                       'deposit', 'the contract and',
                       'you can pay', 'do i pay', 'can i pay', 'should i pay',
                       'do we pay', 'can we pay', 'should we pay',
                       'pay me', 'pay you', 'pay us',
                       'do you take', 'do you accept',
                        'payment', 'invoice']

display_info_multiple(payment_regex_words, 'is_payment')

['^.*(a check|the check|cheque|debit card|check or cash|cash|credit card|paypal|venmo|deposit|the contract and|you can pay|do i pay|can i pay|should i pay|do we pay|can we pay|should we pay|pay me|pay you|pay us|do you take|do you accept|payment|invoice).*$',
 '81%',
 '84%',
 368]

In [51]:
# Current Status
# - Precision = 81%
# - Recall = 84%

payment_regex_words = ['a check', 'the check', "cheque",
                       'debit card', 'check or cash', 'cash',
                       'credit card', 'paypal', 'venmo', 
                       'deposit', 'the contract and','do you accept',
                       'you can pay', 'do i pay', 'can i pay', 'should i pay',
                       'pay me', 'pay you', 'pay us',
                       'do you take', 'do you accept',
                        'payment', 'invoice', 
                        '& quote'
                    ]

In [52]:
display_info_multiple(payment_regex_words, 'is_payment')

['^.*(a check|the check|cheque|debit card|check or cash|cash|credit card|paypal|venmo|deposit|the contract and|do you accept|you can pay|do i pay|can i pay|should i pay|pay me|pay you|pay us|do you take|do you accept|payment|invoice|& quote).*$',
 '81%',
 '84%',
 366]