## Importing Data

In [1]:
import pandas as pd
import sys
import codecs
import nltk
from nltk.corpus import stopwords
from nltk import ngrams

import matplotlib
import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)

In [2]:
# getting data

one_ic = pd.read_csv('../data/1-ic.csv')
one_gc = pd.read_csv('../data/1-gc.csv')
one_sc = pd.read_csv('../data/1-sc.csv')

two_jm = pd.read_csv('../data/2-jm.csv')
two_mg = pd.read_csv('../data/2-mg.csv')
two_nb = pd.read_csv('../data/2-nb.csv')

three_rs = pd.read_csv('../data/3-rs.csv')
three_rt = pd.read_csv('../data/3-rt.csv')
three_sj = pd.read_csv('../data/3-sj.csv')

In [3]:
tph_batch1 = pd.read_csv('../data/tph_batch1.csv')

In [4]:
tph_batch1.head()

Unnamed: 0,sampled_bid_id,message_timestamp,message_sender,message,category,agent_group
0,18314236,2018-01-31 23:16:50.788616 UTC,Customer,Hello Brian. You helped us a few years back and we are in need of an electrician again! Are you able to install a new electrical hook up for a new spa we have purchased? It requires a dedicated hook up for a 240v spa and will be outdoors. We are in Bay Park area just off I-5. Please let me know... Thanks!,Electrical and Wiring Repair,agent_3
1,27365009,2018-02-11 16:54:15.485935 UTC,Customer,Sounds good! What's next?,Nutritionist,agent_2
2,27365009,2018-02-11 22:20:38.481901 UTC,Pro,"Hi Cayatana, nice to &quotesee&quote you back! Since it's been two years since we last spoke, perhaps you can tell me if you are still looking for the same thing. How would you like to work together...remotely or in-person? What is your schedule like? Once we determine our first session, you would send payment before hand to my PayPal account before we proceed. I will also send you some forms to complete prior to our session, as well. Best, Amy",Nutritionist,agent_2
3,27365009,2018-02-13 20:02:41.064269 UTC,Customer,Hi! I’m currently traveling but will reach out as soon as I get back. Thanks!,Nutritionist,agent_2
4,27365009,2018-02-13 22:28:17.518399 UTC,Pro,"Sounds good and safe travels! Best, Amy",Nutritionist,agent_2


### One

In [5]:
one_gc_labels = one_gc[['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5']]
one_sc_labels = one_sc[['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5']]
one = one_ic.merge(one_gc_labels, left_index=True, right_index=True)
one = one.merge(one_sc_labels, left_index=True, right_index=True)

### Two

In [6]:
two_mg_labels = two_mg[['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5']]
two_nb_labels = two_nb[['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5']]
two = two_jm.merge(two_mg_labels, left_index=True, right_index=True)
two = two.merge(two_nb_labels, left_index=True, right_index=True)

### Three

In [7]:
three_rt_labels = three_rt[['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5']]
three_sj_labels = three_sj[['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5']]
three = three_rs.merge(three_rt_labels, left_index=True, right_index=True)
three = three.merge(three_sj_labels, left_index=True, right_index=True)

### Concatenate DFs

In [8]:
# all three dataframes have the same shape
print(three.shape)
print(one.shape)
print(two.shape)

(4999, 22)
(5040, 22)
(5003, 22)


In [9]:
# df is batch 1 data, but conversation level
labels = pd.concat([one, two, three], ignore_index=True)

In [10]:
labels['Label 1_x'].unique()

array(['Scheduling - Meeting or Job', 'Price', 'Location', 'Job Details',
       'Generic Answer', 'Contact Information', 'Confirmation - Meeting',
       nan, 'Follow-up', 'Confirmation - Contact', 'Considering', 'Hire',
       'Rejection', 'Payment'], dtype=object)

In [11]:
#tph_batch1.columns = ['bid_id']
tph_batch1.rename({'sampled_bid_id': 'bid_id'}, axis='columns', inplace=True)

### Hire

In [12]:
def is_hire(value):
    if value == "Hire":
        return 1
    else:
        return 0

In [13]:
hire = labels.applymap(is_hire)
hires = hire[["Label 1_x","Label 2_x", "Label 3_x", "Label 4_x", "Label 5_x",
              "Label 1_y","Label 2_y", "Label 3_y", "Label 4_y", "Label 5_y",
              'Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5']].any(axis=1)
hires = hires[hires == True]
labels['is_hired'] = hires
labels['is_hired'].fillna(False, inplace=True)
hired = labels.loc[hires.index]

In [14]:
labels['is_hired'].shape

(15042,)

### Location

- location, contact information, scheduling - meeting or job, price, payment

In [15]:
def is_location(value):
    if value == "Location":
        return 1
    else:
        return 0

In [16]:
location = labels.applymap(is_location)
locations = location[["Label 1_x","Label 2_x", "Label 3_x", "Label 4_x", "Label 5_x",
              "Label 1_y","Label 2_y", "Label 3_y", "Label 4_y", "Label 5_y",
              'Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5']].any(axis=1)
locations = locations[locations == True]
labels['is_location'] = locations
labels['is_location'].fillna(False, inplace=True)
is_location = labels.loc[locations.index]

In [10]:
labels['Label 1_x'].unique()

array(['Scheduling - Meeting or Job', 'Price', 'Location', 'Job Details',
       'Generic Answer', 'Contact Information', 'Confirmation - Meeting',
       nan, 'Follow-up', 'Confirmation - Contact', 'Considering', 'Hire',
       'Rejection', 'Payment'], dtype=object)

### Contact Information

- location, contact information, scheduling - meeting or job, price, payment

In [20]:
def is_contact_info(value):
    if value == "Contact Information":
        return 1
    else:
        return 0

In [21]:
contact_info = labels.applymap(is_contact_info)
contact_infos = contact_info[["Label 1_x","Label 2_x", "Label 3_x", "Label 4_x", "Label 5_x",
              "Label 1_y","Label 2_y", "Label 3_y", "Label 4_y", "Label 5_y",
              'Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5']].any(axis=1)
contact_infos = contact_infos[contact_infos == True]
labels['is_contact_info'] = contact_infos
labels['is_contact_info'].fillna(False, inplace=True)
is_contact_info = labels.loc[contact_infos.index]

### Scheduling

- price, payment

In [23]:
def is_scheduling(value):
    if value == "Scheduling - Meeting or Job":
        return 1
    else:
        return 0

In [24]:
scheduling = labels.applymap(is_scheduling)
schedulings = scheduling[["Label 1_x","Label 2_x", "Label 3_x", "Label 4_x", "Label 5_x",
              "Label 1_y","Label 2_y", "Label 3_y", "Label 4_y", "Label 5_y",
              'Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5']].any(axis=1)
schedulings = schedulings[schedulings == True]
labels['is_scheduling'] = schedulings
labels['is_scheduling'].fillna(False, inplace=True)
is_scheduling = labels.loc[schedulings.index]

### Price

In [37]:
def is_price(value):
    if value == "Price":
        return 1
    else:
        return 0

In [38]:
price = labels.applymap(is_price)
prices = price[["Label 1_x","Label 2_x", "Label 3_x", "Label 4_x", "Label 5_x",
              "Label 1_y","Label 2_y", "Label 3_y", "Label 4_y", "Label 5_y",
              'Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5']].any(axis=1)
prices = prices[prices == True]
labels['is_price'] = prices
labels['is_price'].fillna(False, inplace=True)
is_price = labels.loc[prices.index]

### Payment

In [39]:
def is_payment(value):
    if value == "Payment":
        return 1
    else:
        return 0

In [40]:
payment = labels.applymap(is_payment)
payments = payment[["Label 1_x","Label 2_x", "Label 3_x", "Label 4_x", "Label 5_x",
              "Label 1_y","Label 2_y", "Label 3_y", "Label 4_y", "Label 5_y",
              'Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5']].any(axis=1)
payments = payments[payments == True]
labels['is_payment'] = payments
labels['is_payment'].fillna(False, inplace=True)
is_payment = labels.loc[payments.index]

In [41]:
labels.head()

Unnamed: 0,bid_id,bid_id_header,message_timestamp,message_sender,pii_cleaned_message,Label 1_x,Label 2_x,Label 3_x,Label 4_x,Label 5_x,...,Label 2,Label 3,Label 4,Label 5,is_hired,is_location,is_contact_info,is_scheduling,is_price,is_payment
0,29404730,29404730.0,2017-11-04 01:24:06.036065 UTC,Customer,Greetings..\nAre you available for resume writing?,Scheduling - Meeting or Job,,,,,...,,,,,False,False,False,True,False,False
1,30265067,30265067.0,2018-04-03 17:58:48.35945 UTC,Customer,What's your next availability?,Scheduling - Meeting or Job,,,,,...,,,,,False,False,False,True,False,False
2,30265067,,2018-04-03 17:59:45.382072 UTC,Customer,I would like to get an estimate for siding repair,Price,Job Details,,,,...,Job Details,,,,False,False,False,False,True,False
3,30265067,,2018-04-03 18:29:00.359911 UTC,Pro,April. Can likely do est this wk & Work maybe the next. let me know your phone # & address. [LOCATION]. [PERSON_NAME],Scheduling - Meeting or Job,Contact Information,Location,Price,,...,Contact Information,Location,Price,,False,True,True,True,True,False
4,30265067,,2018-04-03 18:56:44.639868 UTC,Customer,[LOCATION] [PHONE_NUMBER],Location,Contact Information,,,,...,Contact Information,,,,False,True,True,False,False,False


### Locations

In [42]:
is_location[is_location.message_sender == 'Customer'][['message_sender', 'pii_cleaned_message', 'category']].shape

(1092, 3)

In [43]:
is_location.shape

(2206, 24)

In [50]:
tph_batch1.head()

Unnamed: 0,bid_id,message_timestamp,message_sender,message,category,agent_group
0,18314236,2018-01-31 23:16:50.788616 UTC,Customer,Hello Brian. You helped us a few years back and we are in need of an electrician again! Are you able to install a new electrical hook up for a new spa we have purchased? It requires a dedicated hook up for a 240v spa and will be outdoors. We are in Bay Park area just off I-5. Please let me know... Thanks!,Electrical and Wiring Repair,agent_3
1,27365009,2018-02-11 16:54:15.485935 UTC,Customer,Sounds good! What's next?,Nutritionist,agent_2
2,27365009,2018-02-11 22:20:38.481901 UTC,Pro,"Hi Cayatana, nice to &quotesee&quote you back! Since it's been two years since we last spoke, perhaps you can tell me if you are still looking for the same thing. How would you like to work together...remotely or in-person? What is your schedule like? Once we determine our first session, you would send payment before hand to my PayPal account before we proceed. I will also send you some forms to complete prior to our session, as well. Best, Amy",Nutritionist,agent_2
3,27365009,2018-02-13 20:02:41.064269 UTC,Customer,Hi! I’m currently traveling but will reach out as soon as I get back. Thanks!,Nutritionist,agent_2
4,27365009,2018-02-13 22:28:17.518399 UTC,Pro,"Sounds good and safe travels! Best, Amy",Nutritionist,agent_2


In [53]:
tph_batch1.shape

(61048, 6)

In [54]:
pd.merge(tph_batch1, labels[['bid_id', 'message_timestamp', 'is_hired']], how='inner', on=["bid_id", "message_timestamp"]).shape

(15042, 7)

In [62]:
df = pd.merge(tph_batch1, labels[['bid_id', 'message_timestamp', 'is_hired']], on=["bid_id", "message_timestamp"])
df = pd.merge(df, labels[['bid_id', 'message_timestamp', 'is_location']], on=["bid_id", "message_timestamp"])
df = pd.merge(df, labels[['bid_id', 'message_timestamp', 'is_contact_info']], on=["bid_id", "message_timestamp"])
df = pd.merge(df, labels[['bid_id', 'message_timestamp', 'is_scheduling']], on=["bid_id", "message_timestamp"])
df = pd.merge(df, labels[['bid_id', 'message_timestamp', 'is_price']], on=["bid_id", "message_timestamp"])
df = pd.merge(df, labels[['bid_id', 'message_timestamp', 'is_payment']], on=["bid_id", "message_timestamp"])

In [64]:
df.head()

Unnamed: 0,bid_id,message_timestamp,message_sender,message,category,agent_group,is_hired,is_location,is_contact_info,is_scheduling,is_price,is_payment
0,18314236,2018-01-31 23:16:50.788616 UTC,Customer,Hello Brian. You helped us a few years back and we are in need of an electrician again! Are you able to install a new electrical hook up for a new spa we have purchased? It requires a dedicated hook up for a 240v spa and will be outdoors. We are in Bay Park area just off I-5. Please let me know... Thanks!,Electrical and Wiring Repair,agent_3,False,True,False,False,False,False
1,27365009,2018-02-11 16:54:15.485935 UTC,Customer,Sounds good! What's next?,Nutritionist,agent_2,False,False,False,False,False,False
2,27365009,2018-02-11 22:20:38.481901 UTC,Pro,"Hi Cayatana, nice to &quotesee&quote you back! Since it's been two years since we last spoke, perhaps you can tell me if you are still looking for the same thing. How would you like to work together...remotely or in-person? What is your schedule like? Once we determine our first session, you would send payment before hand to my PayPal account before we proceed. I will also send you some forms to complete prior to our session, as well. Best, Amy",Nutritionist,agent_2,False,False,False,True,False,True
3,27365009,2018-02-13 20:02:41.064269 UTC,Customer,Hi! I’m currently traveling but will reach out as soon as I get back. Thanks!,Nutritionist,agent_2,False,False,False,True,False,False
4,27365009,2018-02-13 22:28:17.518399 UTC,Pro,"Sounds good and safe travels! Best, Amy",Nutritionist,agent_2,False,False,False,False,False,False


### Location NLTK Freqdist

In [65]:
default_stopwords = set(nltk.corpus.stopwords.words('english'))

In [68]:
df.head()

Unnamed: 0,bid_id,message_timestamp,message_sender,message,category,agent_group,is_hired,is_location,is_contact_info,is_scheduling,is_price,is_payment
0,18314236,2018-01-31 23:16:50.788616 UTC,Customer,Hello Brian. You helped us a few years back and we are in need of an electrician again! Are you able to install a new electrical hook up for a new spa we have purchased? It requires a dedicated hook up for a 240v spa and will be outdoors. We are in Bay Park area just off I-5. Please let me know... Thanks!,Electrical and Wiring Repair,agent_3,False,True,False,False,False,False
1,27365009,2018-02-11 16:54:15.485935 UTC,Customer,Sounds good! What's next?,Nutritionist,agent_2,False,False,False,False,False,False
2,27365009,2018-02-11 22:20:38.481901 UTC,Pro,"Hi Cayatana, nice to &quotesee&quote you back! Since it's been two years since we last spoke, perhaps you can tell me if you are still looking for the same thing. How would you like to work together...remotely or in-person? What is your schedule like? Once we determine our first session, you would send payment before hand to my PayPal account before we proceed. I will also send you some forms to complete prior to our session, as well. Best, Amy",Nutritionist,agent_2,False,False,False,True,False,True
3,27365009,2018-02-13 20:02:41.064269 UTC,Customer,Hi! I’m currently traveling but will reach out as soon as I get back. Thanks!,Nutritionist,agent_2,False,False,False,True,False,False
4,27365009,2018-02-13 22:28:17.518399 UTC,Pro,"Sounds good and safe travels! Best, Amy",Nutritionist,agent_2,False,False,False,False,False,False


In [70]:
df[df.is_hired == True].shape

(602, 12)

In [71]:
df[df.is_location == True].shape

(2206, 12)

In [72]:
df[df.is_contact_info == True].shape

(2148, 12)

In [73]:
df[df.is_scheduling == True].shape

(6369, 12)

In [74]:
df[df.is_price == True].shape

(2962, 12)

In [75]:
df[df.is_payment == True].shape

(522, 12)

### Hiring

In [76]:
# https://stackoverflow.com/questions/17531684/n-grams-in-python-four-five-six-grams
# https://stackoverflow.com/questions/14364762/counting-n-gram-frequency-in-python-nltk
# https://www.strehle.de/tim/weblog/archives/2015/09/03/1569
# https://stackoverflow.com/questions/40206249/count-of-most-popular-words-in-a-pandas-dataframe

txt = df[df.is_hired == True].message.str.lower().str.replace(r'\|', ' ').str.cat(sep=' ')
words = nltk.tokenize.word_tokenize(txt)
#Create your bigrams
bigrams = nltk.bigrams(words)
trigrams = nltk.trigrams(words)
# fourgrams = ngrams(words.split(), 4)
fourgrams = ngrams(words,4)
fivegrams = ngrams(words,5)
word_dist = nltk.FreqDist(words)

top_N = 100

default_stopwords = set(nltk.corpus.stopwords.words('english'))
custom_stopwords = set((u'.', u',', u'?', u'!', u'would', u')', u':', u'\'s', u'('))
all_stopwords = default_stopwords | custom_stopwords

words_except_stop_dist = nltk.FreqDist(w for w in words if w not in all_stopwords) 

# print('All frequencies, including STOPWORDS:')
# print('=' * 60)
# rslt = pd.DataFrame(word_dist.most_common(top_N),
#                     columns=['Word', 'Frequency'])
# print(rslt)
# print('=' * 60)

print('Unigram frequencies, excluding STOPWORDS:')
rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N),
                    columns=['Word', 'Frequency']).set_index('Word')

print(rslt)
print('=' * 60)

# Bigram Frequencies
print('Bigram frequencies:')
bigrams_freq = nltk.FreqDist(bigrams)
rslt = pd.DataFrame(bigrams_freq.most_common(top_N),
                    columns=['Word', 'Frequency']).set_index('Word')

print(rslt)
print('=' * 60)

# Trigram Frequencies
print('Trigram frequencies:')
trigrams_freq = nltk.FreqDist(trigrams)
rslt = pd.DataFrame(trigrams_freq.most_common(top_N),
                    columns=['Word', 'Frequency']).set_index('Word')

print(rslt)
print('=' * 60)

# Fourgram Frequencies
print('Fourgram frequencies:')
fourgrams_freq = nltk.FreqDist(fourgrams)
rslt = pd.DataFrame(fourgrams_freq.most_common(top_N),
                    columns=['Word', 'Frequency']).set_index('Word')

print(rslt)
print('=' * 60)

# matplotlib.style.use('ggplot')
# rslt.plot.bar(rot=0)

Unigram frequencies, excluding STOPWORDS:
                  Frequency
Word                       
hi                162      
review            156      
thank             136      
thanks            128      
get               101      
work              90       
thumbtack         90       
please            88       
see               83       
&                 72       
taking            70       
help              67       
appreciate        64       
know              60       
good              58       
great             58       
really            57       
let               57       
new               56       
write             55       
'd                54       
much              53       
time              52       
minutes           52       
need              51       
reviews           51       
advance           51       
send              50       
include           47       
profile           47       
n't               47       
important         46       
'll   

## Other

In [238]:
# https://stackoverflow.com/questions/17531684/n-grams-in-python-four-five-six-grams
# https://stackoverflow.com/questions/14364762/counting-n-gram-frequency-in-python-nltk
# https://www.strehle.de/tim/weblog/archives/2015/09/03/1569
# https://stackoverflow.com/questions/40206249/count-of-most-popular-words-in-a-pandas-dataframe

txt = df[df.is_payment == True].message.str.lower().str.replace(r'\|', ' ').str.cat(sep=' ')
words = nltk.tokenize.word_tokenize(txt)
#Create your bigrams
bigrams = nltk.bigrams(words)
trigrams = nltk.trigrams(words)
# fourgrams = ngrams(words.split(), 4)
fourgrams = ngrams(words,4)
fivegrams = ngrams(words,5)
word_dist = nltk.FreqDist(words)

top_N = 100

default_stopwords = set(nltk.corpus.stopwords.words('english'))
custom_stopwords = set((u'.', u',', u'?', u'!', u'would', u')', u':', u'\'s', u'('))
all_stopwords = default_stopwords | custom_stopwords

words_except_stop_dist = nltk.FreqDist(w for w in words if w not in all_stopwords) 

# print('All frequencies, including STOPWORDS:')
# print('=' * 60)
# rslt = pd.DataFrame(word_dist.most_common(top_N),
#                     columns=['Word', 'Frequency'])
# print(rslt)
# print('=' * 60)

print('Unigram frequencies, excluding STOPWORDS:')
rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N),
                    columns=['Word', 'Frequency']).set_index('Word')

print(rslt)
print('=' * 60)

# Bigram Frequencies
print('Bigram frequencies:')
bigrams_freq = nltk.FreqDist(bigrams)
rslt = pd.DataFrame(bigrams_freq.most_common(top_N),
                    columns=['Word', 'Frequency']).set_index('Word')

print(rslt)
print('=' * 60)

# Trigram Frequencies
print('Trigram frequencies:')
trigrams_freq = nltk.FreqDist(trigrams)
rslt = pd.DataFrame(trigrams_freq.most_common(top_N),
                    columns=['Word', 'Frequency']).set_index('Word')

print(rslt)
print('=' * 60)

# Fourgram Frequencies
print('Fourgram frequencies:')
fourgrams_freq = nltk.FreqDist(fourgrams)
rslt = pd.DataFrame(fourgrams_freq.most_common(top_N),
                    columns=['Word', 'Frequency']).set_index('Word')

print(rslt)
print('=' * 60)

# matplotlib.style.use('ggplot')
# rslt.plot.bar(rot=0)

Unigram frequencies, excluding STOPWORDS:
           Frequency
Word                
$          131      
send       127      
payment    126      
check      123      
deposit    118      
get        102      
pay        100      
cash       99       
need       99       
like       91       
-          89       
know       88       
n't        86       
please     85       
work       84       
thanks     82       
email      82       
&          81       
also       81       
time       77       
thank      76       
let        73       
hi         63       
address    62       
take       61       
'll        59       
day        59       
credit     58       
contract   57       
yes        52       
'm         49       
make       49       
much       47       
invoice    47       
back       46       
one        44       
quote      43       
phone      43       
want       42       
could      42       
card       41       
great      41       
see        40       
date       40

### Customer Messages

- 1092 / 2206 are from customers
- 1114 / 2206 are from pros

## Regex Testing

In [83]:
df.columns

Index(['bid_id', 'message_timestamp', 'message_sender', 'message', 'category',
       'agent_group', 'is_hired', 'is_location', 'is_contact_info',
       'is_scheduling', 'is_price', 'is_payment'],
      dtype='object')

In [85]:
df[df.is_hired == True][['bid_id', 'message_sender', 'message', 'is_hired']]

Unnamed: 0,bid_id,message_sender,message,is_hired
25,37749231,Pro,Hi im here but no parking may we reschedule for sat. Morning at 9,True
133,48887274,Pro,"Hello Liz, I cleaned for your son Bill today. Can you mark me as hired, and please leave me a review I would really appreciate it. \nThank You, Amy James",True
134,48887274,Customer,Thank you Amy. Bill was very pleased with the work that you did. I appreciate your patience in his getting back to you. I hope he continues to have you clean on a regular basis.,True
135,48887274,Pro,"Thank You Liz, I'm glad he was happy with my cleaning.",True
165,53924583,Customer,"Thank you so much! I will get the eyelash extensions and will pay the travel fee since no one will be able to take me during the day And my event starts at 4. Thank you so much for the help! I think I already booked you for the 19 on your website earlier today. I can just pay the additional charges in person. So in total, how much would I owe you with the traveling fee and eyelashes?",True
212,54853421,Pro,"Good morning Sheila,\n\nI was recently notified by Thumbtack that you hired me for you're surprise 50th Birthday party.\n\nFirst off, Thank you so much for hiring me I assure you everyone will have a terrific time especially the Birthday boy!\n\nFeel free to contact me anytime that is convenient to go over the details of the party.\n\nI plan on reaching out to you later on Today to connect, answer any questions and get venue details.\n\nI look forward to speaking with you soon Sheila.\n\nBest regards,\n\nDJ Rob\nNext Event Services",True
213,54853421,Pro,"Good afternoon Sheila,\r\n\r\nThank you very much for hiring me for your upcoming Surprise Birthday party June16th.\r\n\r\nI just called you and left a message with my contact info.\r\n\r\nAs this is a surprise Birthday party, feel free to contact me when it is most convenient anytime this weekend forward.\r\n\r\nI look forward to speaking with you Sheila.\r\n\r\nThank you,\r\n\r\nDJ Rob\r\nNext Event Services\r\nrobditalia@gmail.com\r\n(508) 951-4603",True
214,54853421,Pro,"Good afternoon Sheila,\n\nThank you very much for hiring me for your upcoming Surprise Birthday party June16th.\n\nI just called you and left a message with my contact info.\n\nWhen you get a chance email me or call me to discuss the details of the party, answer any questions and to solidify the date.\n\nMy contact info is listed below and I look forward to speaking with you Sheila.\n\nThank you,\n\nDJ Rob\nNext Event Services\nrobditalia@gmail.com\n(508) 951-4603",True
255,55577457,Customer,Do I need to click the hire button?,True
321,56897154,Pro,Ok I'm at Freddy's eating. Be there as soon as I finish,True


### Total 1599 Conversations

- doing preidctions on bid level
- can also do predictions on the chat / pro level. and then aggregate it afterwards. 

In [86]:
df.columns

Index(['bid_id', 'message_timestamp', 'message_sender', 'message', 'category',
       'agent_group', 'is_hired', 'is_location', 'is_contact_info',
       'is_scheduling', 'is_price', 'is_payment'],
      dtype='object')

In [89]:
df[df.is_location == True][['bid_id', 'message_sender', 'message', 'is_location']]

Unnamed: 0,bid_id,message_sender,message,is_location
0,18314236,Customer,Hello Brian. You helped us a few years back and we are in need of an electrician again! Are you able to install a new electrical hook up for a new spa we have purchased? It requires a dedicated hook up for a 240v spa and will be outdoors. We are in Bay Park area just off I-5. Please let me know... Thanks!,True
8,30265067,Pro,April. Can likely do est this wk & Work maybe the next. let me know your phone # & address. Tx. Wade,True
9,30265067,Customer,4 Cameroons Pl. Durham 919-358-2996,True
11,37749231,Customer,Hey can you do the cleaning today at 937 court street reading pennsylvania,True
29,37749231,Customer,Just double park at the address,True
31,37749231,Customer,Come to 937 court street,True
36,40733372,Customer,"Hi I am still looking for a piano teacher for my son. Can you please tell me your price and for how many minutes? Also, the location? Thanks!",True
40,40733372,Customer,"What is the price per class and how many minutes? Also, where is the class at?",True
41,42126696,Customer,"Hi Alvaro, can you meet me on Sunday 10:15 AM in Westport to see the job?",True
44,42126696,Pro,After 4 pm just send me your address and phone number.,True


In [90]:
bid_level_messages = df.groupby('bid_id')['message'].apply(lambda x: " ".join(x)).reset_index()

### Model

In [96]:
df.head(2)

Unnamed: 0,bid_id,message_timestamp,message_sender,message,category,agent_group,is_hired,is_location,is_contact_info,is_scheduling,is_price,is_payment
0,18314236,2018-01-31 23:16:50.788616 UTC,Customer,Hello Brian. You helped us a few years back and we are in need of an electrician again! Are you able to install a new electrical hook up for a new spa we have purchased? It requires a dedicated hook up for a 240v spa and will be outdoors. We are in Bay Park area just off I-5. Please let me know... Thanks!,Electrical and Wiring Repair,agent_3,False,True,False,False,False,False
1,27365009,2018-02-11 16:54:15.485935 UTC,Customer,Sounds good! What's next?,Nutritionist,agent_2,False,False,False,False,False,False


In [133]:
hired = df.groupby('bid_id')['is_hired'].apply(lambda x: x.any()).reset_index()
location = df.groupby('bid_id')['is_location'].apply(lambda x: x.any()).reset_index()
contact_info = df.groupby('bid_id')['is_contact_info'].apply(lambda x: x.any()).reset_index()
scheduling = df.groupby('bid_id')['is_scheduling'].apply(lambda x: x.any()).reset_index()
price = df.groupby('bid_id')['is_price'].apply(lambda x: x.any()).reset_index()
payment = df.groupby('bid_id')['is_payment'].apply(lambda x: x.any()).reset_index()

In [135]:
bid_level_messages = messages.groupby('bid_id')['message'].apply(lambda x: " ".join(x)).reset_index()
bid_level_messages = pd.merge(bid_level_messages, hired, on="bid_id")
bid_level_messages = pd.merge(bid_level_messages, location, on="bid_id")
bid_level_messages = pd.merge(bid_level_messages, contact_info, on="bid_id")
bid_level_messages = pd.merge(bid_level_messages, scheduling, on="bid_id")
bid_level_messages = pd.merge(bid_level_messages, price, on="bid_id")
bid_level_messages = pd.merge(bid_level_messages, payment, on="bid_id")

In [137]:
bid_level_messages.head()

Unnamed: 0,bid_id,message,is_hired,is_location,is_contact_info,is_scheduling,is_price,is_payment
0,18314236,Hello Brian. You helped us a few years back and we are in need of an electrician again! Are you able to install a new electrical hook up for a new spa we have purchased? It requires a dedicated hook up for a 240v spa and will be outdoors. We are in Bay Park area just off I-5. Please let me know... Thanks!,False,True,False,False,False,False
1,27365009,"Sounds good! What's next? Hi Cayatana, nice to &quotesee&quote you back! Since it's been two years since we last spoke, perhaps you can tell me if you are still looking for the same thing. How would you like to work together...remotely or in-person? What is your schedule like? Once we determine our first session, you would send payment before hand to my PayPal account before we proceed. I will also send you some forms to complete prior to our session, as well. Best, Amy Hi! I’m currently traveling but will reach out as soon as I get back. Thanks! Sounds good and safe travels! Best, Amy",False,False,False,True,False,True
2,29404730,Greetings..\nAre you available for resume writing?,False,False,False,True,False,False
3,30265067,What's your next availability? I would like to get an estimate for siding repair April. Can likely do est this wk & Work maybe the next. let me know your phone # & address. Tx. Wade 4 Cameroons Pl. Durham 919-358-2996,False,True,True,True,True,False
4,31521550,Can you get the door too if I give you photos and measurement,False,False,False,False,False,False


In [186]:
terms = ["review", "reviews", "leave us a review", "the review", "rating",
         "hire", "hired", "hiring", "as hired",
         "booking request", "book", 
         "appointment", "contract",
         "reschedule", "schedule", "scheduled",
         "confirming", "confirmed", "confirm", "let's do it",
         "see you", "see u", "see you at", "see you on",
         "on my way", "on our way", "on the way",
         "be there", "is there", "almost there", "get there", 
         "arrive by", "arrive", "eta", 
         "pay", "credit card", "cash", "check", "debit", "invoice", "payment",
         "charges", "charging", "charged",
         "phone number", "contact me", "call me", "number is", "a call", "my cell",
         "my number", "text me", "my phone", "your phone", "can call", "please call",
         "email address", "@gmail.com", "your email", "my email",
         "my address", "are you located", "me the address", 
         "send me your address", "what's your address", "i can meet you",
         "home address", "physical address", "the address", "your address",
         "finished", "finishing", "your work", "my work",
         "available", "full address", "address is"
        ]

### Hired Precision and Recall

In [199]:
def display_info_row_hire(term):
    predicted_hires = bid_level_messages['message'].str.contains(term, case=False)
    predicted_hires = predicted_hires[predicted_hires == True]
    ground_truth = bid_level_messages.loc[predicted_hires.index][['is_hired']] # out of the predicted positives, what's the ground truth
    tp = ground_truth[ground_truth['is_hired'] == True] # ones that are actually hired
    
    precision = tp.shape[0] / predicted_hires.shape[0]
    recall = tp.shape[0] / bid_level_messages[bid_level_messages.is_hired == True].shape[0]
    num_terms = tp.shape[0]
    return [term, 
            "{0:.0f}%".format(precision*100), 
            "{0:.0f}%".format(recall*100), 
            num_terms]

### Location Precision and Recall

In [175]:
def display_info_row_location(term):
    predicted_hires = bid_level_messages['message'].str.contains(term, case=False)
    predicted_hires = predicted_hires[predicted_hires == True]
    ground_truth = bid_level_messages.loc[predicted_hires.index][['is_location']] # out of the predicted positives, what's the ground truth
    tp = ground_truth[ground_truth['is_location'] == True] # ones that are actually hired
    
    precision = tp.shape[0] / predicted_hires.shape[0]
    recall = tp.shape[0] / bid_level_messages[bid_level_messages.is_location == True].shape[0]
    num_terms = tp.shape[0]
    return [term, 
            "{0:.0f}%".format(precision*100), 
            "{0:.0f}%".format(recall*100), 
            num_terms]

### Contact Information Precision and Recall

In [138]:
def display_info_row_contact_info(term):
    predicted_hires = bid_level_messages['message'].str.contains(term, case=False)
    predicted_hires = predicted_hires[predicted_hires == True]
    ground_truth = bid_level_messages.loc[predicted_hires.index][['is_contact_info']] # out of the predicted positives, what's the ground truth
    tp = ground_truth[ground_truth['is_contact_info'] == True] # ones that are actually hired
    
    precision = tp.shape[0] / predicted_hires.shape[0]
    recall = tp.shape[0] / bid_level_messages[bid_level_messages.is_contact_info == True].shape[0]
    num_terms = tp.shape[0]
    return [term, 
            "{0:.0f}%".format(precision*100), 
            "{0:.0f}%".format(recall*100), 
            num_terms]

### Scheduling

In [193]:
def display_info_row_scheduling(term):
    predicted_hires = bid_level_messages['message'].str.contains(term, case=False)
    predicted_hires = predicted_hires[predicted_hires == True]
    ground_truth = bid_level_messages.loc[predicted_hires.index][['is_scheduling']] # out of the predicted positives, what's the ground truth
    tp = ground_truth[ground_truth['is_scheduling'] == True] # ones that are actually hired
    
    precision = tp.shape[0] / predicted_hires.shape[0]
    recall = tp.shape[0] / bid_level_messages[bid_level_messages.is_scheduling == True].shape[0]
    num_terms = tp.shape[0]
    return [term, 
            "{0:.0f}%".format(precision*100), 
            "{0:.0f}%".format(recall*100), 
            num_terms]

In [203]:
key_terms = pd.DataFrame(columns=('Term', 'Precision', 'Recall', 'Count'))
for i, term in enumerate(terms):
    key_terms.loc[i] = display_info_row_scheduling(term)

In [204]:
key_terms

Unnamed: 0,Term,Precision,Recall,Count
0,review,92%,15%,210
1,reviews,94%,9%,121
2,leave us a review,100%,0%,2
3,the review,100%,0%,4
4,rating,87%,1%,20
5,hire,89%,11%,152
6,hired,89%,5%,73
7,hiring,84%,2%,27
8,as hired,95%,1%,20
9,booking request,100%,1%,10


In [205]:
key_terms.sort_values(by=['Precision'], ascending=False)

Unnamed: 0,Term,Precision,Recall,Count
54,can call,99%,7%,100
11,appointment,99%,11%,155
76,address is,98%,12%,166
60,my address,98%,5%,65
39,invoice,98%,3%,47
48,a call,98%,17%,237
52,my phone,98%,7%,90
68,the address,98%,11%,156
44,phone number,97%,18%,239
70,finished,97%,3%,38


In [None]:
# the number of people that are hired...
bid_level_messages[bid_level_messages.hired == True].shape[0]

Improvements:
- combination of words, more complex regex rules
    - multiple words to be present
    - eliminate certain words
    - use other metadata for prediction
- analyze the false positives -- predicted as hires but are actually not
    - sometimes people cancel
    - sometimes they use the word but not in context of hiring
    - sometimes I think it should be marked as hired because they seemed to meet
    - can do more research

### Analyzing Specific Words

- using terms from # https://docs.google.com/spreadsheets/d/1736DS7PE-zBbmLBEr00Z3LVBVCJhi-mB6QJADa--xdA/edit#gid=0

### Hired

In [142]:
# term = r'^(?=.*be there)(?=.*review)(?=.*[^reviewed])'
# term = r'^(?=.*hired)(?=.*review )((?!reviewed))*$'
term = r'^(?=.*be there)(?=.*review)(?!.*reviewed)'
predicted_hires = bid_level_messages['message'].str.contains(term, case=False)
predicted_hires = predicted_hires[predicted_hires == True]
ground_truth = bid_level_messages.loc[predicted_hires.index][['is_hired']] # out of the predicted positives, what's the ground truth
tp = ground_truth[ground_truth['is_hired'] == True] # ones that are actually hired

precision = tp.shape[0] / predicted_hires.shape[0]
recall = tp.shape[0] / bid_level_messages[bid_level_messages.is_hired == True].shape[0]
num_terms = tp.shape[0]
print("Term:", term, 
      "\nPrecision:", "{0:.0f}%".format(precision*100),
      "\nRecall:", "{0:.0f}%".format(recall*100),
      "\nCount:", num_terms)

Term: ^(?=.*be there)(?=.*review)(?!.*reviewed) 
Precision: 100% 
Recall: 2% 
Count: 7


### Location

In [170]:
# term = r'^(?=.*be there)(?=.*review)(?=.*[^reviewed])'
# term = r'^(?=.*hired)(?=.*review )((?!reviewed))*$'
# need to debug and figure out why the count is different...
# spend some time to make sure that the regex is correct and doesn't underrepresent anything

# term = r'^(?=.*home address)' # r'^.*home address.*$'
term = "on my way"
predicted_hires = bid_level_messages['message'].str.contains(term, case=False, regex=True)
predicted_hires = predicted_hires[predicted_hires == True]
ground_truth = bid_level_messages.loc[predicted_hires.index][['is_location']] # out of the predicted positives, what's the ground truth
tp = ground_truth[ground_truth['is_location'] == True] # ones that are actually hired

precision = tp.shape[0] / predicted_hires.shape[0]
recall = tp.shape[0] / bid_level_messages[bid_level_messages.is_location == True].shape[0]
num_terms = tp.shape[0]
print("Term:", term, 
      "\nPrecision:", "{0:.0f}%".format(precision*100),
      "\nRecall:", "{0:.0f}%".format(recall*100),
      "\nCount:", num_terms)

Term: on my way 
Precision: 94% 
Recall: 3% 
Count: 32


In [171]:
ground_truth

Unnamed: 0,is_location
5,True
14,True
101,True
125,True
216,True
264,True
294,True
313,True
487,True
513,True


In [173]:
bid_level_messages.loc[1415]

bid_id             62671285                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
message            Will call in be am to set up a time. Ok Hi Marc just wanted to check up on you to see when you will be calling to set a time Call within the hour. Driving and no reception here. Ok can't wait to hear fr

In [None]:
df[df.bid_id == 63182593]

### Metadata of hiring messages:
- message sender, pro vs customer distribution
- category
- quick reply vs not quick reply (are there quick replies that we can just automatically classify as hire?)