In [2]:
import dill
import pandas as pd
from nltk.tokenize import word_tokenize

# Load test dataset

In [35]:
# Sender id whose messages are excluded when selecting incoming messages
# (presumably the organisation's own page account -- TODO confirm).
MIGRASIA_ID_SAFE = 425_040_987_834_981

In [36]:
# Read the exported message log and keep only rows that have both a sender id
# and a message body.
fb = pd.read_csv('downloader/msgs.csv').dropna(subset=['from_id', 'message'])

In [37]:
# Every row not sent by MIGRASIA_ID_SAFE is treated as an incoming message.
# NOTE(review): the preview below shows sender names and message text; clear
# the cell output before sharing this notebook.
is_incoming = fb.from_id != MIGRASIA_ID_SAFE
incoming_msgs = fb[is_incoming]
incoming_msgs.head(3)

Unnamed: 0,from_id,from,time,message,attachments,shares,url
1,4414892000000000.0,Sheila Drilon Prudente,2021-11-03 15:12:35,Hello madam,,,
3,4414892000000000.0,Sheila Drilon Prudente,2021-11-03 15:12:52,San office nyo madam,,,
5,6110776000000000.0,Rayatrisha Joy Bautista,2021-11-03 15:25:48,Hello sir/maam goodevening can i asked a guest...,,,


# Pre-classify test dataset

In [6]:
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance; stemMsg calls it once per kept token.
wordnet_lemmatizer = WordNetLemmatizer()

def wordfilter(word):
    """Return True when a token should be kept for stemming.

    A token is kept only if it has at least three characters and consists
    solely of alphabetic characters (the length check already rules out the
    empty string).
    """
    if len(word) < 3:
        return False
    return word.isalpha()

def stemMsg(msg):
    """Tokenize a message and return its lemmatized, lower-cased words.

    Tokens that fail ``wordfilter`` are dropped; the remaining ones are
    lower-cased and passed through the shared ``wordnet_lemmatizer``.

    Args:
        msg: Message text to process.

    Returns:
        list[str]: The lemmatized tokens, in their original order.
    """
    # Lower-case *before* lemmatizing: the WordNet lookup is case-sensitive,
    # so a capitalised inflection (e.g. "Agencies") would otherwise come back
    # unchanged apart from its case.
    return [
        wordnet_lemmatizer.lemmatize(word.lower())
        for word in word_tokenize(msg)
        if wordfilter(word)
    ]

def stemMsgText(msg):
    """Return the tokens produced by ``stemMsg`` joined into one space-separated string."""
    tokens = stemMsg(msg)
    return " ".join(tokens)

In [7]:
# Keyword lists for the naive pre-classification: classifyMsg flags a message
# when any of these appears in it as a case-insensitive substring.
AGENCY_WORD_LIST = ['agency', 'agent']
# ' amo ' is padded with spaces so that the substring test only matches it as a
# standalone word (and not inside e.g. "amount"); as a consequence it is missed
# at the very start/end of a message or next to punctuation.
EMPLOYER_WORD_LIST = ['employer', ' amo ']
# NOTE(review): 'borrow' used to be listed twice; the duplicate may have been
# meant to be a different keyword (e.g. 'loan') -- confirm before adding one,
# since that would change the comparison figures below.
LENDER_WORD_LIST = ['lend', 'finance', 'borrow', 'debt', 'utang']

def classifyMsg(msg: str, classification_words) -> bool:
    """Return True if any classification word occurs in the message.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside longer words (e.g. 'lend' matches "lender").

    Args:
        msg: Message text to inspect.
        classification_words: Iterable of keyword strings to look for.

    Returns:
        True when at least one keyword is contained in ``msg``; False
        otherwise (including when ``classification_words`` is empty).

    Raises:
        AssertionError: If ``msg`` is not a string.
    """
    assert isinstance(msg, str), f'message "{msg}"({type(msg)}) is not a string'

    # Lower-case the message once instead of once per keyword.
    lowered_msg = msg.lower()
    return any(word.lower() in lowered_msg for word in classification_words)

# Load classifiers

In [9]:
def _load_classifier(path):
    """Deserialize one classifier from a dill file, closing the file afterwards.

    Args:
        path: Path of the ``.dill`` file to read.

    Returns:
        The object stored in the file.
    """
    # NOTE(review): dill.load can execute arbitrary code while unpickling --
    # only load classifier files that come from a trusted source.
    with open(path, 'rb') as classifier_file:
        return dill.load(classifier_file)


# Each classifier is called with a message string; the calls in this notebook
# read the first element of the returned tuple as the boolean verdict.
is_about_agency = _load_classifier('agency.dill')
is_about_lender = _load_classifier('lender.dill')
is_about_employer = _load_classifier('employer.dill')

In [10]:
# Smoke-test the employer classifier on a single hand-written message.
sample_employer_msg = 'Then my employer pay all my debt in Pacific ace'
is_about_employer(sample_employer_msg)

(True, 1)

# Evaluate classifiers

## Classify

In [11]:
# Run the three loaded classifiers over every incoming message and store the
# boolean verdict (first element of each returned tuple) in its own column.
# DataFrame.assign returns a new frame, so incoming_msgs itself is untouched.
classified = incoming_msgs.assign(
    isAgency=incoming_msgs.message.apply(lambda msg: is_about_agency(msg)[0]),
    isLender=incoming_msgs.message.apply(lambda msg: is_about_lender(msg)[0]),
    isEmployer=incoming_msgs.message.apply(lambda msg: is_about_employer(msg)[0]),
)

In [12]:
# Keyword baseline: one boolean column per topic, True when classifyMsg finds
# any keyword from the topic's list in the message.
keyword_columns = {
    'preAgency': AGENCY_WORD_LIST,
    'preLender': LENDER_WORD_LIST,
    'preEmployer': EMPLOYER_WORD_LIST,
}
classified = classified.assign(**{
    column: incoming_msgs.message.apply(lambda msg, words=words: classifyMsg(msg, words))
    for column, words in keyword_columns.items()
})

In [13]:
# Messages on which the lender classifier and the keyword baseline disagree.
lender_disagrees = classified.isLender.ne(classified.preLender)
no_match_lender = classified.loc[lender_disagrees]

In [14]:
# Show only the text and the two conflicting lender verdicts.
no_match_lender.loc[:, ['message', 'isLender', 'preLender']]

Unnamed: 0,message,isLender,preLender
1421,hello po..ask lng po kng paano po mag reklamo ...,True,False
1991,payment ko kabayan loan n 1k para s penalty ko...,True,False
2659,Good pm po. Nag loan po ako sa familyhan credi...,True,False
6142,im not sure how you can help me with my compla...,True,False
8726,"Hello Sir/ Ma'am, good evening! i have a probl...",True,False
...,...,...,...
150435,The rich deposit 700 for the Interest and pen...,True,False
151955,I throw away already but many times loan cash...,True,False
152675,Last year i was terminated becausw of Rich Cre...,True,False
167649,Tinatakot po ako na dting ku pinas e estafa da...,True,False


## Compare to pre-classification

### Lender

In [31]:
# Percentage of incoming messages that the classifier tags as lender-related.
lender_share_pct = len(classified[classified.isLender]) / len(classified) * 100
print(f'{lender_share_pct}% of incoming messages are about lenders')

5.783287715343745% of incoming messages are about lenders


In [15]:
# Messages the classifier tags as lender-related that the keyword list missed.
lender_only_mask = classified.isLender & ~classified.preLender
lender_notPreclassified = len(classified[lender_only_mask])

In [16]:
# Messages flagged as lender-related by either method (union of both).
lender_any_mask = classified.preLender | classified.isLender
total_lenders = len(classified[lender_any_mask])

In [17]:
# Overclassified: fraction of all lender-flagged messages (classifier or
# keyword list) that only the classifier flagged.
lender_notPreclassified/total_lenders

0.018095768374164812

### Agency

In [32]:
# Percentage of incoming messages that the classifier tags as agency-related.
agency_share_pct = len(classified[classified.isAgency]) / len(classified) * 100
print(f'{agency_share_pct}% of incoming messages are about agencies')

3.440669779423603% of incoming messages are about agencies


In [18]:
# Spot-check the agency classifier on a message that uses the plural
# "agencies", which contains neither 'agency' nor 'agent' as a substring.
sample_agency_msg = 'can I ask for list of legal agencies for domestic helper in hongkong'
is_about_agency(sample_agency_msg)

(True, 0.9964238446377032)

In [19]:
# Messages on which the agency classifier and the keyword baseline disagree.
agency_disagrees = classified.isAgency.ne(classified.preAgency)
no_match_agency = classified.loc[agency_disagrees]

In [20]:
# Full text of the second disagreeing message (the table view truncates it).
no_match_agency.message.iloc[1]

'can I ask for list of legal agencies for domestic helper in hongkong'

In [21]:
# Flagged by the agency classifier but not by the keyword list.
agency_only_mask = classified.isAgency & ~classified.preAgency
len(classified.loc[agency_only_mask])

120

In [22]:
# Flagged by the keyword list but not by the agency classifier.
agency_keyword_only_mask = classified.preAgency & ~classified.isAgency
len(classified.loc[agency_keyword_only_mask])

0

In [23]:
# Flagged as agency-related by either the keyword list or the classifier.
agency_any_mask = classified.preAgency | classified.isAgency
len(classified.loc[agency_any_mask])

2137

### Employer

In [34]:
# Percentage of incoming messages that the classifier tags as employer-related.
employer_share_pct = len(classified[classified.isEmployer]) / len(classified) * 100
print(f'{employer_share_pct}% of incoming messages are about employers')

5.6657543068748994% of incoming messages are about employers


In [24]:
# Messages on which the employer classifier and the keyword baseline disagree.
employer_disagrees = classified.isEmployer.ne(classified.preEmployer)
no_match_employer = classified.loc[employer_disagrees]

In [25]:
# First ten disagreements, reduced to the text and the two employer verdicts.
no_match_employer.head(10)[['message', 'isEmployer', 'preEmployer']]

Unnamed: 0,message,isEmployer,preEmployer
1596,gandang hapon po tanong lng kong pwede ba yon ...,True,False
1730,Sunduin ko lang na terminate ko pinsan ihatid ...,True,False
2690,"Ma'am, I can leave the house at 4 pm because o...",True,False
3575,Tas sino po magrerecieve? Amo. Ko po ba? Or ak...,True,False
3703,Send q po yung pdos q later ma'am pag d na po ...,True,False
3726,"Ok ma'am,tnx.....sagutan q po mamaya lht pag m...",True,False
5003,Agency po \nBright po s Pinas \nAsia Top nmn p...,True,False
5339,gdpm po!dko na po nasave mga messages or evide...,True,False
5353,I also lend money to UA but my Suddenly my con...,True,False
5605,I have balance in.Prime credit of 4months unti...,True,False


In [26]:
# Flagged by the employer classifier but not by the keyword list.
employer_only_mask = classified.isEmployer & ~classified.preEmployer
len(classified.loc[employer_only_mask])

260

In [27]:
# Flagged by the keyword list but not by the employer classifier.
employer_keyword_only_mask = classified.preEmployer & ~classified.isEmployer
len(classified.loc[employer_keyword_only_mask])

0

In [28]:
# Flagged as employer-related by either the keyword list or the classifier.
employer_any_mask = classified.preEmployer | classified.isEmployer
len(classified.loc[employer_any_mask])

3519