In [1]:
import string, operator, math, random
from collections import Counter

import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer
nltk.download('punkt')
nltk.download('wordnet')

import nltk.classify
from sklearn.svm import SVC

[nltk_data] Downloading package punkt to /Users/paul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/paul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
MIGRASIA_ID = 837358736435667

## Load and clean data

In [3]:
fb = pd.read_csv('FB_30MB.csv')
fb.dropna(subset=['from_id', 'message'], inplace=True)

In [4]:
fb.head(3)

Unnamed: 0,from_id,from,time,message,attachments,shares,url
1,5083596000000000.0,Maxinne Kate Barretto,2021-03-15 2:41:48,Sure po ba ung refund?,,,
2,837358700000000.0,Report Bad Agencies,2021-03-15 2:41:48,"Goodday!\n\nFor Help with an Agency, Lending, ...",,,
3,837358700000000.0,Report Bad Agencies,2021-03-15 3:24:00,"Hello po maam, what year and country po kayo?",,,


In [5]:
outgoing_msgs = fb[fb.from_id == MIGRASIA_ID]

In [6]:
incoming_msgs = fb[fb.from_id != MIGRASIA_ID]

In [7]:
incoming_msgs.head(3)

Unnamed: 0,from_id,from,time,message,attachments,shares,url
1,5083596000000000.0,Maxinne Kate Barretto,2021-03-15 2:41:48,Sure po ba ung refund?,,,
4,5083596000000000.0,Maxinne Kate Barretto,2021-03-15 3:30:35,Kaka 1year lang po nung january,,,
5,5083596000000000.0,Maxinne Kate Barretto,2021-03-15 3:30:37,Taiwan po,,,


### Normalise messages

In [8]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def wordfilter(word):
    # Only consider words that pass the following conditions
    return (word != '') and (len(word) >= 3) and word.isalpha()

def stemMsg(msg):
    token_words=word_tokenize(msg)
    token_words
    stemmed_msg=[]
    for word in token_words:
        if not wordfilter(word): continue
        
        #stemmed_msg.append(lancaster.stem(word))
        stemmed_msg.append( wordnet_lemmatizer.lemmatize(word).lower() )
    
    return stemmed_msg

def stemMsgText(msg):
    return " ".join(stemMsg(msg))

In [9]:
incoming_msgs = incoming_msgs.assign(lemmatised_message = incoming_msgs.message.apply(stemMsgText))

In [10]:
incoming_msgs.to_csv('incoming_msgs.csv', index=False)
incoming_msgs.head(3)

Unnamed: 0,from_id,from,time,message,attachments,shares,url,lemmatised_message
1,5083596000000000.0,Maxinne Kate Barretto,2021-03-15 2:41:48,Sure po ba ung refund?,,,,sure ung refund
4,5083596000000000.0,Maxinne Kate Barretto,2021-03-15 3:30:35,Kaka 1year lang po nung january,,,,kaka lang nung january
5,5083596000000000.0,Maxinne Kate Barretto,2021-03-15 3:30:37,Taiwan po,,,,taiwan


## Pre-classify
Label messages with tags `isAgency`, `isLender` and `isEmployer`

In [11]:
AGENCY_WORD_LIST = ['agency', 'agent']
EMPLOYER_WORD_LIST = ['employer', ' amo ']
LENDER_WORD_LIST = ['lend', 'finance', 'borrow', 'debt', 'utang', 'borrow']

def classifyMsg(msg: str, classification_words) -> bool:
    assert type(msg) is str, f'message "{msg}"({type(msg)}) is not a string'
    if type(msg) is not str:
        print(msg)
        msg = str(msg)
    
    return any([(word.lower() in msg.lower()) for word in classification_words])

In [12]:
preClassified = incoming_msgs.copy()
preClassified = preClassified.assign( isAgency = incoming_msgs.message.apply(lambda msg: classifyMsg(msg, AGENCY_WORD_LIST)) )
preClassified = preClassified.assign( isLender = incoming_msgs.message.apply(lambda msg: classifyMsg(msg, LENDER_WORD_LIST)) )
preClassified = preClassified.assign( isEmployer = incoming_msgs.message.apply(lambda msg: classifyMsg(msg, EMPLOYER_WORD_LIST)) )

preClassified.head(3)

Unnamed: 0,from_id,from,time,message,attachments,shares,url,lemmatised_message,isAgency,isLender,isEmployer
1,5083596000000000.0,Maxinne Kate Barretto,2021-03-15 2:41:48,Sure po ba ung refund?,,,,sure ung refund,False,False,False
4,5083596000000000.0,Maxinne Kate Barretto,2021-03-15 3:30:35,Kaka 1year lang po nung january,,,,kaka lang nung january,False,False,False
5,5083596000000000.0,Maxinne Kate Barretto,2021-03-15 3:30:37,Taiwan po,,,,taiwan,False,False,False


In [13]:
incoming_agencyMsgs = preClassified[ preClassified.isAgency ]
incoming_lenderMsgs = preClassified[ preClassified.isLender ]
incoming_employerMsgs = preClassified[ preClassified.isEmployer ]

In [14]:
print(f"Messages about Agencies: {len(incoming_agencyMsgs)/len(incoming_msgs) *100}%")
print(f"Messages about Lenders: {len(incoming_lenderMsgs)/len(incoming_msgs) *100}%")
print(f"Messages about Employers: {len(incoming_employerMsgs)/len(incoming_msgs) *100}%")

Messages about Agencies: 16.93389147776637%
Messages about Lenders: 5.098333464971171%
Messages about Employers: 3.0131901113656108%


In [15]:
word_list = ['angelex']
df = incoming_msgs[ incoming_msgs.message.apply(lambda msg: classifyMsg(msg, word_list)) ]
print(f"Messages: {len(df)/len(incoming_msgs) *100}%")

Messages: 0.6279124871653108%


## Check frequency of words

### Calculate frequencies

In [16]:
def add_word(freqs_dict, word):
    # Make lowercase
    word = word.lower()
    # Remove punctuation
    word = word.translate(str.maketrans('', '', string.punctuation))
    
    if word in freqs_dict: freqs_dict[word] += 1
    else:                  freqs_dict[word] = 1
    

def count_msgWords(df):
    word_freq = dict()
    total_words = 0

    for msg in df.lemmatised_message.values:
        for word in msg.split():            
            total_words+=1
            add_word(word_freq, word)

    return word_freq, total_words

In [17]:
# Incoming messages
word_freq, total_words = count_msgWords(incoming_msgs)
allMsgs_wordFreqs = {word: count/total_words *100 for (word,count) in word_freq.items()}

In [18]:
# Incoming messages about agencies
word_freq, total_words = count_msgWords(incoming_agencyMsgs)
agencyMsgs_wordFreqs = {word: count/total_words *100 for (word,count) in word_freq.items()}

In [19]:
# Incoming messages about lenders
word_freq, total_words = count_msgWords(incoming_employerMsgs)
employerMsgs_wordFreqs = {word: count/total_words *100 for (word,count) in word_freq.items()}

In [20]:
# Incoming messages about employers
word_freq, total_words = count_msgWords(incoming_lenderMsgs)
lenderMsgs_wordFreqs = {word: count/total_words *100 for (word,count) in word_freq.items()}

### Check frequencies

In [21]:
# Words that appear more often in allMsgs than in agencyMsgs
( Counter(dict(allMsgs_wordFreqs)) - Counter(dict(agencyMsgs_wordFreqs)) ).most_common()

[('get', 0.9664302332522962),
 ('started', 0.9571654599521743),
 ('yes', 0.34928348424892386),
 ('you', 0.3062131172068311),
 ('maam', 0.2909119315928117),
 ('mam', 0.2810939252964212),
 ('thank', 0.27018620088153134),
 ('sir', 0.25464755267306727),
 ('send', 0.16525987699270198),
 ('will', 0.1532114510783381),
 ('have', 0.15283813943320268),
 ('lending', 0.14074892014956497),
 ('how', 0.139871172935023),
 ('salamat', 0.12393516758618736),
 ('they', 0.11109058119582932),
 ('thanks', 0.10967634885905647),
 ('yan', 0.10598071352333582),
 ('opo', 0.10093713091060354),
 ('already', 0.09577190794891093),
 ('loan', 0.09398873587522742),
 ('can', 0.09033280093172441),
 ('hello', 0.08523495600684905),
 ('what', 0.0817272460798499),
 ('meron', 0.07767176804186302),
 ('pdos', 0.07029943119857192),
 ('letter', 0.06841136664996583),
 ('not', 0.06695489474866803),
 ('okay', 0.06460209502287317),
 ('interest', 0.0644618740639577),
 ('pwede', 0.0612907011028499),
 ('help', 0.057482522452813434),
 ('c

In [22]:
# Words that appear more often in AGENCY Msgs than in allMsgs
#! -> Can be used to build a classifier for agency msgs
agency_freqs = (
    Counter(dict(agencyMsgs_wordFreqs))
     - Counter(dict(allMsgs_wordFreqs))
     #- Counter(dict(lenderMsgs_wordFreqs))
     #- Counter(dict(employerMsgs_wordFreqs))
)

agency_freqs

Counter({'good': 0.024982603100858092,
         'want': 0.0035885569066856915,
         'ask': 0.035124030638465586,
         'for': 0.017637943902166553,
         'the': 0.16957901575201095,
         'agency': 4.3725330743924244,
         'that': 0.00343303426898367,
         'manage': 0.0012576167220145205,
         'here': 0.054972014135204084,
         'grandplacement': 0.0008733753063171999,
         'and': 0.12418535936464004,
         'charge': 0.027911588341101287,
         'fee': 0.14776430668109208,
         'like': 0.010337003785722373,
         'placement': 0.1668226875641735,
         'processing': 0.02510935511690421,
         'kung': 0.012557586610190075,
         'binibigay': 0.001780986119571129,
         'ang': 0.455297660197864,
         'grand': 0.0031788238005011236,
         'kasi': 0.08993142510132313,
         'namin': 0.1607123393511216,
         'ung': 0.07713017125254618,
         'binayaran': 0.014035918052611535,
         'medical': 0.05374535398895769,
   

In [23]:
# Words that appear more often in LENDER Msgs than in allMsgs
#! -> Can be used to build a classifier for lender msgs
lender_freqs = (
    Counter(dict(lenderMsgs_wordFreqs))
     - Counter(dict(allMsgs_wordFreqs))
     #- Counter(dict(agencyMsgs_wordFreqs))
     #- Counter(dict(employerMsgs_wordFreqs))
)

lender_freqs

Counter({'lending': 3.4306042029661605,
         'hnd': 0.03533514952347665,
         'nmn': 0.007739761033822984,
         'ako': 0.5016996193402732,
         'nglending': 0.022038142631474262,
         'kaso': 0.06257699189040886,
         'ung': 0.03975842561109366,
         'naipon': 0.009062535753257613,
         'dti': 0.0012363935033395425,
         'ang': 0.5840117365846826,
         'gnmit': 0.003673023771912378,
         'pang': 0.03898105574180934,
         'bayad': 0.11408971654244782,
         'din': 0.13898532048607276,
         'pero': 0.07247215054629569,
         'bgay': 0.004900378090725358,
         'month': 0.08471456555167534,
         'lang': 0.05595006847953865,
         'infinity': 0.00270379517519918,
         'manoower': 0.003673023771912378,
         'nag': 0.2217252721062154,
         'refer': 0.12171699673278147,
         'sila': 0.033088342129961645,
         'lico': 0.007346047543824756,
         'company': 0.4538378905374942,
         'nmin': 0.151318127

In [24]:
# Words that appear more often in EMPLOYER Msgs than in allMsgs
#! -> Can be used to build a classifier for employer msgs
employer_freqs = (
    Counter(dict(employerMsgs_wordFreqs))
     - Counter(dict(allMsgs_wordFreqs))
     - Counter(dict(agencyMsgs_wordFreqs))
     - Counter(dict(lenderMsgs_wordFreqs))
)

employer_freqs

Counter({'amo': 1.045591788418678,
         'ngchange': 0.004180708268715135,
         'employer': 2.081306224554266,
         'maperahan': 0.004180708268715135,
         'gnw': 0.004180708268715135,
         'pinapagastos': 0.004180708268715135,
         'prin': 0.0009079969991277921,
         'pahiya': 1.855060618287819e-05,
         'pinadala': 0.00036765663082421237,
         'bhay': 0.004676680968499232,
         'khihiya': 1.855060618287819e-05,
         'rsibo': 1.855060618287819e-05,
         'angkop': 0.004180708268715135,
         'sainyung': 0.004180708268715135,
         'saibang': 0.004180708268715135,
         'ipinakita': 0.002818199071778056,
         'salbahi': 0.004180708268715135,
         'ran': 0.0004774220936012174,
         'alaman': 0.002818199071778056,
         'tan': 0.009327972521651368,
         'eng': 0.006998907340493191,
         'ann': 0.004658130362316352,
         'backout': 0.004758149849384464,
         'salbahis': 0.004180708268715135,
         'ye

In [25]:
# Words that appear more often in allMsgs than in agencyMsgs + lenderMsgs
# -> Words that should be ignored
ignore_freqs = (
    Counter(dict(allMsgs_wordFreqs)) - Counter(dict(agencyMsgs_wordFreqs))  - Counter(dict(lenderMsgs_wordFreqs))
).most_common(100)

ignore_words = {word for (word,x) in ignore_freqs}

ignore_words

{'afraid',
 'ahh',
 'alice',
 'always',
 'brother',
 'bukas',
 'busy',
 'calling',
 'cert',
 'cge',
 'come',
 'daughter',
 'delete',
 'department',
 'dis',
 'dole',
 'done',
 'email',
 'english',
 'fill',
 'follow',
 'form',
 'fren',
 'get',
 'gmail',
 'gnun',
 'hahanapin',
 'hanapin',
 'helping',
 'holiday',
 'how',
 'idea',
 'ill',
 'imployer',
 'info',
 'ipasa',
 'its',
 'knina',
 'kuha',
 'labour',
 'later',
 'link',
 'lnq',
 'location',
 'lumabas',
 'makita',
 'mamaya',
 'maya',
 'mgfile',
 'monday',
 'msg',
 'nasend',
 'next',
 'noted',
 'nothing',
 'number',
 'nyan',
 'off',
 'office',
 'okay',
 'okey',
 'opo',
 'pdos',
 'phone',
 'pic',
 'picturan',
 'province',
 'pti',
 'reply',
 'room',
 'sainyo',
 'saturday',
 'sec',
 'send',
 'sge',
 'shop',
 'sige',
 'sino',
 'slmat',
 'sorry',
 'started',
 'step',
 'submit',
 'sunday',
 'tagalog',
 'thank',
 'thanks',
 'thankyou',
 'tnx',
 'understand',
 'video',
 'wait',
 'welcome',
 'whatsapp',
 'where',
 'yeah',
 'yes',
 'yesterday',
 

## Classifier

### Classification helper functions

In [26]:
def get_keywords(keyword_no):
    agency_words   = {word for (word,_) in agency_freqs.most_common(keyword_no)}
    lender_words   = {word for (word,_) in lender_freqs.most_common(keyword_no)}
    employer_words = {word for (word,_) in employer_freqs.most_common(keyword_no)}
    
    keywords = agency_words.union(lender_words).union(employer_words)
    return keywords

In [27]:
def extract_features(lmsg, keywords):
    features = {}
    for word in keywords:
        features[f'contains({word})'] = (word in lmsg)
    return features

In [28]:
train_ratio   = math.floor( .6 * len(preClassified) )
devtest_ratio = train_ratio + math.floor( .2 * len(preClassified) )
test_ratio    = devtest_ratio + math.floor( .2 * len(preClassified) )

def optimise_classifier(labeled):
    best_classifier = None
    best_accuracy = 0
    
    for n_words in range(10, 110, 10):
        agency_words   = {word for (word,_) in agency_freqs.most_common(n_words)}
        lender_words   = {word for (word,_) in lender_freqs.most_common(n_words)}
        employer_words = {word for (word,_) in employer_freqs.most_common(n_words)}
        keywords = agency_words.union(lender_words).union(employer_words)
    
        featureset = [(extract_features(lmsg, keywords), label) for (lmsg, label) in labeled]
    #    random.shuffle(featureset)

        train_set   = featureset[: train_ratio]
        devtest_set = featureset[train_ratio : devtest_ratio]
        
        classifier = nltk.classify.svm.SV.train(train_set)  
        accuracy = nltk.classify.accuracy(classifier, devtest_set) *100
        print(f"Test Accuracy: { accuracy }%")
        if accuracy > best_accuracy:
            best_classifier = classifier
            best_accuracy = accuracy
        
    
    test_set  = featureset[devtest_ratio :]
    print("---")
    print(f"-> Test Accuracy:   { best_accuracy }%")
    print(f"-> Actual Accuracy: { nltk.classify.accuracy(best_classifier, test_set) *100 }%")
    
    return classifier
    

In [29]:
def create_feature_classifier(labeled, keyword_no):
        
    return classifier

### Train
To train the classifier well, we need to train on a set with an even distribution of messages that should be positive and those that should be negative. Otherwise the model could always guess one thing (e.g. that any message is positive) and get a high accuracy.

In [30]:
# Create a df that has 50% relavant messages (positive_df)
# and 50% non-relavant messages (negative_df)

def create_classification_df(positive_df):
    # negative_df = sample of (all messages - positive_df) with same size as positive_df
    negative_df = pd.concat([preClassified, positive_df]).drop_duplicates(keep=False).sample(len(positive_df))
    # classification_df = positive_df + negative_df
    classification_df = positive_df.append(negative_df)
    return classification_df

In [31]:
KEYWORD_NO = 100
def train_classifier(message_df, label_column = 'isAgency', keywords=None, svm=False):
    # Balance training data
    classification_df = create_classification_df(incoming_agencyMsgs)

    # Label the messages
    labeled_msgs = [tuple(x) for x in classification_df[['lemmatised_message', label_column]].to_numpy()]

    # Extract the message features
    if keywords is None:
        keywords = get_keywords(KEYWORD_NO)
    featureset = [(extract_features(lmsg, keywords), label) for (lmsg, label) in labeled_msgs]

    # Split into train/test sets
    train_ratio   = math.floor( .7 * len(classification_df) )
    devtest_ratio = train_ratio + math.floor( .3 * len(classification_df) )
    train_set   = featureset[: train_ratio]
    devtest_set = featureset[train_ratio : devtest_ratio]
    
    if svm:
        # Train SVM classifier
        classifier = nltk.classify.SklearnClassifier(SVC(kernel='linear', probability=True))
        classifier.train(train_set)
    else:
        # Train Naive Bayes classifier
        classifier = nltk.NaiveBayesClassifier.train(train_set)  
    
    # Test and get accuracy
    accuracy = nltk.classify.accuracy(classifier, devtest_set) *100
    print(f"Test Accuracy: { accuracy }%")
    
    return classifier

#### Agencies

In [32]:
keyword_no = 100
agency_words = {word for (word,_) in agency_freqs.most_common(keyword_no)}

agency_classifier1 = train_classifier(incoming_agencyMsgs, 'isAgency', agency_words)

Test Accuracy: 92.84603421461898%


In [33]:
agency_classifier1.show_most_informative_features(50)

Most Informative Features
        contains(agency) = True             True : False  =    472.2 : 1.0
       contains(angelex) = True             True : False  =     30.3 : 1.0
        contains(agency) = False           False : True   =     27.0 : 1.0
        contains(walang) = True             True : False  =     16.4 : 1.0
         contains(sahod) = True             True : False  =     12.4 : 1.0
          contains(list) = True             True : False  =     12.0 : 1.0
     contains(placewell) = True             True : False  =      9.0 : 1.0
      contains(manpower) = True             True : False  =      8.5 : 1.0
        contains(manila) = True             True : False  =      8.0 : 1.0
        contains(charge) = True             True : False  =      7.8 : 1.0
          contains(west) = True             True : False  =      7.4 : 1.0
        contains(mabawi) = True             True : False  =      7.3 : 1.0
          contains(east) = True             True : False  =      7.2 : 1.0

In [34]:
agency_relevant_words = ['agency', 'agent', 'staff', 'employment', 'charge', 'visa', 'manpower', 'renew', 'international', 'offer', 'processing', 'placement', 'recruitment', 'service', 'apply', 'passport', 'refer', 'agncy', 'salary', 'airport']

In [35]:
agency_relevant_words2 = ['agency', 'angelex', 'recruitment', 'allied', 'staff', 'international', 'binayad', 'employment', 'report', 'makukuha', 'manpower', 'mabawi', 'visa', 'nagbayad', 'nagastos', 'passport', 'placement', 'salary', 'fee', 'gastos']

In [36]:
agency_classifier2 = train_classifier(incoming_agencyMsgs, 'isAgency', agency_relevant_words, svm=False)

Test Accuracy: 99.02799377916018%


#### Lenders

In [37]:
keyword_no = 100
lender_words = {word for (word,_) in lender_freqs.most_common(keyword_no)}

lender_classifier1 = train_classifier(incoming_lenderMsgs, 'isLender', lender_words)

Test Accuracy: 98.9113530326594%


In [38]:
lender_classifier1.show_most_informative_features(50)

Most Informative Features
       contains(nirefer) = True             True : False  =     62.7 : 1.0
          contains(baon) = True             True : False  =     42.9 : 1.0
         contains(refer) = True             True : False  =     26.1 : 1.0
           contains(pjh) = True             True : False  =     25.5 : 1.0
       contains(pamilya) = True             True : False  =     22.5 : 1.0
          contains(tubo) = True             True : False  =     19.7 : 1.0
      contains(interest) = True             True : False  =     16.5 : 1.0
          contains(hoya) = True             True : False  =     15.1 : 1.0
        contains(nittan) = True             True : False  =     14.9 : 1.0
         contains(covid) = True             True : False  =     13.0 : 1.0
       contains(penalty) = True             True : False  =     12.8 : 1.0
       contains(company) = True             True : False  =     12.4 : 1.0
      contains(makaalis) = True             True : False  =     11.2 : 1.0

In [39]:
lender_relevant_words = ['lend', 'lending', 'borrow', 'finance', 'inutangan', 'nirefer', 'umutang', 'borrower', 'nittan', 'interest', 'utang', 'tubo', 'penalty', 'loan', 'inutang', 'money', 'payment', 'cash', 'rich']

In [40]:
lender_classifier2 = train_classifier(incoming_lenderMsgs, 'isLender', lender_relevant_words, svm=False)

Test Accuracy: 99.68895800933126%


In [41]:
#lender_classifier2.show_most_informative_features()

#### Employers

In [42]:
keyword_no = 100
employer_words = {word for (word,_) in employer_freqs.most_common(keyword_no)}

employer_classifier1 = train_classifier(incoming_employerMsgs, 'isEmployer', employer_words)

Test Accuracy: 98.48367029548989%


In [43]:
employer_classifier1.show_most_informative_features(50)

Most Informative Features
      contains(nkahanap) = True             True : False  =     57.3 : 1.0
          contains(tgal) = True             True : False  =     57.3 : 1.0
     contains(nakahanap) = True             True : False  =     44.5 : 1.0
       contains(bumalik) = True             True : False  =     35.4 : 1.0
      contains(landline) = True             True : False  =     31.8 : 1.0
          contains(find) = True             True : False  =     30.8 : 1.0
        contains(travel) = True             True : False  =     30.0 : 1.0
    contains(terminated) = True             True : False  =     28.2 : 1.0
       contains(decided) = True             True : False  =     26.7 : 1.0
           contains(kid) = True             True : False  =     26.7 : 1.0
        contains(extend) = True             True : False  =     24.5 : 1.0
        contains(second) = True             True : False  =     23.3 : 1.0
        contains(signed) = True             True : False  =     22.6 : 1.0

In [44]:
employer_relevant_words = ['employer', 'bawiin', 'household', 'reklmo', 'bumalik', 'terminated', 'bumili', 'nkahanap', 'find', 'asawa', 'declare', 'informed', 'party', 'sleep', 'terminate', 'finished', ' amo ', 'kid', 'nakatira', 'change', 'house', 'leave']

In [45]:
employer_classifier2 = train_classifier(incoming_employerMsgs, 'isEmployer', employer_relevant_words, svm=False)

Test Accuracy: 99.4945567651633%


### Save classifiers

In [46]:
import dill
dill.settings['recurse'] = True  # Includes dependencies when dill-ing

In [47]:
# Generate a function to easily apply a classifier
def createClassificationFunction(classifier, keywords, certainFunc=None):
    def classificationFunc(msg):
        if certainFunc is not None and certainFunc(msg):
            return True, 1
        
        lmsg = stemMsgText(msg)
        features = extract_features(lmsg, keywords)
        dist = classifier.prob_classify(features)
        return dist.max(), dist.prob(dist.max())
    
    return classificationFunc

In [48]:
classifyMsg?

[0;31mSignature:[0m [0mclassifyMsg[0m[0;34m([0m[0mmsg[0m[0;34m:[0m [0mstr[0m[0;34m,[0m [0mclassification_words[0m[0;34m)[0m [0;34m->[0m [0mbool[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      /var/folders/qf/ltmg13k566qg2mqvt21n6k5w0000gn/T/ipykernel_41655/960332022.py
[0;31mType:[0m      function


In [49]:
is_about_agency = createClassificationFunction(
    agency_classifier2,
    agency_relevant_words,
    lambda msg: classifyMsg(msg, AGENCY_WORD_LIST)
)
dill.dump(is_about_agency, open('agency.dill','wb'))

In [50]:
is_about_lender = createClassificationFunction(
    lender_classifier2,
    lender_relevant_words,
    lambda msg: classifyMsg(msg, LENDER_WORD_LIST)
)
dill.dump(is_about_lender, open('lender.dill','wb'))

In [51]:
is_about_employer = createClassificationFunction(
    employer_classifier2,
    employer_relevant_words,
    lambda msg: classifyMsg(msg, EMPLOYER_WORD_LIST)
)
dill.dump(is_about_employer, open('employer.dill','wb'))

In [52]:
msg = "this agency is bad"

classifier = agency_classifier2
keywords = agency_relevant_words

lmsg = stemMsgText(msg)
features = extract_features(lmsg, keywords)
dist = classifier.prob_classify(features)
dist.max(), dist.prob(dist.max())

(True, 0.9964238446377032)

In [53]:
assert False

AssertionError: 

#### Other

In [None]:
msg = "for example pno svhin complaint lending"
lmsg = stemMsg(msg)
classify(lmsg, lender_classifier)

In [None]:
type(lender_classifier)

In [None]:
classification_df = create_classification_df(incoming_agencyMsgs)

train_ratio   = math.floor( .6 * len(classification_df) )
devtest_ratio = train_ratio + math.floor( .2 * len(classification_df) )
test_ratio    = devtest_ratio + math.floor( .2 * len(classification_df) )

labeled_agencies = [tuple(x) for x in classification_df[['lemmatised_message', 'isAgency']].to_numpy()]
agency_classifier = create_feature_classifier(labeled_agencies, 100)
print("---")

Here is an example of a classifier trained on an unevenly distributed training set ( more negative (non-agency) messages than positive (agency) )

It scores a higher accuracy but performs worse

In [None]:
# Here is an example of a classifier trained with an unevenly distributed set
#  ( more negative (non-agency) messages than positive (agency) )
# It scores a higher accuracy but performs worse
labeled_agencies = [tuple(x) for x in preClassified[['lemmatised_message', 'isAgency']].to_numpy()]
bad_agency_classifier = create_feature_classifier(labeled_agencies, 100)
print("---")
#bad_agency_classifier.show_most_informative_features(5)

In [None]:
msg = "good agent"
features = extract_features(msg, get_keywords(100))

print(f"For message \"{msg}\"")

print("\nBad classifier:")
dist = bad_agency_classifier.prob_classify(features)
print(f" True:  {dist.prob(True)}")
print(f" False: {dist.prob(False)}")
print(f" => Result: {dist.max()}")

print("\nBetter classifier:")
dist = agency_classifier.prob_classify(features)
print(f" True:  {dist.prob(True)}")
print(f" False: {dist.prob(False)}")
print(f" => Result: {dist.max()}")

In [None]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k='all')),
                     ('nb', SVC(kernel='linear', probability=True))])
classif = SklearnClassifier(pipeline)

In [None]:
KEYWORD_NO = 100
def train_classifier(message_df, label_column = 'isAgency'):
    # Balance training data
    classification_df = create_classification_df(incoming_agencyMsgs)

    # Label the messages
    labeled_msgs = [tuple(x) for x in classification_df[['lemmatised_message', label_column]].to_numpy()]

    # Extract the message features
    keywords = get_keywords(KEYWORD_NO)
    featureset = [(extract_features(lmsg, keywords), label) for (lmsg, label) in labeled_msgs]

    # Split into train/test sets
    train_ratio   = math.floor( .7 * len(classification_df) )
    devtest_ratio = train_ratio + math.floor( .3 * len(classification_df) )
    train_set   = featureset[: train_ratio]
    devtest_set = featureset[train_ratio : devtest_ratio]
    
    # Train SVM classifier
    classif = SklearnClassifier(pipeline)
    classif.train(train_set)
    
    # Test and get accuracy
    #classifier = nltk.NaiveBayesClassifier.train(train_set)  
    #accuracy = nltk.classify.accuracy(classifier, devtest_set) *100
    #print(f"Test Accuracy: { accuracy }%")
    
    return classif

In [None]:
x = train_classifier(incoming_employerMsgs, 'isEmployer')

In [None]:
msg = "Eto din po sa eE-cash grabe panggigipit sa mga tao...tlgang obligado pa pbyran o ipkaltas sa sahod sa amo mo penalty code ng lapse mo kht bayad kn s lhat ng barcodes mo...pilit pa pbyran ang barcode pinadala sau sa knla lng nman mppnta un...sobrang ginigipit nla ang tao"
lmsg = stemMsg(msg)
classify(lmsg, x)

In [None]:
msg = "for example pno svhin complaint lending"
lmsg = stemMsg(msg)
classify(lmsg, x)