In [None]:
import pandas as pd
import numpy as np
import pickle
import os
import keras
import mailparser
import glob
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from nltk.stem.snowball import *
import re

Function that makes prediction based on a given text string

In [274]:
def classify_single_string(text1):
    cleantext = BeautifulSoup(text1, "lxml").text
    cleantext = re.sub(r'CORE_BUSINESS', "", cleantext)
    cleantext = re.sub(r'CLOSE_PERSONAL', "", cleantext)
    cleantext = re.sub(r'_.*', "", cleantext)
    cleantext = re.sub(r'\W+', ' ', cleantext)
    cleantext = re.sub(r'\d+', ' ', cleantext)
    cleantext = [i.lower() for i in word_tokenize(cleantext) if len(i) > 1]
    filtered_cleantext = " ".join([stemmer(w) for w in cleantext if not w in stop_words])
    single_string_df = pd.DataFrame({'text': [filtered_cleantext]})
    X_single_test = transformer.transform(single_string_df.text)
    model_prediction = model.predict_proba(X_single_test)
    model_predicted_class = np.where(np.argmax(model_prediction[0]) == 1, 'business', 'personal').item()
    business_score = np.round(model_prediction[0][1], 2)
    personal_score = np.round(model_prediction[0][0], 2)    
    result = 'Business score is {}\npersonal score is {}'.format(business_score, personal_score)
    print ('{} - predicted class is {}\n\n{}\n'.format(text1, model_predicted_class, result))

Function that makes prediction based on a given folder containing files with emails and returns dataframe with prediction results

In [275]:
def classify_files_from_folder(files_path):
    raw_data = []
    for text_file in glob.glob(files_path):
        mail = mailparser.parse_from_file(text_file)
        text1 = mail.body
        cleantext = BeautifulSoup(text1, "lxml").text
        cleantext = re.sub(r'CORE_BUSINESS', "", cleantext)
        cleantext = re.sub(r'CLOSE_PERSONAL', "", cleantext)
        cleantext = re.sub(r'_.*', "", cleantext)
        cleantext = re.sub(r'\W+', ' ', cleantext)
        cleantext = re.sub(r'\d+', ' ', cleantext)
        cleantext = [i.lower() for i in word_tokenize(cleantext) if len(i) > 1]
        filtered_cleantext = " ".join([stemmer(w) for w in cleantext if not w in stop_words])
        raw_data.append({'filename': os.path.basename(text_file), 'cleaned_text': filtered_cleantext})

    X_data_full = pd.DataFrame(raw_data)    
    X_features = transformer.transform(X_data_full.cleaned_text)
    predicted_data = model.predict(X_features)
        
    predicted_business_prob = predicted_data[:, 1]
    predicted_personal_prob = predicted_data[:, 0]    
    predicted_values = np.argmax(predicted_data, 1)
    predicted_class_label = np.where(predicted_values == 1, 'business', 'personal')
    
    X_data_full = X_data_full.assign(business_prob = predicted_business_prob, 
                                     personal_prob = predicted_personal_prob, 
                                     class_label = predicted_class_label)
    
    return X_data_full[['filename', 'class_label', 'business_prob', 'personal_prob', 'cleaned_text']]

In [271]:
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english').stem
transformer = pickle.load(open('transformer_model.sav', 'rb'))
model = keras.models.load_model('my_model.h5')

You can specify path to folder with test emails here:

In [282]:

test_files_path = '/insert_your_path_here/*'

You can input your strings here:

In [283]:
new_text_1 = 'Excuse me I will be late my revenue is down'
new_text_2 = 'Hi bro what is about a cup of beer tonight?'


In [284]:
classify_single_string(new_text_1)
classify_single_string(new_text_2)


Excuse me I will be late my revenue is down - predicted class is business

Business score is 0.9800000190734863
personal score is 0.019999999552965164

Hi bro what is about a cup of beer tonight? - predicted class is personal

Business score is 0.009999999776482582
personal score is 0.9900000095367432



In [277]:
test_df = classify_files_from_folder(test_files_path)
test_df

Unnamed: 0,filename,class_label,business_prob,personal_prob,cleaned_text
0,maildir_allen-p_inbox_84,business,0.999561,0.000439,purpos acceler distribut psa singl sum distrib...
1,10,personal,0.014198,0.985802,rick offic mail address mjacobson fce com send...
2,maildir_allen-p_inbox_23,business,0.998114,0.001886,three deal fax let know interest thank jeff sm...
3,14,personal,0.018104,0.981896,phone work day great see rick ami ben jonathan...
4,8,personal,0.033977,0.966023,entrust forward anyon origin messag todd mrien...


In [None]:
test_df.to_csv('/insert_your_path_here/text_class_result.csv', index=False)
