# GET THE DATA AND IMPORT LIBRARIES

In [67]:
#this needs to be downloaded
# import nltk
# nltk.download('wordnet')

import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
email_subjects = pd.read_csv('email_subjects.csv')

In [5]:
email_subjects.head(2)

Unnamed: 0,Subject,Receipt
0,NYE Party 2020 @ Navigator Taproom 🎉,0
1,NYE Party 2020 @ Navigator Taproom 🎉,0


# TEXT PREPROCESSING

In [31]:
#convert all text to lowercase
email_subjects['Subject'] = [i.lower() for i  in email_subjects['Subject']]
email_subjects.head()

Unnamed: 0,Subject,Receipt
0,nye party 2020 navigator taproom,0
1,nye party 2020 navigator taproom,0
2,10 off all items at qxydumplings come celebrat...,0
3,navigator taproom holiday egift cards,0
4,we miss you come back for a special offer,0


In [32]:
#remove any special characters (non-letter, non-numeric, non-whitespace)
email_subjects['Subject'] = [re.sub('[^A-Za-z0-9 ]', '', i) for i in email_subjects['Subject']]
email_subjects.head()

Unnamed: 0,Subject,Receipt
0,nye party 2020 navigator taproom,0
1,nye party 2020 navigator taproom,0
2,10 off all items at qxydumplings come celebrat...,0
3,navigator taproom holiday egift cards,0
4,we miss you come back for a special offer,0


In [39]:
#lemmatize the words (simplify them to base forms)
lemma = WordNetLemmatizer()

#wordnetlemmatizer is not great at '--ed' words. need to find alternative. example below
# print(lemma.lemmatize('missed'))

email_subjects['Subject'] = [lemma.lemmatize(i) for i in email_subjects['Subject']]
email_subjects.head()

Unnamed: 0,Subject,Receipt
0,nye party 2020 navigator taproom,0
1,nye party 2020 navigator taproom,0
2,10 off all items at qxydumplings come celebrat...,0
3,navigator taproom holiday egift cards,0
4,we miss you come back for a special offer,0


In [57]:
#get the tfidf (term frequency, inverse document frequency)
tfidf = TfidfVectorizer()
data = tfidf.fit_transform(email_subjects['Subject']).toarray()

In [58]:
#confirm we have an array
#each row is an email, each index is if it contains a word
data

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.25705129, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.45387686, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.30190139, 0.        ,
        0.        ],
       [0.        , 0.        , 0.43098348, ..., 0.        , 0.        ,
        0.        ]])

In [64]:
x_train, x_test, y_train, y_test = train_test_split(data, email_subjects['Receipt'], test_size=0.2, random_state=0)

# MODELING

### RANDOM FOREST

In [69]:
#random forest model
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(x_train, y_train) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [75]:
#this is our prediction on our training set using the model above
y_pred = classifier.predict(x_test)
y_pred

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# EVALUATION

In [74]:
#evaluation metrics on the model using y_pred
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

#overall pretty good, but very small dataset

[[11  0]
 [ 0  1]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        11
          1       1.00      1.00      1.00         1

avg / total       1.00      1.00      1.00        12

1.0
