# NLP Solution from Stephen Thomas

In [1]:
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.24.1.


In [3]:
import os
os.getcwd()

'C:\\Users\\w.sun\\Downloads'

In [4]:
df = pd.read_csv("spamraw_train.csv")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        5000 non-null   int64 
 1   sms_text  5000 non-null   object
 2   spam      5000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 117.3+ KB


Unnamed: 0,id,sms_text,spam
0,1,Hope you are having a good week. Just checking in,0
1,2,K..give back my thanks.,0
2,3,Am also doing in cbe only. But have to pay.,0
3,4,"complimentary 4 STAR Ibiza Holiday or £10,000 ...",1
4,5,okmail: Dear Dave this is your final notice to...,1


In [5]:
from sklearn.model_selection import train_test_split

X = df['sms_text']
y = df['spam']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.005, random_state=42)

# Custom Functions for Preprocessing and Feature Engineering

In [6]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import unidecode
import textstat
import string  

lemmer = WordNetLemmatizer()

# Simple preprocessor.
# Input is a single document, as a single string.
# Otuput should be a single document, as a single string.
def my_preprocess(doc):
    
    # Lowercase
    doc = doc.lower()
    
    # Replace URL with URL string
    doc = re.sub(r'http\S+', 'URL', doc)
    
    # Replace AT with AT string
    doc = re.sub(r'@', 'AT', doc)
    
    # Replace all numbers/digits with the string NUM
    doc = re.sub(r'\b\d+\b', 'NUM', doc)
    
    # Lemmatize each word.
    doc = ' '.join([lemmer.lemmatize(w) for w in doc.split()])

    return doc

In [7]:
# These functions will calculate additional features on the document.
# They will be put into the Pipeline, called via the FunctionTransformer() function.
# Each one takes an entier corpus (as a list of documents), and should return
# an array of feature values (one for each document in the corpus).
# These functions can do anything they want; I've made most of them quick
# one-liners Hopefully the names of the functions will make them self explanitory.

def doc_length(corpus):
    return np.array([len(doc) for doc in corpus]).reshape(-1, 1)

def lexicon_count(corpus):
    return np.array([textstat.lexicon_count(doc) for doc in corpus]).reshape(-1, 1)

def _get_punc(doc):
    return len([a for a in doc if a in string.punctuation])

def punc_count(corpus):
    return np.array([_get_punc(doc) for doc in corpus]).reshape(-1, 1)

def _get_caps(doc):
    return sum([1 for a in doc if a.isupper()])

def capital_count(corpus):
    return np.array([_get_caps(doc) for doc in corpus]).reshape(-1, 1)

def num_exclamation_marks(corpus):
    return np.array([doc.count('!') for doc in corpus]).reshape(-1, 1)

def num_question_marks(corpus):
    return np.array([doc.count('?') for doc in corpus]).reshape(-1, 1)

def xxx_pics_count(corpus):
    return np.array(["xxx pics" in doc.lower() for doc in corpus]).reshape(-1, 1)

# See if the document ends with someting like "Love Steve XXX"
def has_lovexxx(corpus):
    return np.array([bool(re.search(r"l[ou]+ve?.{0,10}x{2,5}\.? ?$", doc.lower())) for doc in corpus]).reshape(-1, 1)

def has_url(corpus):
    return np.array([bool(re.search("http", doc.lower())) for doc in corpus]).reshape(-1, 1)

def has_pence(corpus):
    return np.array([bool(re.search("\dp\W", doc.lower())) for doc in corpus]).reshape(-1, 1)

def has_money(corpus):
    return np.array([bool(re.search("[\$£]|\bpence\b|\bdollar\b", doc.lower())) for doc in corpus]).reshape(-1, 1)

def has_sexy_phrase(corpus):
    return np.array([bool(re.search("sexy single|\bfree sexy\b|\bsexy pic\b|\blive sex\b", doc.lower())) for doc in corpus]).reshape(-1, 1)

In [8]:
# To help handle class imbalance, calculate the class weights.

import numpy as np
neg, pos = np.bincount(df['spam'])
total = neg + pos
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Weight for class 0: 0.58
Weight for class 1: 3.71


# Construct the Pipeline

In [9]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
#from sklearn.feature_extraction import stop_words
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import NMF
from sklearn.neural_network import MLPClassifier

# Need to preprocess the stopwords, because scikit learn's TfidfVectorizer
# removes stopwords _after_ preprocessing
#stop_words = [my_preprocess(word) for word in stop_words.ENGLISH_STOP_WORDS]

# This vectorizer will be used to create the BOW features
vectorizer = TfidfVectorizer(preprocessor=my_preprocess, 
                             max_features = 1000, 
                             ngram_range=[1,4],
                             stop_words=None,
                             strip_accents="unicode", 
                             lowercase=False, max_df=0.25, min_df=0.001, use_idf=True)

# This vectorizer will be used to preprocess the text before topic modeling.
# (I _could_ use the same vectorizer as above- but why limit myself?)
vectorizer2 = TfidfVectorizer(preprocessor=my_preprocess, 
                             max_features = 1000, 
                             ngram_range=[1,2],
                             stop_words=None,
                             strip_accents="unicode", 
                             lowercase=False, max_df=0.25, min_df=0.001, use_idf=True)

nmf = NMF(n_components=25, random_state=1, init='nndsvda', solver='mu', alpha=.1, l1_ratio=.5)
rf = RandomForestClassifier(criterion='entropy', random_state=223)
mlp = MLPClassifier(random_state=42, verbose=2, max_iter=200)



feature_processing =  FeatureUnion([ 
    ('bow', Pipeline([('cv', vectorizer), ])),
    ('topics', Pipeline([('cv', vectorizer2), ('nmf', nmf),])),
    ('length', FunctionTransformer(doc_length, validate=False)),
    ('words', FunctionTransformer(lexicon_count, validate=False)),
    ('punc_count', FunctionTransformer(punc_count, validate=False)),
    ('capital_count', FunctionTransformer(capital_count, validate=False)),  
    ('num_exclamation_marks', FunctionTransformer(num_exclamation_marks, validate=False)),  
    ('num_question_marks', FunctionTransformer(num_question_marks, validate=False)),  
    ('xxx_pics_count', FunctionTransformer(xxx_pics_count, validate=False)),  
    ('has_lovexxx', FunctionTransformer(has_lovexxx, validate=False)),  
    ('has_url', FunctionTransformer(has_url, validate=False)),  
    ('has_pence', FunctionTransformer(has_pence, validate=False)),  
    ('has_money', FunctionTransformer(has_money, validate=False)),
    ('has_sexy_phrase', FunctionTransformer(has_sexy_phrase, validate=False)),
])

steps = [('features', feature_processing)]

pipe = Pipeline([('features', feature_processing), ('clf', mlp)])

param_grid = {}

# You - yes you! Manually choose which classifier run you'd like to try.
# In future I'd like to automate this so that both are tried; but for this simple
# Kaggle competition, I'm keeping it simple. You can set this to either:
#
# "RF" - Random Forest
# "MLP" - NN
#
# and then re-run the entire notebook
which_clf = "RF"

if which_clf == "RF":

    steps.append(('clf', rf))

    # I already ran a 4-hour extensive grid; this is not the full set. BTW, the best hyperarms I found are:
    # Best parameter (CV scy_train0.988):
    # {'clf__class_weight': None, 
    # 'clf__n_estimators': 500, 
    # 'features__bow__cv__max_features': 500, 
    # 'features__bow__cv__preprocessor': None, 
    # 'features__bow__cv__use_idf': False, 
    # 'features__topics__cv__stop_words': None, 
    # 'features__topics__nmf__n_components': 300}
    param_grid = {
        'features__bow__cv__preprocessor': [None, my_preprocess],
        'features__bow__cv__max_features': [50, 200, 500, 1000, 5000],
        'features__bow__cv__use_idf': [True, False],
        'features__topics__cv__stop_words': [None],
        'features__topics__nmf__n_components': [25, 75, 150],
        'clf__n_estimators': [100, 500],
        'clf__class_weight': [None, class_weight],
    }
    
elif which_clf == "MLP":
    
    steps.append(('clf', mlp))

    # I already ran a 4-hour extensive grid; this is not the full set. BTW, the best hyperarms I found are:
    # Best parameter (CV scy_train0.991): 
    # {'clf__hidden_layer_sizes': (25, 25, 25), 
    # 'features__bow__cv__max_features': 3000, 
    # 'features__bow__cv__min_df': 0, 
    # 'features__bow__cv__preprocessor': <function my_preprocess at 0x0000024801E161E0>, 
    # 'features__bow__cv__use_idf': False, 
    # 'features__topics__nmf__n_components': 300}
    param_grid = {
        'features__bow__cv__preprocessor': [my_preprocess],
        'features__bow__cv__max_features': [1000, 3000],
        'features__bow__cv__min_df': [0],
        'features__bow__cv__use_idf': [False],
        'features__topics__nmf__n_components': [300],
        'clf__hidden_layer_sizes': [(100, ), (50, 50), (25, 25, 25)],
    }

pipe = Pipeline(steps)

search = GridSearchCV(pipe, param_grid, cv=3, n_jobs=3, scoring='f1_micro', return_train_score=True, verbose=2)

# Fit Model

It's showtime, baby.

In [10]:
search = search.fit(X, y)

Fitting 3 folds for each of 240 candidates, totalling 720 fits


In [11]:
print("Best parameter (CV scy_train%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV scy_train0.988):
{'clf__class_weight': None, 'clf__n_estimators': 100, 'features__bow__cv__max_features': 1000, 'features__bow__cv__preprocessor': None, 'features__bow__cv__use_idf': False, 'features__topics__cv__stop_words': None, 'features__topics__nmf__n_components': 150}


In [12]:
# Print out the results of hyperparmater tuning

def cv_results_to_df(cv_results):
    results = pd.DataFrame(list(cv_results['params']))
    results['mean_fit_time'] = cv_results['mean_fit_time']
    results['mean_score_time'] = cv_results['mean_score_time']
    results['mean_train_score'] = cv_results['mean_train_score']
    results['std_train_score'] = cv_results['std_train_score']
    results['mean_test_score'] = cv_results['mean_test_score']
    results['std_test_score'] = cv_results['std_test_score']
    results['rank_test_score'] = cv_results['rank_test_score']

    results = results.sort_values(['mean_test_score'], ascending=False)
    return results

results = cv_results_to_df(search.cv_results_)
results
#results.to_csv('results2.csv', index=False)

Unnamed: 0,clf__class_weight,clf__n_estimators,features__bow__cv__max_features,features__bow__cv__preprocessor,features__bow__cv__use_idf,features__topics__cv__stop_words,features__topics__nmf__n_components,mean_fit_time,mean_score_time,mean_train_score,std_train_score,mean_test_score,std_test_score,rank_test_score
41,,100,1000,,False,,150,24.535563,2.137078,0.9999,0.000141,0.988200,0.001418,1
95,,500,500,<function my_preprocess at 0x000001A69E265E50>,False,,150,26.523727,1.858391,1.0000,0.000000,0.987400,0.002137,2
98,,500,1000,,True,,150,27.745744,1.694011,1.0000,0.000000,0.987400,0.001769,3
101,,500,1000,,False,,150,25.625283,1.594532,1.0000,0.000000,0.987400,0.001769,3
37,,100,1000,,True,,75,7.890456,2.593838,0.9999,0.000141,0.987200,0.000748,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48,,100,5000,,True,,25,4.883458,0.921526,0.9993,0.000283,0.980599,0.002214,236
3,,100,50,,False,,25,4.932565,0.762892,0.9985,0.000245,0.980400,0.001725,237
231,"{0: 0.5777675063554426, 1: 3.714710252600297}",500,5000,,False,,25,18.535887,2.292993,0.9993,0.000374,0.980399,0.002998,238
171,"{0: 0.5777675063554426, 1: 3.714710252600297}",100,5000,,False,,25,16.825491,3.169034,0.9992,0.000141,0.980199,0.004191,239


# Estimate Model Performance on Val Data

In [13]:
# Because we are using a pipeline and a GridSearchCV, things are a bit complicated.
# I want to get references to the objects from the pipeline with the *best* hyperparameter settings,
# so that I can explore those objects (later). 
# The code below is a bit ugly, but after reading throught the docs of Pipeline, 
# I believe this is the only way to do it.

# The pipeline with the best performance
pipeline = search.best_estimator_

# Get the feature processing pipeline, so I can use it later
feature_processing_obj = pipeline.named_steps['features']

# Find the vectorizer objects, the NMF objects, and the classifier objects
pipevect= dict(pipeline.named_steps['features'].transformer_list)
vectorizer_obj = pipevect.get('bow').named_steps['cv']
vectorizer_obj2 = pipevect.get('topics').named_steps['cv']
nmf_obj = pipevect.get('topics').named_steps['nmf']
clf_obj = pipeline.named_steps['clf']

# Sanity check - what was vocabSize set to? Should match the output here.
len(vectorizer_obj.get_feature_names())

1000

In [17]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, f1_score

features_val = feature_processing_obj.transform(X_val).todense()

pred_val = search.predict(X_val)

print("Confusion matrix:")
print(confusion_matrix(y_val, pred_val))

print("\nF1 Score = {:.5f}".format(f1_score(y_val, pred_val, average='macro')))

print("\nClassification Report:")
print(classification_report(y_val, pred_val))

Confusion matrix:
[[20  0]
 [ 0  5]]

F1 Score = 1.00000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      1.00      1.00         5

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25



# Train model on all data

In [15]:
#search.fit(X, y)

# Estimate Performance on Test
Data

In [18]:
test_df = pd.read_csv('spamraw_test.csv')

features_test = feature_processing_obj.transform(test_df['sms_text']).todense()
pred_test = search.predict(test_df['sms_text'])

# Output the predictions to a file to upload to Kaggle.
# Uncomment to actually create the file
#my_submission = pd.DataFrame({'id': test_df.id, 'predicted': pred_test})
#my_submission.to_csv('steve_submission.csv', index=False)

solutions_df = pd.read_csv('sms-spam/spamraw_test_solutions.csv')
y_test = solutions_df['spam']

print("Confusion matrix:")
print(confusion_matrix(y_test, pred_test))

print("\nF1 Score = {:.5f}".format(f1_score(y_test, pred_test, average="macro")))

print("\nClassification Report:")
print(classification_report(y_test, pred_test))

FileNotFoundError: [Errno 2] File sms-spam/spamraw_test_solutions.csv does not exist: 'sms-spam/spamraw_test_solutions.csv'

# Explore the Model Further

The path to enlightment begins by understanding what our model learned.

## Print Topics

Print the top words for each of the NMF topics

In [None]:
n_top_words = 10
def get_top_words(H, feature_names):
    output = []
    for topic_idx, topic in enumerate(H):
        top_words = [(feature_names[i]) for i in topic.argsort()[:-n_top_words - 1:-1]]
        output.append(top_words)
        
    return pd.DataFrame(output) 

top_words = get_top_words(nmf_obj.components_, vectorizer_obj2.get_feature_names())
top_words

## Print Feature Importances

Note: this section will only work with models that have `.feature_importances_`, such as RF and DT.

In [None]:
topic_feature_names = ["topic {}".format(i) for i in range(nmf_obj.n_components_)]

stat_feature_names = [t[0] for t in pipeline.named_steps['features'].transformer_list if t[0] not in ['topics', 'bow']]

feature_names = vectorizer_obj.get_feature_names() + topic_feature_names + stat_feature_names
len(feature_names)

feature_importances = None
if hasattr(clf_obj, 'feature_importances_'):
    feature_importances = clf_obj.feature_importances_

In [None]:
features_train = feature_processing_obj.transform(X_train).todense()

if feature_importances is None:
    print("No Feature importances! Skipping.")
else:
    N = features_train.shape[1]

    ssum = np.zeros(N)
    avg = np.zeros(N)
    avg_spam = np.zeros(N)
    avg_ham = np.zeros(N)
    for i in range(N):
        ssum[i] = sum(features_train[:, i]).reshape(-1, 1)
        avg[i] = np.mean(features_train[:, i]).reshape(-1, 1)
        avg_spam[i] = np.mean(features_train[y_train==1, i]).reshape(-1, 1)
        avg_ham[i] = np.mean(features_train[y_train==0, i]).reshape(-1, 1)

    rf = search.best_estimator_
    imp = pd.DataFrame(data={'feature': feature_names, 'imp': feature_importances, 'sum': ssum, 'avg': avg, 'avg_ham': avg_ham, 'avg_spam': avg_spam})
    imp = imp.sort_values(by='imp', ascending=False)
    imp.head(20)
    imp.tail(10)
    #imp.to_csv('importances.csv', index=False)

# Further explanation on Val Data

This cool package will explain all the predictions of a tree-based model. I'll have it explain all predictions that were incorrect, to see what is going on (and hopefully inform some additional feature engineering or cleaning steps).

Note: this only works on tree-based models, like RF. This cell will crash when using, e.g., MLPClassifier

In [None]:
if feature_importances is None:
    print("No Feature importances! Skipping.")
else:

    from treeinterpreter import treeinterpreter as ti

    prediction, bias, contributions = ti.predict(clf_obj, features_val)

    for i in range(len(features_val)):
        if y_val.iloc[i] == pred_val[i]:
            continue
        print("Instance {}".format(i))
        X_val.iloc[i]
        print("Bias (trainset mean) {}".format(bias[i]))
        print("Truth {}".format(y_val.iloc[i]))
        print("Prediction {}".format(prediction[i, :]))
        print("Feature contributions:")
        con = pd.DataFrame(data={'feature': feature_names, 
                                 'value': features_val[i].A1,
                                 'legit contr': contributions[i][:, 0],
                                 'spam contr': contributions[i][:, 1],
                                 'abs contr': abs(contributions[i][:, 1])})

        con = con.sort_values(by="abs contr", ascending=False)
        con['spam cumulative'] = con['spam contr'].cumsum() + bias[i][1]
        con.head(30)
        print("-"*20) 

In [None]:
if  feature_importances is None:
    print("No Feature importances! Skipping.")
else:

    from treeinterpreter import treeinterpreter as ti

    prediction, bias, contributions = ti.predict(clf_obj, features_test)

    for i in range(len(features_test)):
        if y_test[i] == pred_test[i]:
            continue
        print("Instance {}".format(i))
        test_df.iloc[i, :].sms_text
        print("Bias (trainset mean) {}".format(bias[i]))
        print("Truth {}".format(y_test[i]))
        print("Prediction {}".format(prediction[i, :]))
        print("Feature contributions:")
        con = pd.DataFrame(data={'feature': feature_names,
                                 'value': features_test[i].A1,
                                 'legit contr': contributions[i][:, 0],
                                 'spam contr': contributions[i][:, 1],
                                 'abs contr': abs(contributions[i][:, 1])})
        con = con.sort_values(by="abs contr", ascending=False)
        con['spam cumulative'] = con['spam contr'].cumsum() + bias[i][1]
        con.head(30)
        print("-"*20) 