In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import unidecode
import re
from spacy.cli import download
import spacy
from textblob import TextBlob
import nltk
import contractions

spacy.cli.download("en_core_web_sm")
nltk.download('stopwords')

from nltk.corpus import stopwords

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m49.8 MB/s[0m  [33m0:00:00[0meta [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package stopwords to /home/tguyot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Preprocessing Pipeline is :
#  1. Lowercase
# 2. Remove accents
# 3. Expand contractions
# 4. Remove punctuation
# 5. Remove numbers
# 6. Correct typos
# 7. Remove repeating characters
# 8. Lemmatization
# 9. Remove stopwords
# 10. Remove empty lines
df = pd.read_csv('spam.csv', encoding='latin1')


In [3]:
df = pd.read_csv('spam.csv', encoding='latin1')

# Clean up Unnamed columns and change column names for clarity
df = df.assign(
    is_spam=lambda x: 1
)
df.loc[df.v1 == 'ham', 'is_spam'] = 0
df = df[['is_spam', 'v2']]
df.columns = ['is_spam', 'sms']
df

Unnamed: 0,is_spam,sms
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [4]:
def get_corpus(df):
    return ' '.join(' '.join(df.sms.tolist()).split())

def count_unique(corpus):
    return len(set(corpus.split()))

corpus = get_corpus(df)
initial = count_unique(corpus)
print(f'Initial unique words: {initial}')

Initial unique words: 15585


In [5]:
# Data cleaning

# Uniform capitalization
df_clean = df.copy()
df_clean.sms = df_clean.sms.str.lower()

# Replace accents
df_clean.sms = df_clean.sms.apply(lambda x: unidecode.unidecode(x))

# Remove punctuation
df_clean.sms = df_clean.sms.apply(lambda x: re.sub(r'[^\w\s]', '', x))
df_clean.sms = df_clean.sms.apply(lambda x: re.sub(r'_', '', x))

# Remove numbers
df_clean.sms = df_clean.sms.apply(lambda x: re.sub(r'\d', '', x))

print(f'Unique words after first cleaning (uniform capitalization, no accents, no punctuation, no numbers): {count_unique(get_corpus(df_clean))}')

Unique words after first cleaning (uniform capitalization, no accents, no punctuation, no numbers): 8610


In [6]:
# Correct typos
def correct_typos(text):
    return str(TextBlob(text).correct())

df_clean.sms = df_clean.sms.apply(lambda x: correct_typos(x))
print(f'Unique words after typo correction: {count_unique(get_corpus(df_clean))}')


Unique words after typo correction: 6437


In [7]:
# Remove repeating characters

def remove_repeating_characters(text):
    return re.sub(r'(.)\1{2,}', r'\1\1', text)

df_clean.sms = df_clean.sms.apply(lambda x: remove_repeating_characters(x))
print(f'Unique words after removing repeating characters: {count_unique(get_corpus(df_clean))}')

Unique words after removing repeating characters: 6424


In [8]:
# Lemmatize

# Need to download en_core_web_sm model if not already done
spacy.cli.download("en_core_web_sm")

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m27.2 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [9]:
nlp = spacy.load("en_core_web_sm")
def to_lemma(text):
    doc = nlp(str(text))
    lemmatized_version = ''
    for token in doc:
        lemmatized_version += token.lemma_ + ' '
    return lemmatized_version

df_clean.sms = df_clean.sms.apply(
    lambda x: to_lemma(x)
)

print(f'Unique words after lemmatization: {count_unique(get_corpus(df_clean))}')

Unique words after lemmatization: 5421


In [10]:
# Contractions fix (don't into do not etc...)
df_clean.sms = df_clean.sms.apply(lambda x: contractions.fix(x))

print(f'Unique words after expanding contractions: {count_unique(get_corpus(df_clean))}')

Unique words after expanding contractions: 5416


In [11]:
# Remove "stop words": words not bringing information, in same high frequency everywhere

stop_words = set(stopwords.words('english'))
df_clean.sms = df_clean.sms.apply(lambda x: ' '.join([char for char in x.split() if char not in stop_words]))
print(f'Unique words after removing stop words: {count_unique(get_corpus(df_clean))}')

Unique words after removing stop words: 5277


In [12]:
# Remove blank messages

df_clean = df_clean.loc[df_clean.sms.map(len) > 0]

In [13]:
df_clean.to_csv('cleaned_spam.csv')

In [14]:
print(f'Final unique words after removing blank messages: {count_unique(get_corpus(df_clean))} for a corpus of {len(get_corpus(df_clean).split())} words. Percentage of uniqueness: {count_unique(get_corpus(df_clean))/len(get_corpus(df_clean).split()):.2%}')

Final unique words after removing blank messages: 5277 for a corpus of 47189 words. Percentage of uniqueness: 11.18%


In [15]:
# We managed to divide by 3 (15587 -> 5277) the amount of unique words. This will help our model to generalize better.

In [40]:
# Show a bunch of unique words

corpus2 = [get_corpus(df_clean)[i:i+200] for i in range(0, 2000, 200)]
corpus2

# We notice that still plenty of words are typos, rare words, slang... We could try to use a lookup table, or a more advanced spell checker, but for now this will do.

['go point crazy available boris n great world la e buffet line get wat ok war joke free entry wily come win cup final st may text receive entry question txt rate apply dun say early c already say ah I ',
 'think go live around though freemen hey darle week word back like fun still ok xx st cog send rev even brother like speak I treat I like aids patent per request selle selle minnaminunginte nurungu ves',
 'ta set callertune caller press copy friend callertune winner value network customer select received prize reward claim call claim code valid hour mobile month r entitle update late colour mobile camer',
 'a free call mobile update co free donna home soon I want talk stuff anymore tonight k give cry enough today six chance win cash pound txt cash send cost day day stand apply reply urgent win week free ',
 'membership prize jackson txt word claim wwdbuknet lccltd pobox ldnwarw give search right word thank breathe I promise I take help grant fulfil promise wonderful blessing time I d