In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import unidecode
import re
from spacy.cli import download
import spacy
import nltk
import contractions

spacy.cli.download("en_core_web_sm")
nltk.download('stopwords')

from nltk.corpus import stopwords

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m23.6 MB/s[0m  [33m0:00:00[0meta [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package stopwords to /home/tguyot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('spam.csv', encoding='latin1')

# Clean up Unnamed columns and change column names for clarity
df = df.assign(
    is_spam=lambda x: 0
)
df.loc[df.v1 == 'ham', 'is_spam'] = 1
df = df[['is_spam', 'v2']]
df.columns = ['is_spam', 'sms']


In [36]:
def get_corpus(df):
    return ' '.join(' '.join(df.sms.tolist()).split())

def count_unique(corpus):
    return len(set(corpus.split()))

corpus = get_corpus(df)
initial = count_unique(corpus)
print(f'Initial unique words: {initial}')

Initial unique words: 15585


In [37]:
# Data cleaning

    # Remplacer toutes les majuscules par des minuscules
    # Retirer les accents (étape très utile pour les textes français)
    # Retirer la ponctuation
    # Retirer les nombres (seulement si ceux-ci n’apportent pas d’informations pour l’analyse qui suivra !!! Dans notre problème binaire, les nombres semblent apporter de l’information (comme les numéros de téléphone); on va les remplacer par “phonenumber”).
    # Lemmatizer
    # Enlever les “stop words”
    # Supprimer les lignes vides 

# Uniform capitalization

df_clean = df.copy()

df_clean.sms = df_clean.sms.str.lower()

# Replace accents
df_clean.sms = df_clean.sms.apply(lambda x: unidecode.unidecode(x))

# Remove punctuation
df_clean.sms = df_clean.sms.apply(lambda x: re.sub(r'[^\w\s]', '', x))
df_clean.sms = df_clean.sms.apply(lambda x: re.sub(r'_', '', x))

# Remove numbers
df_clean.sms = df_clean.sms.apply(lambda x: re.sub(r'\d', '', x))

print(f'Unique words after first cleaning (uniform capitalization, no accents, no punctuation, no numbers): {count_unique(get_corpus(df_clean))}')

Unique words after first cleaning (uniform capitalization, no accents, no punctuation, no numbers): 8610


In [38]:
# Lemmatize

# Need to download en_core_web_sm model if not already done
spacy.cli.download("en_core_web_sm")

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m27.1 MB/s[0m  [33m0:00:00[0meta [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Unique words after lemmatization: 7491


In [None]:
nlp = spacy.load("en_core_web_sm")
def to_lemma(text):
    doc = nlp(str(text))
    lemmatized_version = ''
    for token in doc:
        lemmatized_version += token.lemma_ + ' '
    return lemmatized_version

df_clean.sms = df_clean.sms.apply(
    lambda x: to_lemma(x)
)

print(f'Unique words after lemmatization: {count_unique(get_corpus(df_clean))}')

In [42]:
# Contractions fix (don't into do not etc...)
df_clean.sms = df_clean.sms.apply(lambda x: contractions.fix(x))

print(f'Unique words after expanding contractions: {count_unique(get_corpus(df_clean))}')

Unique words after expanding contractions: 7361


In [53]:
# Remove "stop words": words not bringing information, in same high frequency everywhere

stop_words = set(stopwords.words('english'))
df_clean.sms = df_clean.sms.apply(lambda x: ' '.join([char for char in x.split() if char not in stop_words]))
print(f'Unique words after removing stop words: {count_unique(get_corpus(df_clean))}')

Unique words after removing stop words: 7339


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean.sms = df_clean.sms.apply(lambda x: ' '.join([char for char in x.split() if char not in stop_words]))


In [54]:
# Remove blank messages

df_clean = df_clean.loc[df_clean.sms.map(len) > 0]

In [55]:
df_clean.to_csv('cleaned_spam.csv')

In [56]:
print(f'Final unique words after removing blank messages: {count_unique(get_corpus(df_clean))} for a corpus of {len(get_corpus(df_clean).split())} words. Percentage of uniqueness: {count_unique(get_corpus(df_clean))/len(get_corpus(df_clean).split()):.2%}')

Final unique words after removing blank messages: 7339 for a corpus of 50152 words. Percentage of uniqueness: 14.63%
