## Importation des packages

In [2]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords

## Lecture des données

### 1ére méthode avec manipulation des string et listes puis transformer en df

In [3]:
my_data = open('Data/SMSSpamCollection.txt').read()

In [4]:
print(f"type des données : {type(my_data)} et taille : {len(my_data)}")


type des données : <class 'str'> et taille : 477203


In [5]:
my_data = my_data.replace("\t","\n")
my_data = my_data.split("\n")

In [6]:
# on a une liste de str qui contient: 'label','mail'
print(type(my_data),len(my_data))

<class 'list'> 11149


In [7]:
# séparé les labels et les mails en deux listes 
list_label = my_data[0::2]
list_mail = my_data[1::2]

In [8]:
print(len(list_mail))
print(len(list_label))

5574
5575


In [9]:
del list_label[-1]

In [10]:
print(len(list_mail))
print(len(list_label))

print(list_mail[0:5])
print(list_label[0:5])

5574
5574
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though"]
['ham', 'ham', 'spam', 'ham', 'ham']


In [11]:
pd.set_option('display.max_colwidth', 100)

df = pd.DataFrame(
    {
        'label' : list_label,
        'mail': list_mail
    }
)

In [12]:
df

Unnamed: 0,label,mail
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other suggestions?"
5572,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week ...


### 2éme méthode avec df directement

In [13]:
df = pd.read_csv('./Data/SMSSpamCollection.txt', sep='\t', header=None)
df.columns = ['label', 'mail']
df

Unnamed: 0,label,mail
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other suggestions?"
5570,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week ...


In [14]:
print("Nombre de lignes totales : {}\nNombre de mail spam : {}\nNombre de mail ham : {}".format(len(df),
                                                                                                len(df[df['label']=='spam']),
                                                                                                len(df[df['label']=='ham'])))

Nombre de lignes totales : 5572
Nombre de mail spam : 747
Nombre de mail ham : 4825


In [15]:
# vérification à faire 
print("Nombre de label vide (avec une valeur nulle) : {}".format(df['label'].isnull().sum()))
print("Nombre de mail vide (avec une valeur nulle) : {}".format(df['mail'].isnull().sum()))

Nombre de label vide (avec une valeur nulle) : 0
Nombre de mail vide (avec une valeur nulle) : 0


## Nettoyage de données

### 1.remove punctuation

In [16]:
def remove_punctuation(texte):
    data_clean = "".join([ch for ch in texte if ch not in string.punctuation])
    return data_clean


df['mail_without_punct'] = df['mail'].apply(lambda x: remove_punctuation(x))
df.head()

Unnamed: 0,label,mail,mail_without_punct
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though


### 2.tokenization

In [17]:
# tokenizer les mail
# \W+ matches any Non-alphanumeric character. Non-alphanumeric means no letter, digit, and underscore.
def tokeninze(texte):
    data_tokenized = re.split("\W+",texte)
    return data_tokenized

df['mail_tokenized'] = df['mail_without_punct'].apply(lambda x: tokeninze(x))
df.head()

Unnamed: 0,label,mail,mail_without_punct,mail_tokenized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[Go, until, jurong, point, crazy, Available, only, in, bugis, n, great, world, la, e, buffet, Ci..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[Free, entry, in, 2, a, wkly, comp, to, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, to..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[Nah, I, dont, think, he, goes, to, usf, he, lives, around, here, though]"


### 3.remove stop words

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/yssf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
en_stop_words = set(stopwords.words('english'))
print(sorted(list(en_stop_words)))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [20]:
def remove_stop_word(texte):
    data_clean = [word for word in texte if word not in en_stop_words]
    return data_clean


df['mail_without_stops'] = df['mail_tokenized'].apply(lambda x: remove_stop_word(x))
df.head()

Unnamed: 0,label,mail,mail_without_punct,mail_tokenized,mail_without_stops
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[Go, until, jurong, point, crazy, Available, only, in, bugis, n, great, world, la, e, buffet, Ci...","[Go, jurong, point, crazy, Available, bugis, n, great, world, la, e, buffet, Cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[Free, entry, in, 2, a, wkly, comp, to, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, to...","[Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, then, say]","[U, dun, say, early, hor, U, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[Nah, I, dont, think, he, goes, to, usf, he, lives, around, here, though]","[Nah, I, dont, think, goes, usf, lives, around, though]"


In [21]:
# Regrouppant le tout dans une seule fonction
def clean_mail(texte):
    data_without_punct = "".join([ch for ch in texte if ch not in string.punctuation])
    data_tokenized = re.split("\W+", data_without_punct)
    data_without_stops = [word for word in data_tokenized if word not in en_stop_words]
    result = data_without_stops
    return result

df['mail_clean'] = df['mail'].apply(lambda x: clean_mail(x.lower()))
df.head()

Unnamed: 0,label,mail,mail_without_punct,mail_tokenized,mail_without_stops,mail_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[Go, until, jurong, point, crazy, Available, only, in, bugis, n, great, world, la, e, buffet, Ci...","[Go, jurong, point, crazy, Available, bugis, n, great, world, la, e, buffet, Cine, got, amore, wat]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[Free, entry, in, 2, a, wkly, comp, to, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, to...","[Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, then, say]","[U, dun, say, early, hor, U, c, already, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[Nah, I, dont, think, he, goes, to, usf, he, lives, around, here, though]","[Nah, I, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goes, usf, lives, around, though]"


In [22]:
res = df[['label','mail_clean']];
res

Unnamed: 0,label,mail_clean
0,ham,"[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"[nah, dont, think, goes, usf, lives, around, though]"
...,...,...
5567,spam,"[2nd, time, tried, 2, contact, u, u, 750, pound, prize, 2, claim, easy, call, 087187272008, now1..."
5568,ham,"[ü, b, going, esplanade, fr, home]"
5569,ham,"[pity, mood, soany, suggestions]"
5570,ham,"[guy, bitching, acted, like, id, interested, buying, something, else, next, week, gave, us, free]"


### 4.stematisation

In [23]:
df = df.drop('mail_tokenized', axis=1)
df = df.drop('mail_without_stops', axis=1)
df = df.drop('mail_without_punct', axis=1)
df.head()

Unnamed: 0,label,mail,mail_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]"


In [24]:
ps = nltk.PorterStemmer()

def stem_mail(words):
    result = [ps.stem(word) for word in words]
    return result

df['mail_stemmed'] = df['mail_clean'].apply(lambda x: stem_mail(x))
df.head()

Unnamed: 0,label,mail,mail_clean,mail_stemmed
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"


### 4.v2 Lemmatisation

In [25]:
# Le défaut du stemming est qu'il renvoie certains mots qui n'apparaissent pas dans le dictionnaire + non prise en compte du contexe (perte d'informations)
# Le défaut de la lemmatisation est le temps de calcul plus important
nltk.download('wordnet')
nltk.download('omw-1.4')
wn = nltk.WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /Users/yssf/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/yssf/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [26]:
def lemmatise_mail(words):
    result = [wn.lemmatize(word) for word in words]
    return result

df['mail_lemmatized'] = df['mail_clean'].apply(lambda x: lemmatise_mail(x))
df.head()

Unnamed: 0,label,mail,mail_clean,mail_stemmed,mail_lemmatized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]"


Récapitulatif :   
En entrée nous avons eu du texte brut (un seul string avec ponctuations et stop_words).   
En sortie nous aurons une liste de mots clean (sans ponctuations et stop_words) à la quel nous appliquons la stematisation ou la lemmatisation.

In [27]:

import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords


data = pd.read_csv('./Data/SMSSpamCollection.txt', sep='\t', header=None)
data.columns = ['label','content']

en_stop_words = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

def clean_data(texte):
    texte_punct = "".join([ch for ch in texte if ch not in string.punctuation])
    tokens = re.split("\W+", result1)
    texte_clean = [ps.stem(word) for word in tokens if word not in en_stop_words]
    return texte_clean
