# 1. Chargement du jeu de données (dataset)

In [11]:
import pandas as pd

In [19]:
# Read the tab-separated SMS file. It has no header row (header=None), so the
# two columns are named explicitly: "label" and "sms".
data = pd.read_csv("SMSSpamCollection.txt", sep="\t", header=None, names=["label", "sms"])
# Preview the first 10 rows.
data.head(10)

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


# 2. Pré-traitement (Pre-processing)

In [20]:
# installing the nltk module
import sys
!{sys.executable} -m pip install nltk



In [21]:
# Load the stop words and the punctuation characters
import string
import nltk

# Download the NLTK 'stopwords' and 'punkt' packages
# ('punkt' is presumably required by word_tokenize, used later - confirm).
nltk.download('stopwords')
nltk.download('punkt')

# Module-level names read by the pre-processing function defined later.
stopwords = nltk.corpus.stopwords.words('english')
punctuation = string.punctuation

# Sanity check: first five stop words and the full punctuation string.
print(stopwords[:5])
print(punctuation)

['i', 'me', 'my', 'myself', 'we']
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abdoulaziz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/abdoulaziz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
# Pre-process the content of an SMS
def pre_process(sms):
    """Turn a raw SMS string into a list of cleaned tokens.

    Steps, in order:
      1. drop every character found in ``punctuation`` and lower-case the
         remaining characters one by one;
      2. tokenize the cleaned text with ``nltk.tokenize.word_tokenize``;
      3. discard tokens that appear in ``stopwords``.

    Args:
        sms: The message text.

    Returns:
        The remaining tokens as a list, in their original order.
    """
    kept_chars = []
    for ch in sms:
        if ch in punctuation:
            continue
        kept_chars.append(ch.lower())
    cleaned_text = "".join(kept_chars)

    tokens = []
    for token in nltk.tokenize.word_tokenize(cleaned_text):
        if token not in stopwords:
            tokens.append(token)
    return tokens

# Tokenize every SMS and keep the result in a new 'processed' column.
data['processed'] = data['sms'].apply(pre_process)
data.head()

Unnamed: 0,label,sms,processed
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


# 3. Catégoriser et compter les jetons

In [39]:
# Categorize the words associated with spam and with ham messages
def categorize_words(dataset=None):
    """Collect every token of the spam messages and of the ham messages.

    Args:
        dataset: Optional DataFrame with a 'label' column (values 'spam' /
            'ham') and a 'processed' column holding one list of tokens per
            message. When omitted, the notebook-level ``data`` frame is
            used, so the original zero-argument call keeps working.

    Returns:
        A ``(spam_words, ham_words)`` tuple of flat token lists. Tokens are
        kept in message order and duplicates are preserved, so each list can
        be used to count how often a word occurs in that class.
    """
    frame = data if dataset is None else dataset

    def tokens_for(label):
        # Flatten the per-message token lists of one class into one list.
        messages = frame.loc[frame['label'] == label, 'processed']
        return [word for sms in messages for word in sms]

    # Spam-associated words first, ham-associated words second.
    return tokens_for('spam'), tokens_for('ham')

# Build the two word lists once; the prediction function reads these globals.
spam_words, ham_words = categorize_words()
# Peek at the first five tokens of each list.
print(spam_words[:5])
print(ham_words[:5])

['free', 'entry', '2', 'wkly', 'comp']
['go', 'jurong', 'point', 'crazy', 'available']


# 4. Fonction de prédiction

In [52]:
# Iterate over every word of the user's input and count its occurrences in ham_words and spam_words
def predict(user_input, spam_vocab=None, ham_vocab=None):
    """Print whether a tokenized message looks like spam or ham.

    Each token of ``user_input`` is looked up in the spam and ham word
    lists; the class with the larger total number of occurrences wins.

    Args:
        user_input: Iterable of tokens. Pass a token list rather than a raw
            string: iterating a string would count single characters.
        spam_vocab: Optional list of spam tokens (duplicates kept). Defaults
            to the notebook-level ``spam_words``.
        ham_vocab: Optional list of ham tokens (duplicates kept). Defaults
            to the notebook-level ``ham_words``.

    Returns:
        None. The verdict is written to standard output.

    Note:
        The percentage printed as "accuracy" is the winning class's share
        of the matched occurrences, not a measured classification accuracy.
        On a tie (including no matches at all) a fixed 50% is reported.
    """
    from collections import Counter

    # Tally each word list once instead of rescanning the whole list with
    # list.count() for every input token.
    spam_freq = Counter(spam_words if spam_vocab is None else spam_vocab)
    ham_freq = Counter(ham_words if ham_vocab is None else ham_vocab)

    spam_counter, ham_counter = 0, 0
    for word in user_input:
        # Counter returns 0 for unseen words, matching list.count().
        spam_counter += spam_freq[word]
        ham_counter += ham_freq[word]

    print("**************** RESULTS ****************")
    if ham_counter > spam_counter:
        # Share of the matched occurrences that come from ham messages.
        accuracy = round(ham_counter / (ham_counter + spam_counter) * 100, 2)
        print("message is not spam with {}% accuracy".format(accuracy))
    elif spam_counter > ham_counter:
        # Share of the matched occurrences that come from spam messages.
        accuracy = round(spam_counter / (ham_counter + spam_counter) * 100, 2)
        print("message is spam with {}% accuracy".format(accuracy))
    else:
        print("message could be spam with 50% accuracy")

# 5. Collecte des entrées utilisateur

In [47]:
# Collect the message to classify from the user
user_input = input("Please type a spam or ham message to check if our function predicts properly\n")

Please type a spam or ham message to check if our function predicts properly
 CRA has very important information for you! call 1-800-789-2345 now


In [53]:
# Tokenize the typed message with the same pre_process step applied to the
# dataset, then print the prediction for it.
processed_input = pre_process(user_input)
predict(processed_input)

**************** RESULTS ****************
message is spam with 59.58% accuracy
