In [29]:
# Dataset path, resolved relative to the notebook's working directory; it is read
# further down as a tab-separated file with a label column and a message column.
data_file = 'spam&ham.txt'

In [30]:
import pandas as pd

In [31]:
# The file carries no header row, so the two column names are supplied explicitly.
data = pd.read_csv(
    data_file,
    sep='\t',
    header=None,
    names=['label', 'sms'],
)

In [32]:
# Peek at the first rows to confirm both columns were parsed.
data.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Load Stopwords and punctuation

In [33]:
import string

In [35]:
import nltk

In [36]:
# Fetch the NLTK stopword corpus into the local nltk_data directory.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VP200\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [37]:
# Fetch the Punkt tokenizer models used by nltk.tokenize.word_tokenize.
# NOTE(review): newer NLTK releases reportedly look up 'punkt_tab' instead — confirm
# against the installed version if word_tokenize raises a LookupError.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VP200\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [38]:
# English stopwords as a plain list; membership tests against it are linear scans.
stopwords = nltk.corpus.stopwords.words('english')

In [39]:
# ASCII punctuation characters; pre_process drops any character found in this string.
punctuation = string.punctuation

In [40]:
# Sanity check: show the first five stopwords.
print(stopwords[:5])

['i', 'me', 'my', 'myself', 'we']


In [41]:
# Sanity check: show the characters that will be stripped from messages.
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


# Pre-process SMS content

In [61]:
def pre_process(sms):
    """Turn one message into a list of lowercase tokens without stopwords.

    Steps, in order:
      1. Delete every character that appears in the module-level ``punctuation``
         string and lowercase each remaining character.
      2. Split the cleaned text with ``nltk.tokenize.word_tokenize``.
      3. Drop tokens that appear in the module-level ``stopwords`` list.

    Punctuation is deleted rather than replaced by a space, so pieces that were
    joined only by punctuation fuse together (e.g. "don't" becomes "dont").

    Parameters
    ----------
    sms : str
        Raw message text.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Lowercase character by character while skipping punctuation.
    kept_chars = (ch.lower() for ch in sms if ch not in punctuation)
    cleaned_text = "".join(kept_chars)

    tokens = []
    for token in nltk.tokenize.word_tokenize(cleaned_text):
        if token in stopwords:
            continue
        tokens.append(token)
    return tokens

In [62]:
# Tokenise every message; pre_process is passed directly, no wrapper lambda needed.
data['processed'] = data['sms'].apply(pre_process)

In [63]:
# Confirm the new 'processed' column holds a token list for each message.
data.head()

Unnamed: 0,label,sms,processed
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


# Categorizing and Counting Tokens

In [67]:
def categorize_words():
    """Collect every processed token, grouped by the label of its message.

    Reads the module-level ``data`` frame: rows are selected by comparing the
    'label' column with 'spam' or 'ham', and the token lists stored in the
    'processed' column of those rows are flattened in row order. Repeated
    tokens are kept.

    Returns
    -------
    tuple of (list, list)
        ``(spam_words, ham_words)``.
    """
    def _tokens_for(label):
        # Flatten the per-message token lists of one class into a single list.
        messages = data['processed'][data['label'] == label]
        return [word for sms in messages for word in sms]

    spam_words = _tokens_for('spam')
    ham_words = _tokens_for('ham')
    return spam_words, ham_words

spam_words, ham_words = categorize_words()

# Show the first five tokens of each class, one list per line.
print(spam_words[:5], ham_words[:5], sep='\n')

['free', 'entry', '2', 'wkly', 'comp']
['go', 'jurong', 'point', 'crazy', 'available']


# Predict Function

In [90]:
def predict(user_input):
    """Print a spam / not-spam verdict for a tokenised message.

    Each token is looked up in the module-level ``spam_words`` and ``ham_words``
    lists and its occurrences are summed per class. The class with the larger
    total wins; equal totals (including no matches at all) give the
    "could be spam" message.

    The percentage in the output is the winning class's share of the combined
    occurrence count. The message calls it "accuracy", but it is not an
    accuracy measured against labelled data.

    Parameters
    ----------
    user_input : iterable of str
        Tokens to score, e.g. the list returned by ``pre_process``.

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    # Materialise once so a one-shot iterator can be scanned for both classes.
    tokens = list(user_input)
    spam_counter = sum(spam_words.count(token) for token in tokens)
    ham_counter = sum(ham_words.count(token) for token in tokens)

    print('***********************************RESULTS*******************************************')

    if spam_counter == ham_counter:
        print('Message could be spam, with 50% accuracy')
        return

    if ham_counter > spam_counter:
        template, winning_count = 'message is not spam, with {}% accuracy', ham_counter
    else:
        template, winning_count = 'Message is spam, with {}% accuracy', spam_counter

    # The totals differ here, so the denominator is at least 1.
    accuracy = round((winning_count / (ham_counter + spam_counter)) * 100, 2)
    print(template.format(accuracy))

In [91]:
# Collect user input

# input() blocks until a message is typed, so this cell cannot run unattended.
# The prompt ends with ": " so the typed text does not run into the prompt.
user_input = input('Please type a message to check if our function predicts properly: ')

Please type a message to check if our function predicts properlyCRA has very important information for you! call 1-800-789-2345 now!


In [92]:
# Apply the same cleaning used on the dataset so the tokens are comparable.
processed_input = pre_process(user_input)

In [93]:
# Score the tokenised message; the verdict is printed by predict().
predict(processed_input)

***********************************RESULTS*******************************************
Message is spam, with 59.77% accuracy


In [None]:
|