In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords

/kaggle/input/sms-spam-collection-dataset/spam.csv


#### Load the csv file, in this case we needed to add some encoding to it so that the unicode characters wouldn't break the 


In [2]:
df = pd.read_csv("/kaggle/input/sms-spam-collection-dataset/spam.csv", encoding='latin-1')

In [3]:
##  see what we're dealing with by getting the top 5 rows and columns

df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


We see that there are more than the columns we really need so let's subset them to the first two columns and all rows.  The format is iloc[ROW RANGE,COLUMN RANGE]

In [4]:
df_sms = df.iloc[:,0:2]

In [5]:
df_sms.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Let's relabel the columns

In [6]:
df_sms.columns = ['label', 'message']

In [7]:
df_sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Let's find out how many items labeled "spam" there are and use describe to get some more info like which sms message is the top spam message

In [8]:
df_sms.groupby('label').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


Let's find out the length of these messages and place the value of length into a new column

In [9]:
df_sms['length'] = df_sms.message.apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Let's feature engineer the "spam" "ham" label and make it binary but converting them to 1 , 0 into a new column.

In [10]:
df_sms['label_num'] = df_sms.label.map({'ham':0, 'spam':1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [11]:
df_sms.head()

Unnamed: 0,label,message,length,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",111,0
1,ham,Ok lar... Joking wif u oni...,29,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,1
3,ham,U dun say so early hor... U c already then say...,49,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,0


Let's set up a text processor function that uses 'stopwords' module and will remove puncuation and any extra stopwords we define

In [12]:
def sms_text_process(mess):
    STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return ' '.join([word for word in nopunc.split() if word.lower() not in STOPWORDS])

We can view the result by applying it just to the first few rows

In [13]:
df_sms['message'].head(5).apply(sms_text_process)

0    Go jurong point crazy Available bugis n great ...
1                                Ok lar Joking wif oni
2    Free entry wkly comp win FA Cup final tkts 21s...
3                      dun say early hor c already say
4               Nah think goes usf lives around though
Name: message, dtype: object

Let's apply the processor to all the messages and create a new column with the new clean output

In [14]:
df_sms['message_clean'] = df_sms['message'].apply(sms_text_process)

In [15]:
df_sms.head()

Unnamed: 0,label,message,length,label_num,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",111,0,Go jurong point crazy Available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,29,0,Ok lar Joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,1,Free entry wkly comp win FA Cup final tkts 21s...
3,ham,U dun say so early hor... U c already then say...,49,0,dun say early hor c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,0,Nah think goes usf lives around though


In order to do predictive modeling, we must remove the context of the messages and agree that the context simply doesn't matter.

We don't necessarily care what the message is about, rather we need to determine a way to analyze the words and phrases in it.

We need to break down the sms messages into some form of numerical representation that the computer can ingest and analyze.

In NLP this is called tokenization whereby we count the number of times a word appears, no matter where it appears in a message.  Each new word gets a new token.  If a message has the same token structure, it's very likely it is a repeat, and therefore we can begin pattern recognition on that structure and determine if it is spam.

CountVectorizer helps us break down text content into a tokenized structure.

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

Bag of Words is a name for the process (not the name of a module) that takes the content of each message and breaks down the words into numerical counts.  You lose context of the message and essentially now you have a bag of words and not a real sentence. 

We will apply this process to the clean messages.

In [17]:
bag_of_words = CountVectorizer(analyzer=sms_text_process).fit(df_sms['message_clean'])

In [18]:
print(bag_of_words.get_feature_names())

[' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\x89', '\x8b', '\x8e', '£', '©', 'ª', '¬', '´', '¼', 'Á', 'Â', 'È', 'Ì', 'Ï', 'Ð', 'Ò', 'Ó', 'Ô', 'Õ', 'Û', 'ä', 'å', 'ö', '÷']


In [19]:
print(len(bag_of_words.vocabulary_))

87


In [20]:
print(bag_of_words.vocabulary_)

{'G': 17, 'o': 51, ' ': 0, 'j': 46, 'u': 57, 'r': 54, 'n': 50, 'g': 43, 'p': 52, 'i': 45, 't': 56, 'c': 39, 'a': 37, 'z': 62, 'y': 61, 'A': 11, 'v': 58, 'l': 48, 'b': 38, 'e': 41, 's': 55, 'w': 59, 'd': 40, 'f': 42, 'C': 13, 'm': 49, 'O': 25, 'k': 47, 'J': 20, 'F': 16, '2': 3, '1': 2, 'M': 23, '0': 1, '5': 6, 'T': 30, 'x': 60, '8': 9, '7': 8, 'q': 53, '4': 5, 'h': 44, 'N': 24, 'H': 18, '3': 4, 'I': 19, 'X': 34, 'å': 84, '£': 66, 'E': 15, 'V': 32, 'P': 26, '9': 10, 'W': 33, 'R': 28, '6': 7, 'K': 21, 'L': 22, 'U': 31, 'S': 29, 'D': 14, 'B': 12, 'Y': 35, 'Q': 27, 'Õ': 81, 'Ì': 75, '¼': 71, '\x89': 63, 'Û': 82, '÷': 86, 'Ï': 76, 'Z': 36, 'Ò': 78, '¬': 69, 'Ó': 79, 'Ô': 80, 'ª': 68, 'Ð': 77, 'È': 74, '©': 67, 'ä': 83, 'Â': 73, '\x8e': 65, 'ö': 85, '´': 70, '\x8b': 64, 'Á': 72}


message_bagofwords = bag_of_words.transform(df_sms['message_clean'])

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_trans = TfidfTransformer().fit(message_bagofwords)

NameError: name 'message_bagofwords' is not defined

In [22]:
message_tfidf = tfidf_trans.transform(message_bagofwords)
print(message_tfidf.shape)

NameError: name 'tfidf_trans' is not defined

In [23]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(message_tfidf,df_sms['label'])

NameError: name 'message_tfidf' is not defined

In [24]:
message = df_sms['message_clean'][4]
print(message)

Nah think goes usf lives around though


In [25]:
bag_of_words_for_message = bag_of_words.transform([message])

In [26]:
tfidf = tfidf_trans.transform(bag_of_words_for_message)

NameError: name 'tfidf_trans' is not defined

In [27]:
row = 688
print('predicted', spam_detect_model.predict(tfidf)[0])
print('actual', df_sms.label[row])

NameError: name 'spam_detect_model' is not defined

In [28]:
df_sms[row:row+1]

Unnamed: 0,label,message,length,label_num,message_clean
688,ham,Thanks love. But am i doing torch or bold.,42,0,Thanks love torch bold
