In [228]:
import pandas as pd
import numpy as np

In [229]:
import gensim
import nltk
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# Seed NumPy's legacy global random generator with a fixed value.
np.random.seed(891)
from nltk.corpus import stopwords
# English stop-word list from NLTK, materialised once as a set; `preprocess`
# tests every token for membership in it.
# NOTE(review): this presumably needs the NLTK 'stopwords' corpus to be present
# locally (e.g. via a prior nltk.download) - confirm for fresh environments.
stop = set(stopwords.words('english'))

In [230]:
# Load the SMS dataset from a CSV file resolved relative to the current working
# directory. The rendered output shows a `Category` label column (ham/spam) and
# a `Message` text column.
df = pd.read_csv('SPAM text message.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [231]:
# One-hot encode the `Category` label. With `drop_first=True` the first level
# ("ham") is dropped, leaving a single indicator column `Category_spam`
# (1 = spam, 0 = ham).
# - The membership guard keeps this cell idempotent: `df` is reassigned here, so
#   without it a second run would fail because `Category` no longer exists.
# - `dtype=int` pins the indicator to 0/1 integers; the default dummy dtype
#   differs between pandas versions (bool in pandas >= 2.0).
if 'Category' in df.columns:
    df = pd.get_dummies(data=df, columns=['Category'], drop_first=True, dtype=int)
# Bare last expression so the notebook renders the encoded frame.
df

Unnamed: 0,Message,Category_spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will ü b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


### Why will these preprocessing techniques be applicable?
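### Because raw text is rarely in a form a model can use directly, several preprocessing steps are useful here. The SMS messages in this dataset contain inconsistent casing, punctuation, abbreviations and very common filler words; tokenizing, removing stop words and very short tokens, and lemmatizing/stemming clean up these inconsistencies and collapse different forms of the same word into a single term.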

In [232]:
# Shared NLP helpers, built once and reused for every token.
stemmer = SnowballStemmer('english')
lemma = WordNetLemmatizer()


def lemmatize_stemming(term):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with the WordNet
    lemmatizer; the resulting lemma is then stemmed with the English Snowball
    stemmer.

    Parameters
    ----------
    term : str
        A single token.

    Returns
    -------
    str
        The Snowball stem of the verb lemma of ``term``.
    """
    verb_lemma = lemma.lemmatize(term, pos='v')
    return stemmer.stem(verb_lemma)

In [233]:
def preprocess(text):
    """Turn one raw message into a list of normalised tokens.

    The text is tokenised with ``gensim.utils.simple_preprocess``. Tokens that
    appear in the module-level ``stop`` set, or that are three characters or
    shorter, are discarded; every remaining token is passed through
    ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list
        Normalised tokens in their original order (possibly empty).
    """
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if tok not in stop and len(tok) > 3
    ]

In [234]:
# Run `preprocess` on every message; the result is a Series whose elements are
# lists of normalised tokens, one list per message (see the rendered output).
processed_docs = df['Message'].map(preprocess)
# Bare last expression so the notebook displays the Series.
processed_docs

0       [jurong, point, crazi, avail, bugi, great, wor...
1                                                  [joke]
2       [free, entri, wkli, comp, final, tkts, text, r...
3                                        [earli, alreadi]
4                       [think, go, live, around, though]
                              ...                        
5567    [time, tri, contact, pound, prize, claim, easi...
5568                                 [go, esplanad, home]
5569                                [piti, mood, suggest]
5570    [bitch, act, like, interest, buy, someth, els,...
5571                                   [rofl, true, name]
Name: Message, Length: 5572, dtype: object

In [235]:
# Vocabulary: maps every distinct token in the processed messages to an integer id.
dictionary = gensim.corpora.Dictionary(processed_docs)
# Prune the vocabulary in place: drop tokens found in fewer than 15 messages or
# in more than 50% of them, then keep at most the 100000 most frequent survivors.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# Bag-of-words encoding: one list of (token_id, count) pairs per message.
BOW_corpus = list(map(dictionary.doc2bow, processed_docs))


In [236]:
# Sparse term-document matrix (rows = terms, columns = documents) built from the
# bag-of-words corpus. This uses the names actually defined in this notebook
# (`BOW_corpus`, `DTM`), so the cell runs on a fresh kernel.
# `num_terms` is pinned to the vocabulary size so the matrix always has exactly
# one row per dictionary entry instead of being inferred from the ids seen.
TDM = gensim.matutils.corpus2csc(BOW_corpus, num_terms=len(dictionary))
# Transpose to the document-term layout: one row per message, one column per term.
DTM = TDM.transpose()
print(DTM.shape)

(5572, 445)


In [237]:
# Dense integer count table: one row per message, one column per vocabulary term.
# Column j of DTM holds the counts for term id j, so the labels are looked up by
# explicit id (0 .. len(dictionary) - 1) instead of relying on the iteration
# order of `dictionary.values()` happening to match the id order.
df2 = pd.DataFrame(
    DTM.toarray().astype('int'),
    columns=[dictionary[term_id] for term_id in range(len(dictionary))],
)
# Bare last expression so the notebook renders the table.
df2

Unnamed: 0,avail,great,point,world,joke,appli,entri,final,free,question,...,understand,await,drop,plus,wrong,least,parti,surpris,alon,how
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,2,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### Is it sparse or dense?
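### A: Sparse. Almost every entry is 0, because each message contains only a handful of the 445 terms in the vocabulary.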
### Find two non-zero entries and briefly interpret their meaning, in words
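### A: The entries in row 0 under the columns "avail" and "great" are both 1: each of these stemmed terms occurs once in document 0.
### The entry in row 2 under the column "entri" is 2: the stemmed term "entri" occurs twice in document 2.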

In [238]:
from sklearn.feature_extraction.text import TfidfTransformer

In [239]:
# Re-weight the raw term counts with TF-IDF.
# NOTE(review): the transformer is fitted on every message, before the
# train/test split further down, so the IDF statistics also see the held-out
# messages - consider fitting on the training rows only.
transformer = TfidfTransformer()
matrix = transformer.fit(df2).transform(df2)
# Back to a labelled dense frame; the columns are the same terms, in the same
# order, as the count table.
tfidf = pd.DataFrame(matrix.toarray(), columns=df2.columns)
# Bare last expression so the notebook renders the weighted table.
tfidf

Unnamed: 0,avail,great,point,world,joke,appli,entri,final,free,question,...,understand,await,drop,plus,wrong,least,parti,surpris,alon,how
0,0.560765,0.413378,0.511924,0.502588,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.0,0.309851,0.658219,0.311331,0.212870,0.305654,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.276912,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [240]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 40% of the messages for evaluation; the fixed `random_state` makes
# the split repeatable.
# NOTE(review): the split is not stratified on the label - worth confirming that
# the spam rate is similar in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    tfidf,
    df['Category_spam'],
    test_size=0.4,
    random_state=42,
)
# Logistic regression with library defaults; `fit` returns the estimator itself.
LR = LogisticRegression().fit(x_train, y_train)
# Hard class predictions for the held-out messages.
y_pred = LR.predict(x_test)

In [241]:
# Score of the fitted classifier on the held-out split (mean accuracy for
# scikit-learn classifiers); stored as a fraction and scaled by 100 when it is
# reported below.
acc = LR.score(x_test,y_test)

In [242]:
from sklearn.metrics import precision_recall_fscore_support
# Precision, recall and F1 for the positive class (average='binary');
# `support` is unpacked only to match the 4-tuple that is returned.
precision, recall, f1, support = precision_recall_fscore_support(
    y_test, y_pred, average='binary'
)
# One line per metric, each shown as a percentage rounded to two decimals.
print(f"accuracy = {round(acc*100,2)}%")
print(f"precision = {round(precision*100,2)}%")
print(f"recall = {round(recall*100,2)}%")
print(f"f1score = {round(f1*100,2)}%")

accuracy = 95.92%
precision = 95.05%
recall = 72.51%
f1score = 82.26%
