In [None]:
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive so the dataset CSV stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the spam/ham email dataset from the mounted Drive into a DataFrame.
df_email = pd.read_csv('/content/drive/MyDrive/spam_ham_dataset.csv')

In [None]:
# Preview the first rows of the raw data (columns: label, text, label_num).
df_email.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [None]:
# Drop rows containing NaN values, then renumber the rows.
# reset_index() keeps the old index as a new 'index' column.
df_email = df_email.dropna().reset_index()

In [None]:
import re
import string
def cleaning(txt):
    """Normalize one raw email string into lowercase, space-separated words.

    Steps, in order: lowercase; collapse whitespace; replace non-ASCII
    characters; drop @mentions, #hashtags and URLs; strip punctuation;
    drop digits; drop stray single letters; collapse whitespace and trim.

    Args:
        txt: Raw text of a single email (str).

    Returns:
        The cleaned text (str) without leading/trailing whitespace. May be
        an empty string when nothing survives the filters.
    """
    # case folding
    text = txt.lower()
    # collapse runs of spaces, tabs and newlines into one space
    text = re.sub(r'\s+', ' ', text)
    # replace non-ASCII characters with '?' (stripped below as punctuation)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove @mentions, #hashtags and whole URLs. This has to run while the
    # "scheme://" prefix is still intact: stripping "http://" first means the
    # URL pattern can never match and the host/path leaks into the output.
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+://\S+)", " ", text).split())
    # drop a bare scheme that had nothing attached to it
    text = text.replace("http://", " ").replace("https://", " ")
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove numbers first, so a letter orphaned by digit removal
    # (e.g. "b2" -> "b") is caught by the single-character filter below
    text = re.sub(r"\d+", "", text)
    # remove single characters
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # collapse the gaps left by the removals and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
import nltk
# Download the NLTK stopword corpus that the stemming step filters against.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#Stemming
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
# Build the stopword lookup once. Evaluating stopwords.words('english') inside
# the comprehension repeats the call for every token and tests membership
# against a list (linear scan); a frozenset gives constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stem(email):
    """Tokenize a cleaned email, drop English stopwords and stem the rest.

    Args:
        email: Cleaned email text (str); tokens are split on whitespace.

    Returns:
        List of Porter-stemmed tokens in their original order, with
        stopwords removed. Empty list for empty/whitespace-only input.
    """
    return [ps.stem(word) for word in email.split()
            if word not in _ENGLISH_STOPWORDS]

In [None]:
# Data preprocessing: normalize every email body with cleaning()
df_email['text'] = df_email['text'].apply(cleaning)

In [None]:
# Turn each cleaned email into a list of stemmed, stopword-free tokens
df_email['text'] = df_email['text'].apply(stem)

In [None]:
# Preview the data after preprocessing; 'text' now holds lists of stemmed tokens.
df_email.head()

Unnamed: 0.1,index,Unnamed: 0,label,text,label_num
0,0,605,ham,"[subject, enron, methanol, meter, follow, note...",0
1,1,2349,ham,"[subject, hpl, nom, januari, see, attach, file...",0
2,2,3624,ham,"[subject, neon, retreat, ho, ho, ho, around, w...",0
3,3,4685,spam,"[subject, photoshop, window, offic, cheap, mai...",1
4,4,2030,ham,"[subject, indian, spring, deal, book, teco, pv...",0


In [None]:
## Independent features: everything except the labels and bookkeeping columns
X = df_email.drop(columns=['index', 'label_num', 'label', 'Unnamed: 0'])

In [None]:
## Dependent feature: the numeric label column (0 = ham, 1 = spam)
y = df_email.loc[:, 'label_num']

In [None]:
# Check the dimensions of the feature frame (rows, columns).
X.shape

(5171, 1)

In [None]:
# Display the feature frame (a single 'text' column of token lists).
X

Unnamed: 0,text
0,"[subject, enron, methanol, meter, follow, note..."
1,"[subject, hpl, nom, januari, see, attach, file..."
2,"[subject, neon, retreat, ho, ho, ho, around, w..."
3,"[subject, photoshop, window, offic, cheap, mai..."
4,"[subject, indian, spring, deal, book, teco, pv..."
...,...
5166,"[subject, put, ft, transport, volum, decreas, ..."
5167,"[subject, follow, nom, hpl, take, extra, mmcf,..."
5168,"[subject, calpin, daili, ga, nomin, juli, ment..."
5169,"[subject, industri, worksheet, august, activ, ..."


In [None]:
# Check that the number of labels matches the number of feature rows.
y.shape

(5171,)

In [None]:
# Display the label series.
y

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the emails for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X['text'], y, test_size=0.33, random_state=42
)

In [None]:
import gensim

In [None]:
# Train the word2vec model on the tokenized training emails only
# feature size = 100, context window = 5, tokens seen fewer than 2 times are ignored
# seed + workers=1: training with several worker threads is not repeatable
# from run to run, so pin both to keep the learned vectors (and every
# downstream metric) reproducible.
# NOTE(review): on Python 3 full determinism may also need PYTHONHASHSEED set - confirm.
w2v_model = gensim.models.Word2Vec(X_train, size=100, window=5, min_count=2,
                                   seed=42, workers=1)

In [None]:
# List the vocabulary learned by the model.
# NOTE(review): `index2word` is the gensim 3.x name (gensim 4 uses `index_to_key`) - confirm installed version.
w2v_model.wv.index2word

['ect',
 'subject',
 'hou',
 'enron',
 'deal',
 'pleas',
 'com',
 'ga',
 'meter',
 'hpl',
 'cc',
 'thank',
 'pm',
 'need',
 'daren',
 'price',
 'forward',
 'corp',
 'volum',
 'day',
 'know',
 'get',
 'compani',
 'chang',
 'product',
 'new',
 'may',
 'inform',
 'mmbtu',
 'see',
 'nom',
 'time',
 'http',
 'let',
 'attach',
 'contract',
 'farmer',
 'call',
 'would',
 'month',
 'nomin',
 'us',
 'one',
 'mail',
 'xl',
 'messag',
 'sale',
 'use',
 'question',
 'flow',
 'follow',
 'email',
 'th',
 'juli',
 'sitara',
 'texa',
 'manag',
 'look',
 'robert',
 'work',
 'www',
 'ticket',
 'want',
 'servic',
 'report',
 'list',
 'energi',
 'number',
 'go',
 'file',
 'also',
 'sent',
 'actual',
 'contact',
 'bob',
 'make',
 'ena',
 'order',
 'like',
 'receiv',
 'schedul',
 'market',
 'effect',
 'system',
 'purchas',
 'origin',
 'busi',
 'secur',
 'take',
 'avail',
 'back',
 'daili',
 'statement',
 'help',
 'provid',
 'font',
 'per',
 'could',
 'march',
 'account',
 'includ',
 'pec',
 'request',
 'fre

In [None]:
# Find the words most similar to 'prize' according to our trained model
w2v_model.wv.most_similar('prize')

[('throughout', 0.9951569437980652),
 ('consider', 0.9938293099403381),
 ('advantag', 0.9937296509742737),
 ('earn', 0.9934826493263245),
 ('warehous', 0.9928135275840759),
 ('shift', 0.9920439720153809),
 ('safeti', 0.9909731149673462),
 ('foreign', 0.9908500909805298),
 ('realiz', 0.9908014535903931),
 ('biggest', 0.990614116191864)]

In [None]:
import numpy as np

In [None]:
# Look up the learned embedding vector for the token 'avoid'.
w2v_model.wv['avoid']

array([ 0.10501271,  0.12230531, -0.10374425, -0.12222383,  0.3759728 ,
       -0.09446032, -0.7094072 , -0.32537597,  0.3653539 , -0.65279573,
       -0.31621996,  0.19112352, -1.0885036 ,  0.03290749,  0.39555222,
       -0.10785022, -0.30581284, -0.3086248 ,  0.39919022,  0.24294563,
        0.45278248,  0.35500586,  0.19979985, -0.17681244, -0.20586455,
        0.28814113,  0.06813863,  0.35542285,  0.21221395,  0.12143993,
       -0.48798013,  0.03277368, -0.50559115, -0.07769861, -0.22726811,
        0.10593437, -0.42214286, -0.12749334, -0.45165217,  0.19127622,
        0.3830705 , -0.513182  , -0.30308124,  0.12387617,  0.15186532,
        0.10401089,  0.66376346,  0.00437571, -0.3261038 , -0.10864793,
        0.31203914,  0.4693572 , -0.48602095, -0.15974747,  0.11026616,
        0.43967316, -0.23003256,  0.18453014,  0.06043662,  0.26322594,
        0.41022217, -0.11842853, -0.1433158 , -0.14381866,  0.18687566,
       -0.01407259,  0.28379083, -0.52541095, -0.19491336,  0.45

In [None]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index2word)


def _tokens_to_vectors(docs):
    """Replace every in-vocabulary token of each document by its word vector.

    Args:
        docs: Iterable of token lists with a length (e.g. X_train / X_test).

    Returns:
        1-D object ndarray with one entry per document. Each entry is an
        array of shape (n_known_tokens, vector_size), or an empty array when
        none of the document's tokens are in the vocabulary.
    """
    # Documents have different token counts, so the result is ragged. Passing
    # such a list straight to np.array() is deprecated and raises ValueError
    # on NumPy >= 1.24; fill an explicit object array instead.
    vectors = np.empty(len(docs), dtype=object)
    for pos, tokens in enumerate(docs):
        vectors[pos] = np.array([w2v_model.wv[tok] for tok in tokens if tok in words])
    return vectors


X_train_vect = _tokens_to_vectors(X_train)
X_test_vect = _tokens_to_vectors(X_test)

  """
  import sys


In [None]:
# Inspect the stacked word vectors of the first training email.
X_train_vect[0]

array([[-0.32109797,  1.2493627 ,  0.13421297, ..., -0.54885006,
        -0.09748083, -0.17088957],
       [ 0.00716424,  0.05376712,  0.01793709, ..., -0.08651087,
         0.03384385, -0.01534526],
       [ 1.385162  , -0.06763467,  0.4508465 , ...,  0.17989781,
        -0.08594676,  0.00666128],
       ...,
       [ 0.9868093 , -0.1663161 ,  1.4514278 , ..., -0.32475522,
         0.03202817, -1.1119998 ],
       [ 0.14848748,  0.22947003,  1.3504449 , ..., -0.42533162,
        -0.10086901, -0.9717226 ],
       [-0.8943951 ,  1.0501251 ,  0.8053407 , ..., -0.981951  ,
         0.31124598, -0.51178205]], dtype=float32)

In [None]:
# (in-vocabulary tokens, embedding size) for the first training email.
X_train_vect[0].shape

(35, 100)

In [None]:
# (in-vocabulary tokens, embedding size) for the first test email.
X_test_vect[0].shape

(11, 100)

In [None]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training)
def _mean_vectors(doc_vectors, dim):
    """Collapse each document's word vectors into one fixed-length vector.

    Args:
        doc_vectors: Iterable of arrays, one per document, each holding the
            word vectors of that document (possibly empty).
        dim: Length of the zero vector used for documents with no vectors.

    Returns:
        List with one 1-D array per document: the column-wise mean of its
        word vectors, or zeros(dim) when the document has none.
    """
    return [v.mean(axis=0) if v.size else np.zeros(dim, dtype=float)
            for v in doc_vectors]


# Take the width from the trained model instead of repeating the literal 100,
# so the fallback vector cannot drift out of sync with the Word2Vec settings.
_VECTOR_DIM = w2v_model.wv.vector_size

X_train_vect_avg = _mean_vectors(X_train_vect, _VECTOR_DIM)
X_test_vect_avg = _mean_vectors(X_test_vect, _VECTOR_DIM)

In [None]:
# Each averaged email vector should be a single fixed-length embedding.
X_train_vect_avg[0].shape

(100,)

Fit a RandomForestClassifier on top of the averaged word vectors

In [None]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state (same seed as the train/test split) makes the fitted
# forest, and therefore the reported metrics, reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [None]:
# Use the trained model to predict labels for the averaged test vectors
y_pred = rf_model.predict(X_test_vect_avg)

In [None]:
# Score the predictions against the held-out test labels
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Accuracy = share of test emails whose predicted label equals the true label
accuracy = (y_pred == y_test).sum() / len(y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 0.917 / Recall: 0.907 / Accuracy: 0.953
