<a href="https://colab.research.google.com/github/venkatanagaakshita/email-spam-detection/blob/main/Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
import os
import glob
import numpy as np
import email
from sklearn.model_selection import train_test_split
 

In [42]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [43]:
path = '/content/gdrive/My Drive/data/'

In [44]:
easy_ham_paths = glob.glob(path+'easy_ham/*')
spam_paths = glob.glob(path+'spam/*')

In [45]:
ham_sample = np.array([train_test_split(easy_ham_paths, test_size = 0.3)])

  """Entry point for launching an IPython kernel.


In [46]:
ham_train = np.array([])
ham_test = np.array([])
for i in ham_sample:
    ham_train = np.concatenate((ham_train,i[0]),axis=0)
    ham_test = np.concatenate((ham_test,i[1]),axis=0)

In [47]:
spam_sample = np.array([train_test_split(spam_paths, test_size = 0.3)])

  """Entry point for launching an IPython kernel.


In [48]:
spam_train = np.array([])
spam_test = np.array([])
for i in spam_sample:
    spam_train = np.concatenate((spam_train,i[0]),axis=0)
    spam_test = np.concatenate((spam_test,i[1]),axis=0)

In [49]:
ham_train_label = [0]*ham_train.shape[0]
spam_train_label = [1]*spam_train.shape[0]
x_train = np.concatenate((ham_train,spam_train))
y_train = np.concatenate((ham_train_label,spam_train_label))

In [50]:
ham_test_label = [0]*ham_test.shape[0]
spam_test_label = [1]*spam_test.shape[0]
x_test = np.concatenate((ham_test,spam_test))
y_test = np.concatenate((ham_test_label,spam_test_label))

In [51]:
train_shuffle_index = np.random.permutation(np.arange(0,x_train.shape[0]))
test_shuffle_index = np.random.permutation(np.arange(0,x_test.shape[0]))
x_train = x_train[train_shuffle_index]
y_train = y_train[train_shuffle_index]
x_test = x_test[test_shuffle_index]
y_test = y_test[test_shuffle_index]

In [52]:
def rem_null_data(datas,labels):
    not_null_idx = [i for i,o in enumerate(datas) if o is not None]
    return np.array(datas)[not_null_idx],np.array(labels)[not_null_idx]

In [53]:
x_train,y_train = rem_null_data(x_train,y_train)
x_test,y_test = rem_null_data(x_test,y_test)

In [54]:
import re
import string
from nltk.tokenize import word_tokenize

In [55]:
def rem_newline(word):
    return word.replace('\n','')

In [56]:
def rem_num_in_data(word):
    result = re.sub(r'\d+', '', word)
    return result

In [57]:
def convert_lower(word):
    result = word.lower()
    return result

In [58]:
def rem_punc_in_data(word):
    result = word.translate(str.maketrans(dict.fromkeys(string.punctuation)))
    return result

In [62]:
def preprocessing(sentence):
    cleaning_utils = [
                      rem_newline,
                      convert_lower,
                      rem_num_in_data,
                      rem_punc_in_data]
    for i in cleaning_utils:
        sentence = i(sentence)
    return sentence

In [63]:
x_train = [preprocessing(i) for i in x_train]
x_test = [preprocessing(i) for i in x_test]

In [64]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer


In [65]:
stemmer = PorterStemmer()

In [66]:
import nltk
nltk.download('punkt')
x_train = [word_tokenize(i) for i in x_train]
x_test = [word_tokenize(i) for i in x_test]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [67]:
def rem_stopwords(words):
    result = [i for i in words if i not in ENGLISH_STOP_WORDS]
    return result

In [68]:
def data_stem(words):
    return [stemmer.stem(i) for i in words]

In [69]:
def clean_token(words):
    cleaning_utils = [rem_stopwords,data_stem]
    for i in cleaning_utils:
        words = i(words)
    return words

In [70]:
x_train = [clean_token(i) for i in x_train]
x_test = [clean_token(i) for i in x_test]

In [71]:
x_train = [" ".join(i) for i in x_train]
x_test = [" ".join(i) for i in x_test]

In [72]:
x_train = [i.split(" ") for i in x_train]
x_test = [i.split(" ") for i in x_test]

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [74]:
vectorizer = TfidfVectorizer()
raw_sentences = [' '.join(i) for i in x_train]
vectorizer.fit(raw_sentences)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [75]:
def convert_to_feature(raw_tokenize_data):
    raw_sentences = [' '.join(i) for i in raw_tokenize_data]
    return vectorizer.transform(raw_sentences)

In [76]:
x_train_features = convert_to_feature(x_train)
x_test_features = convert_to_feature(x_test)

In [77]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

In [78]:
dtModel = DecisionTreeClassifier(class_weight=None,
 max_features = 50, min_impurity_split=1e-07, min_samples_leaf=1,
 min_samples_split=2, max_depth = 100, min_weight_fraction_leaf=0.0, random_state=18, splitter='best')

# Train Decision Tree Classifer
dtModel = dtModel.fit(x_train_features.toarray(),y_train)

#Predict the response for test dataset
y_pred = dtModel.predict(x_test_features.toarray())



In [79]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8375136314067612
