# Spam Email Classification

# 1. dataset

* https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [3]:
import pandas as pd

df = pd.read_csv('SMSSpamCollection', header=None, sep='\t')

In [5]:
df.tail()

Unnamed: 0,0,1
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


# 2. check data

In [7]:
# check if null exist in dataset
df.isnull().any()

0    False
1    False
dtype: bool

In [8]:
# Detect missing values.
df.isna().any()

0    False
1    False
dtype: bool

In [17]:
df2 = df.replace('spam',1).replace('ham',0)
print(df2[0].value_counts())
# spam mail count: 747
# non-smpa mail count: 4825

0    4825
1     747
Name: 0, dtype: int64


# 3. preprocess data

In [24]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def preprocessing(text):
    tokens=[word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    stops=stopwords.words('english')
    tokens=[token for token in tokens if token not in stops]

    tokens=[token.lower() for token in tokens if len(token)>=3]
    lmtzr=WordNetLemmatizer()
    tokens=[lmtzr.lemmatize(token) for  token in tokens]
    preprocessed_text=' '.join(tokens)
    return preprocessed_text

test_str = 'hello my world, they are words\nthis is another sentences'
preprocessing(test_str)

'hello world word another sentence'

In [28]:
data_str_list = []
data_label_list = []

for index, row in df2.iterrows():
    label = row[0]
    data = row[1]
    data_str_list.append(preprocessing(data))
    data_label_list.append(label)
    
print(len(data_str_list),len(data_label_list))    

5572 5572


# 4. get tf-idf feature and construct dataset

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

x_train_str_list, x_test_str_list, y_train, y_test = train_test_split(data_str_list, data_label_list, test_size=0.2,random_state=0)


vectorizer=TfidfVectorizer(min_df=2,ngram_range=(1,2),stop_words='english',strip_accents='unicode',norm='l2')

x_train = vectorizer.fit_transform(x_train_str_list) 
x_test = vectorizer.transform(x_test_str_list) 
y_train = y_train
y_test =y_test

# Ref

* https://apriljia.com/2018/10/08/%E5%88%A9%E7%94%A8nltksklearn%E8%BF%9B%E8%A1%8C%E5%9E%83%E5%9C%BE%E9%82%AE%E4%BB%B6%E5%88%86%E7%B1%BB/