# Text Classification

In [1]:
import pandas as pd
# Path of the labelled email corpus, relative to the notebook's working directory.
DATA_PATH = 'email_text.csv'

# Read the whole CSV into memory; the bare `df` on the last line renders the frame.
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,label,text
0,0,me spam is the tool for dissident news since t...
1,0,escapenumber escapenumber escapenumber penguin...
2,0,on wednesday escapenumber july escapenumber es...
3,0,can't think of how i'd be running afoul of th...
4,0,forwarded by william knowles http www thesun c...
...,...,...
5635,1,you are receiving this email because you have ...
5636,1,royal meds com your online pharmacy · www roya...
5637,1,your attention please please reply only to esc...
5638,1,tremendous savings on toners inkjets fax and t...


# Pre-Processing 

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [7]:
# NLTK resources shared by every preprocess() call. They are built lazily on
# first use so that merely defining the function does not touch the corpora.
_STOP_WORDS = None
_LEMMATIZER = None


def preprocess(text):
    """Normalize one raw message string for vectorization.

    Steps, in order: lowercase the text, replace every character that is not
    an ASCII letter or digit with a space, tokenize with ``word_tokenize``,
    drop English stopwords, lemmatize each remaining token with
    ``WordNetLemmatizer`` and join the result with single spaces.

    Args:
        text: Raw message text as a ``str``.

    Returns:
        The cleaned tokens joined into one space-separated ``str``.
    """
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        # Load the stopword list once and keep it as a set: the original
        # re-read the list and scanned it linearly for every single token.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        # One lemmatizer instance is enough; no need to build one per token.
        _LEMMATIZER = WordNetLemmatizer()

    txt = text.lower()
    # Punctuation becomes whitespace, so "can't" is split into "can" and "t".
    punc = re.sub(r"[^a-zA-Z0-9]", " ", txt)
    words = word_tokenize(punc)
    lema = [_LEMMATIZER.lemmatize(w) for w in words if w not in _STOP_WORDS]
    return " ".join(lema)
# Clean every message element-wise and store the result next to the raw text.
df['Clean Text'] = df['text'].map(preprocess)

# EDA: raw vs. cleaned text

In [39]:
# Raw text of the first message (row label 0), for comparison with its cleaned form.
df.loc[0, 'text']

"me spam is the tool for dissident news since the fact that it's unsolicited means that recipients can't be blamed for being on a mailing list russell turpin that depends on how the list is collected or even on what the senders say about how the list is collected better to just put it on a website and that way it can be surfed anonymously and it doesn't clutter my inbox it doesn't work that way a website is opt in spam is no opt if you visit a samizdat site you can get in trouble if you get samizdat spam the worst that can be said is that you might have read it and as long as the mailers send to individuals who clearly didn't opt in like party officials then other recipients can't get in trouble for requesting the mail plus it's much harder to block spam than web sites but this shouldn't come as a surprize spam is speech it may be sleazy but so what lucas http xent com mailman listinfo fork"

In [9]:
# Same message (row label 0) after preprocess(): stopwords removed, tokens lemmatized.
df.loc[0, 'Clean Text']

'spam tool dissident news since fact unsolicited mean recipient blamed mailing list russell turpin depends list collected even sender say list collected better put website way surfed anonymously clutter inbox work way website opt spam opt visit samizdat site get trouble get samizdat spam worst said might read long mailer send individual clearly opt like party official recipient get trouble requesting mail plus much harder block spam web site come surprize spam speech may sleazy lucas http xent com mailman listinfo fork'

# Vectorizing, splitting data, and training Naive Bayes

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [14]:
# Target: the `label` column of the frame.
y = df['label']

# Features: TF-IDF weights learned from the cleaned text.
# NOTE(review): the vectorizer is fitted on ALL rows here, before the
# train/test split in the next cell, so vocabulary and IDF statistics from the
# held-out rows leak into the features. Consider fitting on the training
# texts only (e.g. split first, or wrap vectorizer + model in a Pipeline).
vector = TfidfVectorizer()
X = vector.fit_transform(raw_documents=df['Clean Text'])

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Baseline classifier: multinomial Naive Bayes on the TF-IDF training features.
model_nb = MultinomialNB()
model_nb.fit(X=train_X, y=train_y)

In [19]:
# Naive Bayes predictions for the held-out rows.
pred = model_nb.predict(X=test_X)

In [20]:
# Fraction of held-out messages classified correctly by Naive Bayes.
# accuracy_score is documented as (y_true, y_pred); the original call passed
# them swapped. Accuracy is symmetric, so the value is unchanged, but the
# documented order keeps the call correct if the metric is ever replaced.
acc = accuracy_score(test_y, pred)
acc

0.901595744680851

# Logistic regression 

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Second model: logistic regression with default settings on the same training features.
model_lr = LogisticRegression()
model_lr.fit(X=train_X, y=train_y)

In [23]:
# Logistic-regression predictions for the held-out rows. This overwrites the
# `pred` and `acc` values left by the Naive Bayes cells.
pred = model_lr.predict(test_X)
# accuracy_score is documented as (y_true, y_pred); the original call passed
# them swapped. The value is the same because accuracy is symmetric.
acc = accuracy_score(test_y, pred)
acc

0.9574468085106383

# Making predictions

In [38]:
def predcit_txt(text):
    """Classify one raw message as ``'ham'`` or ``'spam'``.

    The text is cleaned with ``preprocess``, turned into a feature row by the
    fitted ``vector`` and scored by ``model_lr``.

    Args:
        text: Raw message text as a ``str``.

    Returns:
        ``'ham'`` when the predicted label equals 0, otherwise ``'spam'``.
    """
    cleaned = preprocess(text)
    # transform() expects an iterable of documents, hence the one-item list.
    features = vector.transform([cleaned])
    # Named `label` so it does not shadow the notebook-level `pred`.
    label = model_lr.predict(features)[0]
    return 'ham' if label == 0 else 'spam'


# Correctly spelled alias. The original, misspelled name stays defined so
# existing calls keep working.
predict_txt = predcit_txt
# Smoke test: classify one sample message; the cell's last expression is displayed.
sample_email = "attention this is a must for all computer users new special package deal nor ton systemworks escapenumber software suite professional edition includes six yes escapenumber feature packed utilities all for escapenumber special low price this software will protect your computer from unwanted and hazardous viruses help secure your private valuable information allow you to transfer files and send e mails safely backup your all your data quick and easily improve your pc's performance w superior integral diagnostics escapenumber feature packed utilities escapenumber great price a escapenumber combined retail value yours for only escapenumber escapenumber includes free shipping don't fall prey to destructive viruses or hackers protect your computer and your valuable informat ion and gt click here to order yours now click here for more information your email address was obtained from an opt in list opt in mrsa list purchase code escapenumber escapenumber escapenumber if you wish to be unsubs cribed from this list please click here and press send to be removed if you have previously unsubs cribed and are still receiving this message you may email our spam abuse control center we do not condone spam in any shape or for m thank you kindly for your cooperation"
predcit_txt(sample_email)

'spam'