In [172]:
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [173]:
# Read the labelled e-mail corpus from CSV and preview the first rows.
DATA_PATH='/content/spam_ham_dataset.csv'
df=pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [174]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5171, 4)

In [175]:
# Count messages per numeric label to see how balanced the two classes are.
df['label_num'].value_counts()

0    3672
1    1499
Name: label_num, dtype: int64

In [176]:
# Remove fully identical rows in place (this mutates `df`), then show the
# shape again so any change in the row count is visible.
df.drop_duplicates(inplace=True)
df.shape

(5171, 4)

In [177]:
# Count missing values per column (this only inspects; nothing is removed or replaced).
df.isna().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [178]:
# Make sure the NLTK stop-word corpus is available locally, then show the English list.
nltk.download('stopwords')
english_stopword_list=stopwords.words('english')
print(english_stopword_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [179]:
# Walk through the cleaning steps on a single message (row label 3) before
# wrapping them in a reusable function.
prot_stemmer=PorterStemmer()
# Build the stop-word set once: calling stopwords.words() inside the loop
# rebuilds the whole list for every token and makes membership tests linear.
english_stopwords=set(stopwords.words('english'))

test=df.text[3]
print(test)

# Lower-case, then delete every character listed in string.punctuation.
test=test.lower().translate(str.maketrans('','',string.punctuation))
# Drop stop words, stem what is left, and glue the tokens back together.
test=' '.join(prot_stemmer.stem(word) for word in test.split() if word not in english_stopwords)
print(test)



Subject: photoshop , windows , office . cheap . main trending
abasements darer prudently fortuitous undergone
lighthearted charm orinoco taster
railroad affluent pornographic cuvier
irvin parkhouse blameworthy chlorophyll
robed diagrammatic fogarty clears bayda
inconveniencing managing represented smartness hashish
academies shareholders unload badness
danielson pure caffein
spaniard chargeable levin

subject photoshop window offic cheap main trend abas darer prudent fortuit undergon lightheart charm orinoco taster railroad affluent pornograph cuvier irvin parkhous blameworthi chlorophyl robe diagrammat fogarti clear bayda inconvenienc manag repres smart hashish academi sharehold unload bad danielson pure caffein spaniard chargeabl levin


In [180]:
def _english_stopword_set():
  """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
  cached=getattr(_english_stopword_set,'_cached',None)
  if cached is None:
    # stopwords.words() hands back a freshly built list on each call, so keep
    # the result on the function object and reuse it for every message.
    cached=frozenset(stopwords.words('english'))
    _english_stopword_set._cached=cached
  return cached


def pre_process(text) :
  """Normalise one message into a space-separated string of stemmed tokens.

  Steps, in order: lower-case the text, delete every character found in
  ``string.punctuation``, split on whitespace, discard English stop words,
  and stem the remaining tokens with the notebook-level ``prot_stemmer``.

  Args:
    text: The raw message as a ``str``.

  Returns:
    The cleaned tokens joined by single spaces (an empty string when no
    token survives).
  """
  stop_words=_english_stopword_set()
  # A translation table that maps each punctuation character to "deleted".
  stripped=text.lower().translate(str.maketrans('','',string.punctuation))
  stems=[prot_stemmer.stem(word) for word in stripped.split() if word not in stop_words]
  return ' '.join(stems)


In [181]:
# Replace every raw message with its cleaned, stemmed form and preview the result.
# NOTE(review): the raw text is overwritten, so re-running this cell would
# clean already-cleaned text.
df['text']=df['text'].apply(pre_process)
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,subject enron methanol meter 988291 follow not...,0
1,2349,ham,subject hpl nom januari 9 2001 see attach file...,0
2,3624,ham,subject neon retreat ho ho ho around wonder ti...,0
3,4685,spam,subject photoshop window offic cheap main tren...,1
4,2030,ham,subject indian spring deal book teco pvr reven...,0


In [182]:
# Separate the cleaned text (features) from the numeric label (target).
X,Y=df['text'],df['label_num']
X.head()

0    subject enron methanol meter 988291 follow not...
1    subject hpl nom januari 9 2001 see attach file...
2    subject neon retreat ho ho ho around wonder ti...
3    subject photoshop window offic cheap main tren...
4    subject indian spring deal book teco pvr reven...
Name: text, dtype: object

In [183]:
# Learn the TF-IDF vocabulary and weights, then encode every message as a sparse row.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so its IDF statistics also see the held-out messages;
# fit on the training split only if a strictly leak-free evaluation is needed.
vectorizer=TfidfVectorizer(min_df=1).fit(X)
X=vectorizer.transform(X)
print(X)

  (0, 42373)	0.17928354700765012
  (0, 40295)	0.11133740679255401
  (0, 36647)	0.03877439705152389
  (0, 32691)	0.18250440428730974
  (0, 31724)	0.19585743095584732
  (0, 31528)	0.14232071688023554
  (0, 31207)	0.1810071372992117
  (0, 31165)	0.22634331997567883
  (0, 30880)	0.18311988383003322
  (0, 30648)	0.07748104188050535
  (0, 29342)	0.2851387795965185
  (0, 28584)	0.21289360827350798
  (0, 28291)	0.14501134958545445
  (0, 27759)	0.09558328092223306
  (0, 26956)	0.16137683077847215
  (0, 26359)	0.2163891627029953
  (0, 26355)	0.10957571314328682
  (0, 18396)	0.2194318522773983
  (0, 18205)	0.09734228431958361
  (0, 17626)	0.11522833897332038
  (0, 17529)	0.12898133776238715
  (0, 15968)	0.08773755555581961
  (0, 15326)	0.22188157609415077
  (0, 13304)	0.17872563238861228
  (0, 13253)	0.10130704032981865
  :	:
  (5170, 18135)	0.05663525565698476
  (5170, 17866)	0.09765121962441027
  (5170, 17626)	0.04003575216559987
  (5170, 17289)	0.06534665031392696
  (5170, 16808)	0.08477851994

In [184]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,random_state=30,test_size=0.2)
# NOTE(review): on a SciPy sparse matrix `.size` appears to report stored
# entries rather than rows - confirm; `.shape[0]` gives the sample count.
print(X_test.size,X_train.size)

67960 278620


In [185]:
# Fit a logistic-regression classifier with library defaults on the training split.
# (LogisticRegression comes from the notebook's import cell; the duplicate
# import that used to sit here was redundant.)
model=LogisticRegression()
model.fit(X_train,Y_train)


In [186]:
# Accuracy on the rows the model was fitted on; a large gap to the held-out
# accuracy would indicate over-fitting.
# accuracy_score is imported with the other dependencies in the import cell.
train_data_predic = model.predict(X_train)
train_data_acc = accuracy_score(Y_train, train_data_predic)

print(train_data_acc)

0.9946808510638298


In [187]:
# Accuracy on the held-out rows the model never saw during fitting.
test_data_predic=model.predict(X_test)
test_data_acc=accuracy_score(y_true=Y_test,y_pred=test_data_predic)
print(test_data_acc)

0.9855072463768116


In [189]:
# One raw message to classify by hand: the same wording as the row-3 sample
# printed earlier, without its leading "Subject:" prefix.
check=["photoshop , windows , office . cheap . main trending abasements darer prudently fortuitous undergone lighthearted charm orinoco taster railroad affluent pornographic cuvier irvin parkhouse blameworthy chlorophyll robed diagrammatic fogarty clears bayda inconveniencing managing represented smartness hashish academies shareholders unload badness danielson pure caffein spaniard chargeable levin"]


In [190]:
# The model was trained on text cleaned by pre_process (punctuation stripped,
# stop words removed, tokens stemmed), so new messages must go through the
# same cleaning before vectorising; otherwise their tokens do not line up
# with the learned vocabulary.
check=vectorizer.transform([pre_process(message) for message in check])
pred=model.predict(check)
print(pred)

[1]
