In [1]:
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import re 
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [2]:
file_test = r'C:\Users\paton\OneDrive\Desktop\dataset\SMS_test.csv'
file_train = r'C:\Users\paton\OneDrive\Desktop\dataset\SMS_train.csv'


In [3]:
data_1 = pd.read_csv(file_train, encoding="latin1")
data_2 = pd.read_csv(file_test, encoding="latin1")

In [4]:
data_1.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl. Its true to its name,Non-Spam
1,2,The guy did some bitching but I acted like i'd...,Non-Spam
2,3,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,4,Will ü b going to esplanade fr home?,Non-Spam
4,5,This is the 2nd time we have tried 2 contact u...,Spam


In [5]:
# Text preprocessing

In [6]:
data_1['Message_body'] = data_1['Message_body'].str.lower() 

In [7]:
nltk.download('stopwords')
en_stopword = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\paton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
data_1['Message_no_stopword'] = data_1['Message_body'].apply(lambda x:' '.join([word for word in x.split() if word not in en_stopword]))

In [9]:
data_1.head()

Unnamed: 0,S. No.,Message_body,Label,Message_no_stopword
0,1,rofl. its true to its name,Non-Spam,rofl. true name
1,2,the guy did some bitching but i acted like i'd...,Non-Spam,guy bitching acted like interested buying some...
2,3,"pity, * was in mood for that. so...any other s...",Non-Spam,"pity, * mood that. so...any suggestions?"
3,4,will ü b going to esplanade fr home?,Non-Spam,ü b going esplanade fr home?
4,5,this is the 2nd time we have tried 2 contact u...,Spam,2nd time tried 2 contact u. u £750 pound prize...


In [10]:
nltk.download('punkt_tab')
data_1['Message_Tokenize'] = data_1['Message_no_stopword'].apply(lambda x: word_tokenize(x))

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\paton\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [11]:
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
data_1['Message_Cleaned'] = data_1['Message_Tokenize'].apply(
    lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\paton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
data_1.head()

Unnamed: 0,S. No.,Message_body,Label,Message_no_stopword,Message_Tokenize,Message_Cleaned
0,1,rofl. its true to its name,Non-Spam,rofl. true name,"[rofl, ., true, name]","[rofl, ., true, name]"
1,2,the guy did some bitching but i acted like i'd...,Non-Spam,guy bitching acted like interested buying some...,"[guy, bitching, acted, like, interested, buyin...","[guy, bitching, acted, like, interested, buyin..."
2,3,"pity, * was in mood for that. so...any other s...",Non-Spam,"pity, * mood that. so...any suggestions?","[pity, ,, *, mood, that, ., so, ..., any, sugg...","[pity, ,, *, mood, that, ., so, ..., any, sugg..."
3,4,will ü b going to esplanade fr home?,Non-Spam,ü b going esplanade fr home?,"[ü, b, going, esplanade, fr, home, ?]","[ü, b, going, esplanade, fr, home, ?]"
4,5,this is the 2nd time we have tried 2 contact u...,Spam,2nd time tried 2 contact u. u £750 pound prize...,"[2nd, time, tried, 2, contact, u., u, £750, po...","[2nd, time, tried, 2, contact, u., u, £750, po..."


In [13]:
data_1['Message_Cleaned'] = data_1['Message_Cleaned'].apply(lambda x: ' '.join(x))

In [14]:
# Countvectorizing 
from sklearn.feature_extraction.text import TfidfVectorizer 

In [15]:
tfidfvec = TfidfVectorizer()
cout_vector_fit = tfidfvec.fit_transform(data_1['Message_Cleaned'])
print (cout_vector_fit)

  (0, 2189)	0.6629498232183731
  (0, 2675)	0.5348011938730758
  (0, 1759)	0.5239133658599108
  (1, 1211)	0.25979987514553854
  (1, 483)	0.36131085077803854
  (1, 274)	0.3408834861409456
  (1, 1529)	0.21569369000486108
  (1, 1381)	0.32639004027837465
  (1, 561)	0.32639004027837465
  (1, 2384)	0.27542550528672394
  (1, 936)	0.29146922977871087
  (1, 1789)	0.2632757415395664
  (1, 2827)	0.23018713586743192
  (1, 1140)	0.3151480502822953
  (1, 1097)	0.20914160298234016
  (2, 1961)	0.44890603025260817
  (2, 1715)	0.44890603025260817
  (2, 2570)	0.30798848210265567
  (2, 2372)	0.34816473822632577
  (2, 349)	0.44890603025260817
  (2, 2486)	0.4235263131806921
  (3, 1173)	0.3948726567837269
  (3, 964)	0.5750791031575305
  (3, 1094)	0.6095405863709299
  (3, 1293)	0.3765898081502929
  :	:
  (952, 2629)	0.20679978290434176
  (952, 1797)	0.2664317891277223
  (952, 1239)	0.30455316394672116
  (952, 1309)	0.2940633107041306
  (952, 2893)	0.3180769368846815
  (952, 1024)	0.3371376242941809
  (952, 177

In [16]:
tfidf_bag = pd.DataFrame(cout_vector_fit.toarray(), columns = tfidfvec.get_feature_names_out()) 
print (tfidf_bag)

     000  0121   02  0207  02073162414   03   04   05   06  07099833605  ...  \
0    0.0   0.0  0.0   0.0          0.0  0.0  0.0  0.0  0.0          0.0  ...   
1    0.0   0.0  0.0   0.0          0.0  0.0  0.0  0.0  0.0          0.0  ...   
2    0.0   0.0  0.0   0.0          0.0  0.0  0.0  0.0  0.0          0.0  ...   
3    0.0   0.0  0.0   0.0          0.0  0.0  0.0  0.0  0.0          0.0  ...   
4    0.0   0.0  0.0   0.0          0.0  0.0  0.0  0.0  0.0          0.0  ...   
..   ...   ...  ...   ...          ...  ...  ...  ...  ...          ...  ...   
952  0.0   0.0  0.0   0.0          0.0  0.0  0.0  0.0  0.0          0.0  ...   
953  0.0   0.0  0.0   0.0          0.0  0.0  0.0  0.0  0.0          0.0  ...   
954  0.0   0.0  0.0   0.0          0.0  0.0  0.0  0.0  0.0          0.0  ...   
955  0.0   0.0  0.0   0.0          0.0  0.0  0.0  0.0  0.0          0.0  ...   
956  0.0   0.0  0.0   0.0          0.0  0.0  0.0  0.0  0.0          0.0  ...   

     younger  yours  yoville   yr  yumm

In [17]:
# Classification model 

In [20]:
model = LogisticRegression()
y = data_1['Label']

In [21]:
X_train,X_test,y_train,y_test = train_test_split(tfidf_bag,y,test_size=0.3, random_state=7) 

In [22]:
model.fit(X_train,y_train)

In [24]:
y_pred = model.predict(X_test)
accuracy_score(y_pred,y_test)

0.8854166666666666