# Assignment 1

### Ayush Yadav (MDS202315)

Build a prototype for sms spam classification <br>

In `prepare.ipynb` write the functions to 
1) Load the data from a given file path
2) Preprocess the data (if needed)
3) Split the data into train/validation/test 
4) Store the splits at train.csv/validation.csv/test.csv



In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from scipy import sparse
from scipy.sparse import save_npz
import pickle

#### 1. Loading the Data

In [79]:
data = pd.read_csv('./sms+spam+collection/SMSSpamCollection', sep='\t', names=["label", "message"])
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [80]:
data.groupby('label').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [81]:
data['length'] = data['message'].map(lambda text: len(text))
data.length.describe()

count    5572.000000
mean       80.489950
std        59.942907
min         2.000000
25%        36.000000
50%        62.000000
75%       122.000000
max       910.000000
Name: length, dtype: float64

#### 2. Data Preprocessing

In [82]:
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  ### Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text) ### Remove punctuation and non-alphabetic characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in STOPWORDS]) ### Lemmatize and Remove Stopwords

    return text

In [83]:
data['preprocessed'] = data.message.apply(preprocess_text)
data['label'] = data.label.map({'ham':0,'spam':1})
data.drop(columns=['length'], inplace=True)
data

Unnamed: 0,label,message,preprocessed
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u pound prize claim ea...
5568,0,Will ü b going to esplanade fr home?,b going esplanade fr home
5569,0,"Pity, * was in mood for that. So...any other s...",pity mood soany suggestion
5570,0,The guy did some bitching but I acted like i'd...,guy bitching acted like id interested buying s...


In [91]:
### Tokenize and Creating Vocabulary

tokens = word_tokenize(" ".join(data['preprocessed']))
bag_of_words = CountVectorizer().fit(tokens)

vocab = bag_of_words.vocabulary_
print("Vocabulary Size:",len(vocab))

Vocabulary Size: 7947


In [73]:
with open('./sms+spam+collection/bag_of_words.pkl','wb') as f:
    pickle.dump(bag_of_words,f)

In [104]:
bow_msgs = bag_of_words.transform(data['message'])
print('sparse matrix shape:', bow_msgs.shape)
print('number of non-zeros:', bow_msgs.nnz)
print('sparsity: %.2f%%' % (100.0 * bow_msgs.nnz / (bow_msgs.shape[0] * bow_msgs.shape[1])))

sparse matrix shape: (5572, 7947)
number of non-zeros: 44810
sparsity: 0.10%


In [74]:
bow_msgs.shape

(5572, 7947)

In [180]:
DATASET = data['preprocessed']
LABELS = data['label'].values

#### 3. Splitting the Data

In [181]:
X, X_test, Y, Y_test = train_test_split(DATASET, LABELS, test_size=0.15, stratify=LABELS)

In [182]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, stratify=Y)

In [183]:
print("Train Size:", len(X_train))
print("Validation Size:", len(X_val))
print("Test Size:", len(X_test))

Train Size: 3788
Validation Size: 948
Test Size: 836


In [184]:
TRAIN_DF = pd.DataFrame(X_train)
TRAIN_DF['label'] = Y_train

VAL_DF = pd.DataFrame(X_val)
VAL_DF['label'] = Y_val

TEST_DF = pd.DataFrame(X_test)
TEST_DF['label'] = Y_test

In [185]:
TRAIN_DF

Unnamed: 0,preprocessed,label
5305,hi missed call mumhas beendropping red wine th...,0
3176,ur going bahamas callfreefone speak live opera...,1
3431,youve always brainy one,0
2161,specialisation work slave labor look month sha...,0
2404,jesus christ bitch im trying give drug answer ...,0
...,...,...
1148,ok help ask shes working tmr,0
2665,remains bro amongst bros,0
4630,saturday sunday holiday difficult,0
4475,want explicit sex sec ring cost pmin,1


In [186]:
VAL_DF

Unnamed: 0,preprocessed,label
1243,nobody decide eat dad want chinese,0
3299,message free welcome new improved sex dogging ...,1
5164,congrats mobile g videophones r call videochat...,1
1079,convey regard,0
2391,first gained ltgt kg since took second done bl...,0
...,...,...
1420,speak feel word better silence gud mrng,0
5455,wishing beautiful day moment revealing even th...,0
3251,babe need advice,0
197,u got person story,0


In [187]:
TEST_DF

Unnamed: 0,preprocessed,label
2524,sorry ill call later,0
944,also ive sorta blown couple time recently id r...,0
4009,forgot working today wanna chat thing ok drop ...,0
5294,xmas iscoming ur awarded either cd gift vouche...,1
42,rodger burn msg tried call reply sm free nokia...,1
...,...,...
3156,ok,0
2185,know people hit fuck yes,0
4891,sound like could lot time spent chastity devic...,0
4169,congrats nokia video camera phone call call co...,1


#### 4. Saving the Splits in Separate Files

In [190]:
TRAIN_DF.to_csv('./sms+spam+collection/TRAIN.csv', index=False)
VAL_DF.to_csv('./sms+spam+collection/VALIDATION.csv', index=False)
TEST_DF.to_csv('./sms+spam+collection/TEST.csv', index=False)