# Data Cleaning for Project 

In [1]:
import pandas as pd
import numpy as np

## Import Datasets

In [2]:
ds1 = pd.read_csv("data/train.csv")

In [3]:
ds2 = pd.read_csv("data/Mendeley/Dataset_5971.csv")

In [4]:
ds1.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
ds2.head()

Unnamed: 0,LABEL,TEXT,URL,EMAIL,PHONE
0,ham,Your opinion about me? 1. Over 2. Jada 3. Kusr...,No,No,No
1,ham,What's up? Do you want me to come online? If y...,No,No,No
2,ham,So u workin overtime nigpun?,No,No,No
3,ham,"Also sir, i sent you an email about how to log...",No,No,No
4,Smishing,Please Stay At Home. To encourage the notion o...,No,No,No


### Add Label column for Dataset 2 similar to Dataset 1

In [6]:
ds2["label"] =  np.where(ds2['LABEL']=='Smishing', 1, 0)

In [7]:
ds2.head()

Unnamed: 0,LABEL,TEXT,URL,EMAIL,PHONE,label
0,ham,Your opinion about me? 1. Over 2. Jada 3. Kusr...,No,No,No,0
1,ham,What's up? Do you want me to come online? If y...,No,No,No,0
2,ham,So u workin overtime nigpun?,No,No,No,0
3,ham,"Also sir, i sent you an email about how to log...",No,No,No,0
4,Smishing,Please Stay At Home. To encourage the notion o...,No,No,No,1


#### Counting the number of smishing attacks in both datasets

In [8]:
ds1[ds1["label"] == 1]["label"].count()

747

In [9]:
ds2[ds2["label"] == 1]["label"].count()

616

Number of total entries in both datasets:

In [10]:
ds1.count()

sms      5574
label    5574
dtype: int64

In [11]:
ds2.count()

LABEL    5971
TEXT     5971
URL      5971
EMAIL    5971
PHONE    5971
label    5971
dtype: int64

#### Combining both datasets

In [12]:
ds2 = ds2.drop(["LABEL", "URL", "EMAIL", "PHONE"], axis = 1)

In [13]:
ds2.head()

Unnamed: 0,TEXT,label
0,Your opinion about me? 1. Over 2. Jada 3. Kusr...,0
1,What's up? Do you want me to come online? If y...,0
2,So u workin overtime nigpun?,0
3,"Also sir, i sent you an email about how to log...",0
4,Please Stay At Home. To encourage the notion o...,1


In [14]:
ds1 = ds1.rename(columns = {'sms': 'TEXT'})

In [15]:
ds1.head()

Unnamed: 0,TEXT,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [16]:
ds3 = pd.concat([ds1, ds2])

In [17]:
ds3 = ds3.reset_index(drop = True)

In [18]:
ds3

Unnamed: 0,TEXT,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
11540,:( but your not here....,0
11541,Becoz its &lt;#&gt; jan whn al the post ofic...,0
11542,Its a valentine game. . . send dis msg to all ...,0
11543,We r outside already.,0


In [19]:
# Checking for any missing values

ds3.isna().sum()

TEXT     0
label    0
dtype: int64

### Data Cleaning

In [20]:
# converitng all strings into lowercase. 
ds3["TEXT"] = ds3["TEXT"].str.lower()

In [21]:
ds3.head()

Unnamed: 0,TEXT,label
0,"go until jurong point, crazy.. available only ...",0
1,ok lar... joking wif u oni...\n,0
2,free entry in 2 a wkly comp to win fa cup fina...,1
3,u dun say so early hor... u c already then say...,0
4,"nah i don't think he goes to usf, he lives aro...",0


 Number of total smishing attacks:

In [22]:
print(ds3['label'].value_counts()[1])

1363


Checking for Strings that Contain URL

In [23]:
# Creating a new column called contains URL. This column will look for substrings that have characteristics of a URL
ds3["contains_url"] = ds3["TEXT"].str.contains('www.|http|/|.com', regex = True)

In [24]:
ds3.head()

Unnamed: 0,TEXT,label,contains_url
0,"go until jurong point, crazy.. available only ...",0,False
1,ok lar... joking wif u oni...\n,0,False
2,free entry in 2 a wkly comp to win fa cup fina...,1,True
3,u dun say so early hor... u c already then say...,0,False
4,"nah i don't think he goes to usf, he lives aro...",0,False


In [25]:
# Converting Boolean values into 1s and 0s and dropping contain_url column
ds3["Contains URL"] = np.where(ds3["contains_url"] == True, 1, 0)
ds3 = ds3.drop(["contains_url"], axis = 1)

In [26]:
ds3.head()

Unnamed: 0,TEXT,label,Contains URL
0,"go until jurong point, crazy.. available only ...",0,0
1,ok lar... joking wif u oni...\n,0,0
2,free entry in 2 a wkly comp to win fa cup fina...,1,1
3,u dun say so early hor... u c already then say...,0,0
4,"nah i don't think he goes to usf, he lives aro...",0,0


Number of Text that contain URL

In [27]:
print(ds3['Contains URL'].value_counts()[1])

1656


Checking for any strings that contain substring relating to monetary terms

In [28]:
ds3["Contains Monetary Terms"] = ds3["TEXT"].str.contains('prize|cash|money|dollars|pounds|jackpot|txt|urgent|account|po box|free|compensation|claim|tax', regex = True)

In [29]:
ds3["Contains Monetary Characters"] = np.where(ds3["Contains Monetary Terms"] == True, 1, 0)

In [30]:
ds3 = ds3.drop(["Contains Monetary Terms"], axis = 1)

In [31]:
ds3.head(10)

Unnamed: 0,TEXT,label,Contains URL,Contains Monetary Characters
0,"go until jurong point, crazy.. available only ...",0,0,0
1,ok lar... joking wif u oni...\n,0,0,0
2,free entry in 2 a wkly comp to win fa cup fina...,1,1,1
3,u dun say so early hor... u c already then say...,0,0,0
4,"nah i don't think he goes to usf, he lives aro...",0,0,0
5,freemsg hey there darling it's been 3 week's n...,1,0,1
6,even my brother is not like to speak with me. ...,0,0,0
7,as per your request 'melle melle (oru minnamin...,0,0,0
8,winner!! as a valued network customer you have...,1,0,1
9,had your mobile 11 months or more? u r entitle...,1,0,1


Number of Texts that contain monetary terms

In [32]:
print(ds3['Contains Monetary Characters'].value_counts()[1])

1541


Adding a column called classification for easier interpretation

In [33]:
ds3["Classification"] = np.where(ds3["label"] == 1, "spam", "not spam")

In [34]:
ds3.head()

Unnamed: 0,TEXT,label,Contains URL,Contains Monetary Characters,Classification
0,"go until jurong point, crazy.. available only ...",0,0,0,not spam
1,ok lar... joking wif u oni...\n,0,0,0,not spam
2,free entry in 2 a wkly comp to win fa cup fina...,1,1,1,spam
3,u dun say so early hor... u c already then say...,0,0,0,not spam
4,"nah i don't think he goes to usf, he lives aro...",0,0,0,not spam


## Machine Learning Implementation: 
Just a trial basis to test whether data cleaning was effective and working. 

### KNN Algorithm

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [36]:
X = ds3.drop(columns = ["Classification", "TEXT", "label"])
y = ds3["Classification"]

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [38]:
knn = KNeighborsClassifier()

In [39]:
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [40]:
print(round(knn.score(X_test, y_test),2))

0.88


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### Decison Tree Classifier

In [41]:
from sklearn.tree import DecisionTreeClassifier

In [42]:
model = DecisionTreeClassifier()

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [55]:
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [56]:
print(round(model.score(X_test, y_test), 2))

0.93


### Naive Bayes Classification

#### TFIDF Vectorizer

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [46]:
from sklearn.naive_bayes import GaussianNB

In [47]:
vectorizer = TfidfVectorizer()

In [48]:
dx = ds3["TEXT"]
dy = ds3["label"]

In [50]:
X = vectorizer.fit_transform(dx)

In [51]:
y = dy

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, stratify = y, random_state = 42)

In [59]:
nb = GaussianNB()

In [60]:
nb.fit(X_train, y_train)

GaussianNB()

In [61]:
nb.score(X_test, y_test)

0.8794596466920679

In [70]:
predicted_values = nb.predict(X_test)