#  SMS Classifier

### Importing

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# NOTE(review): `model` is not referenced again anywhere in the visible
# notebook; the classifier that is actually trained and evaluated below is
# MultinomialNB. Confirm it is unused before removing it.
model = LogisticRegression()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

## Loading dataset 

In [2]:
# Load the SMS collection. The file is tab-separated and has no header row,
# so the two column names are supplied explicitly.
# NOTE(review): with pandas' default quote handling, a message containing an
# unbalanced double quote can swallow the lines that follow it. Compare the
# row count with the raw file's line count and consider
# quoting=csv.QUOTE_NONE if they differ.
data = pd.read_csv(
    "SMSSpamCollection.csv",
    sep="\t",
    header=None,
    names=["label", "message"],
)
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Data preprocessing 

In [3]:
# Preview the first five rows to sanity-check that the label and message
# columns were parsed as expected
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Preview the last five rows to confirm the file was read through to the end
data.tail()

Unnamed: 0,label,message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [5]:
# Summary of the frame: index range, per-column non-null counts, dtypes and
# approximate memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
# Data type of each column
data.dtypes

label      object
message    object
dtype: object

In [7]:
# Element-wise missing-value mask (True where a cell is null).
# NOTE(review): the full mask is hard to scan by eye; the per-column totals
# from data.isnull().sum() are the more useful summary.
data.isnull()

Unnamed: 0,label,message
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
5567,False,False
5568,False,False
5569,False,False
5570,False,False


In [8]:
# Count the missing values in each column (0 means the column is complete)
data.isnull().sum()

label      0
message    0
dtype: int64

In [9]:
# Dimensions of the data as a (rows, columns) tuple
data.shape

(5572, 2)

In [10]:
# Flag rows that repeat an earlier row (True = duplicate of a previous row).
# NOTE(review): the truncated mask shown below cannot tell whether any
# duplicates exist; data.duplicated().sum() would report the actual count.
data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
5567    False
5568    False
5569    False
5570    False
5571    False
Length: 5572, dtype: bool

In [11]:
# List the column labels of the frame
data.columns

Index(['label', 'message'], dtype='object')

## Training dataset 

In [12]:
# Encode the string labels as integers: 0 = ham (non-spam), 1 = spam.
label_codes = {'ham': 0, 'spam': 1}

# Mapping unconditionally is not idempotent: on a second execution of this
# cell the already-encoded 0/1 values are not keys of the dict, so every label
# would become NaN. Only map while the column still holds the raw strings.
if data['label'].isin(list(label_codes)).all():
    data['label'] = data['label'].map(label_codes)

# Fail loudly on anything that is neither a raw nor an encoded label instead
# of silently handing NaN targets to the classifier.
if not data['label'].isin(list(label_codes.values())).all():
    raise ValueError("Unexpected values in 'label'; expected only 'ham' or 'spam'.")

# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'], data['label'], test_size=0.2, random_state=42
)

In [13]:
# Bag-of-words features: learn the vocabulary from the training messages only,
# then encode the held-out messages with that same vocabulary.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes model on the training counts
# (fit() returns the estimator itself, so it can be chained).
classifier = MultinomialNB().fit(X_train_vectorized, y_train)

# Predict a label for every held-out message
predictions = classifier.predict(X_test_vectorized)


In [14]:
# Score the held-out predictions: overall accuracy plus the per-class
# precision / recall / F1 breakdown.
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
print(f'Accuracy: {accuracy}', f'Classification Report:\n{report}', sep='\n')

Accuracy: 0.9919282511210762
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



### Counting the number of spam and non-spam SMS messages 

In [15]:
# Class balance of the full dataset (label encoding: 0 = non-spam, 1 = spam)
is_spam = data["label"] == 1
is_nonspam = data["label"] == 0
spam_count = len(data[is_spam])
nonspam_count = len(data[is_nonspam])

# Report both totals
print("Number of spam messages :", spam_count)
print("Number of non-spam messages :", nonspam_count)

Number of spam messages : 747
Number of non-spam messages : 4825


### Taking user input 

In [16]:
# Classify a single message typed in by the user, reusing the fitted
# vectorizer and classifier.
user_in = input('Enter SMS Message: ')
# transform() expects an iterable of documents, hence the one-element list
user_in_vectorized = vectorizer.transform([user_in])
prediction = classifier.predict(user_in_vectorized)

# prediction holds one entry; label 1 means spam
verdict = (
    ' Hey! It is a Spam SMS..!'
    if prediction[0] == 1
    else 'Dont worry! It is a Non-Spam SMS..!'
)
print(verdict)

Enter SMS Message: Rofl. Its true to its name
Dont worry! It is a Non-Spam SMS..!
