In [31]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('spam.csv')

In [57]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
re = df.Category.value_counts()

In [8]:
re

Category
ham     4825
spam     747
Name: count, dtype: int64

In [9]:
df['spam'] = df['Category'].apply(lambda x:1 if x =='spam' else 0)

In [10]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [11]:
df['spam'].value_counts()

spam
0    4825
1     747
Name: count, dtype: int64

## Split the test sets

In [12]:
from sklearn.model_selection import train_test_split
X_train , X_test, y_train , y_test = train_test_split(df.Message , df.spam , test_size = 0.2)

In [13]:
X_train.shape

(4457,)

In [15]:
X_test.shape

(1115,)

## Converting the Categorical Data into number_type using CountVectorizer

## How It Works:
1. **Tokenization:** Splits the text into individual words (tokens).
2. **Vocabulary Creation:** Assigns a unique index to each unique token.
3. **Count Encoding:** Represents each document as a vector, where each value indicates the frequency of a word in the document.

## Example Usage:
```python

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7826 sparse matrix of type '<class 'numpy.int64'>'
	with 59423 stored elements in Compressed Sparse Row format>

In [21]:
X_train_cv.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [22]:
X_train_cv.shape

(4457, 7826)

In [24]:
v.get_feature_names_out().shape

(7826,)

In [34]:
X_train_np = X_train_cv.toarray()

In [35]:
X_train_np[:4][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## test the train sets

In [36]:
np.where(X_train_np[0] != 0)

(array([ 903, 1537, 1929, 2169, 3231, 3437, 4062, 4074, 4448, 4804, 5526,
        6115, 6151, 6897, 6999, 7589], dtype=int64),)

In [41]:
X_train[:1][2448]

'Tmr then ü brin lar... Aiya later i come n c lar... Mayb ü neva set properly ü got da help sheet wif ü...'

In [45]:
X_train_np[0][1929]

1

## Train the Model

In [48]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_cv , y_train)

In [49]:
X_test_cv = v.transform(X_test)

# classification_report of the model

In [51]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_cv)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       976
           1       0.98      0.92      0.95       139

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



## Test The Model

In [58]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)