In [3]:
import pandas as pd

In [4]:
# Load the raw SMS spam dataset; the file is read as ISO-8859-1 (Latin-1) text.
df = pd.read_csv(
    './spam.csv',
    encoding='ISO-8859-1',
)

In [5]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# Column dtypes and non-null counts; shows how sparsely the 'Unnamed: *' columns are populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [7]:
# Drop the nearly empty trailing columns, keeping only v1 and v2.
# errors='ignore' keeps this cell idempotent: because `df` is rebound here,
# running the cell a second time would otherwise raise KeyError for columns
# that are already gone.
df = df.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)

In [8]:
# Give the two remaining columns descriptive names.
# Reassign the result instead of mutating with inplace=True: the outcome for
# `df` is the same, but the cell no longer alters an object in place that
# earlier cells may have displayed.
df = df.rename(columns={'v1': 'label', 'v2': 'text'})

In [9]:
# Check the result of the drop/rename steps: the frame should now show `label` and `text`.
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Encode the string class labels as integers: ham -> 0, spam -> 1.
label_mapping = {'ham': 0, 'spam': 1}

# Guard against re-execution: Series.map returns NaN for values that are not
# keys of the mapping, so running this cell again on already-encoded 0/1
# labels would wipe out the whole column. Only encode while the column still
# contains something other than the target codes.
if not df['label'].isin(list(label_mapping.values())).all():
    encoded_labels = df['label'].map(label_mapping)
    # Fail loudly on labels missing from the mapping instead of silently
    # passing NaN targets on to the train/test split and the models.
    if encoded_labels.isna().any():
        unknown_labels = df.loc[encoded_labels.isna(), 'label'].unique().tolist()
        raise ValueError(f"Unexpected label values: {unknown_labels}")
    df['label'] = encoded_labels


In [11]:
# Confirm that `label` now holds the integer codes from label_mapping.
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 33% of the messages for evaluation; random_state pins the shuffle
# so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.33,
    random_state=42,
)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [15]:
# Bag-of-words token counts, with the vocabulary capped at 2000 terms via max_features.
featurizer = CountVectorizer(max_features=2000)

In [16]:
# Fit the featurizer on the training texts only and encode them in the same step.
X_train_features = featurizer.fit_transform(X_train)

In [17]:
# Encode the test texts with the already-fitted featurizer (transform only, no refit),
# so nothing about the test split influences the learned vocabulary.
X_test_features = featurizer.transform(X_test)

In [18]:
# Multinomial Naive Bayes classifier with default hyperparameters.
nb_model = MultinomialNB()

In [19]:
# Train the Naive Bayes model on the training count features and labels.
nb_model.fit(X_train_features, y_train)

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
# Random forest trained on the same count features as the Naive Bayes model;
# random_state makes the fitted forest reproducible.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rf_model.fit(X_train_features, y_train)

In [22]:
# Predict labels for the held-out test features with the fitted Naive Bayes model.
nb_y_pred = nb_model.predict(X_test_features)

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [24]:
# Score the Naive Bayes predictions against the held-out labels.
nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_precision = precision_score(y_test, nb_y_pred)
nb_recall = recall_score(y_test, nb_y_pred)
nb_f1 = f1_score(y_test, nb_y_pred)

# Report every metric as "<name> <value>", one per line.
for metric_name, metric_value in (
    ("nb_accuracy", nb_accuracy),
    ("nb_precision", nb_precision),
    ("nb_recall", nb_recall),
    ("nb_f1", nb_f1),
):
    print(metric_name, metric_value)


nb_accuracy 0.9820554649265906
nb_precision 0.9620253164556962
nb_recall 0.9047619047619048
nb_f1 0.9325153374233128


In [25]:
# Look at the first rows again before plotting the label distribution.
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [26]:
# Requires matplotlib in the kernel's environment; if this import raises
# ModuleNotFoundError, install it first (e.g. `%pip install matplotlib`).
import matplotlib.pyplot as plt

# Class balance of the dataset. `label` is binary (0 = ham, 1 = spam), so a
# bar chart of the two counts is used instead of a 10-bin histogram over a
# continuous axis. reindex fixes the bar order and reports 0 for a missing class.
label_counts = df['label'].value_counts().reindex([0, 1], fill_value=0)

fig, ax = plt.subplots()
ax.bar(['ham', 'spam'], label_counts.to_numpy(), color='yellow')
ax.set_xlabel('ham/spam')
ax.set_ylabel('Frequency')
ax.set_title('Class distribution')
# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

ModuleNotFoundError: No module named 'matplotlib'