In [3]:
import numpy as np # import numpy
import pandas as pd # import pandas

# Read the SMS spam dataset, passing the file encoding explicitly.
df = pd.read_csv(
    '../data/spam.csv',
    encoding='latin-1',
)

# Preview the first five rows.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Drop every column from position 2 onward (the three trailing
# 'Unnamed: N' columns in this dataset), keeping only v1 and v2.
# The labels are passed as an Index through `columns=` instead of handing
# `drop` a DataFrame slice, which only worked because iterating a DataFrame
# yields its column names.
df = df.drop(columns=df.columns[2:])

# Preview the first five rows.
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Mapping from the original column names to descriptive ones.
new_cols = {'v1': 'Labels', 'v2': 'SMS'}

# Apply the mapping along the column axis.
df = df.rename(new_cols, axis='columns')

# Inspect the result.
df.head()

Unnamed: 0,Labels,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Number of messages per label.
print(df['Labels'].value_counts())
print('\n')

# Dataset summary (index range, columns, non-null counts, dtypes).
# DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() emits a stray "None" line.
df.info()
print('\n')

# Descriptive statistics for each column.
print(df.describe())

ham     4825
spam     747
Name: Labels, dtype: int64


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Labels  5572 non-null   object
 1   SMS     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


       Labels                     SMS
count    5572                    5572
unique      2                    5169
top       ham  Sorry, I'll call later
freq     4825                      30


In [7]:
# Numeric encoding for the class labels: spam -> 1, ham -> 0.
new_labels = {
    'spam': 1,
    'ham': 0
}

# Encode the labels. Values that are not keys of the mapping are passed
# through unchanged, so re-running this cell on already-encoded labels is a
# no-op; a plain dict-based .map() would turn every label into NaN on the
# second run.
df['Labels'] = df['Labels'].map(lambda label: new_labels.get(label, label))

# Fail loudly if anything other than the encoded classes is left in the
# column (e.g. an unexpected label in the raw data).
assert df['Labels'].isin(list(new_labels.values())).all(), \
    'Labels column contains values outside the spam/ham mapping'

# Inspect the result.
df.head()

Unnamed: 0,Labels,SMS
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Features: the SMS text column as a NumPy array.
# .to_numpy() is used instead of .values because it always returns an
# ndarray, whereas .values may return an extension array depending on the
# column dtype.
X = df['SMS'].to_numpy()

# Targets: the encoded label column as a NumPy array.
y = df['Labels'].to_numpy()

In [9]:
from sklearn.model_selection import train_test_split # u/ split data train dan test
from sklearn.feature_extraction.text import CountVectorizer # tokenisasi dataset text

# Hold out 20% of the samples for testing; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

# Bag-of-words vectoriser that converts text into token-count features.
bow = CountVectorizer()

# Learn the vocabulary from the training messages only and encode them.
X_train = bow.fit_transform(X_train)

# Encode the test messages with the vocabulary learned above.
X_test = bow.transform(X_test)

In [11]:
# Report the number of unique tokens in the fitted vocabulary, followed by
# the dimensions of the vectorised training data (one item per line).
print(
    len(bow.get_feature_names_out()),
    f'Dimensi data: {X_train.shape}',
    sep='\n',
)

7727
Dimensi data: (4457, 7727)


In [12]:
from sklearn.naive_bayes import MultinomialNB # u/ melakukan klasifikasi naive bayes pada data multinomial
from sklearn.metrics import accuracy_score # u/ evaluasi skor akurasi model

# Create the multinomial Naive Bayes classifier and fit it on the
# vectorised training messages.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# Predict on both partitions.
y_pred_train = mnb.predict(X_train)
y_pred_test = mnb.predict(X_test)

# Accuracy on the training data and on the held-out test data.
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)

# Report both scores.
print(f'Hasil akurasi data train: {acc_train}')
print(f'Hasil akurasi data test: {acc_test}')

Hasil akurasi data train: 0.9946152120260264
Hasil akurasi data test: 0.9775784753363229
