In [3]:
import sklearn
import numpy as np
import matplotlib as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [4]:
spam = pd.read_csv('./datasets/spam.csv', encoding='latin-1')

In [5]:
spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
spam['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [13]:
c = ' '.join(spam.loc[spam['v1'] == 'ham', 'v2'])   # join: 한줄로 연결 (공백으로 각 문장 구별)

In [14]:
l = c.split(' ')
l[:10]

['Go',
 'until',
 'jurong',
 'point,',
 'crazy..',
 'Available',
 'only',
 'in',
 'bugis',
 'n']

In [63]:
from collections import Counter

counter1 = Counter(l)

In [19]:
counter1.most_common(20)

[('to', 1530),
 ('you', 1458),
 ('I', 1436),
 ('the', 1019),
 ('a', 969),
 ('and', 738),
 ('i', 736),
 ('in', 734),
 ('u', 645),
 ('is', 638),
 ('my', 619),
 ('', 597),
 ('me', 537),
 ('of', 498),
 ('for', 475),
 ('that', 398),
 ('it', 375),
 ('your', 373),
 ('on', 352),
 ('have', 346)]

In [50]:
# df1 = pd.DataFrame.from_dict([counter1.keys(), counter1.values()]).T

# type(counter1)
df1 = pd.DataFrame(list(counter1.items()))


In [51]:
df1.columns = ['words in non-spam', 'count']

In [52]:
df1.head()

Unnamed: 0,words in non-spam,count
0,Go,10
1,until,21
2,jurong,1
3,"point,",1
4,crazy..,1


In [59]:
df1

Unnamed: 0,words in non-spam,count
0,Go,10
1,until,21
2,jurong,1
3,"point,",1
4,crazy..,1
...,...,...
12475,"Pity,",1
12476,So...any,1
12477,suggestions?,1
12478,bitching,1


In [56]:
c = ' '.join(spam.loc[spam['v1'] == 'spam', 'v2']) 
l = c.split(' ')
counter1 = Counter(l)
df2 = pd.DataFrame(list(counter1.items()))
df2.columns = ['words in spam', 'count']
df2.head()

# 여기까지 그냥 데이터 보여주기 위해 만든 것 - CounterVectorizer 쓰면 금방 해결

Unnamed: 0,words in spam,count
0,Free,35
1,entry,25
2,in,64
3,2,169
4,a,358


In [58]:
from sklearn import feature_extraction

fe = feature_extraction.text.CountVectorizer(stop_words='english')  # stop_words: 관사 없애기

In [64]:
X = fe.fit_transform(spam['v2'])
X.shape

(5572, 8404)

In [65]:
y = spam['v1'].map({'spam':1, 'ham':0})

In [66]:
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: v1, Length: 5572, dtype: int64

In [69]:
## 테스트 / 학습

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=2022, stratify=y)

In [72]:
from sklearn.naive_bayes import MultinomialNB  # 범주형 - mutinomia, 연속형 - gaussian

clf = MultinomialNB()
clf.fit(X_train, y_train)

In [73]:
y_pred = clf.predict(X_test)

In [78]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

0.9808612440191388
0.9033613445378151


In [80]:
## Support Vector
from sklearn import svm

clf = svm.SVC()
clf.fit(X_train, y_train)

In [81]:
y_pred = clf.predict(X_test)

In [82]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

0.9766746411483254
1.0
