# Spam_전처리완료.csv 파일
- MachineLearning 방법으로 분류해서 정확도 구하기

### CountVectorizer + NaiveBayes
### TfidVectorizer + LogistixRegression

In [11]:
from google.colab import files

up = files.upload()

Saving spam_전처리완료.csv to spam_전처리완료.csv


In [12]:
import pandas as pd

df = pd.read_csv('spam_전처리완료.csv')
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


- 텍스트 전처리

In [29]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [43]:
x = df.v2.values
y = df.v1.values

In [44]:
# 구둣점 제거, 소문자 변환
import re

def preprocessing(s):
    s = s.encode('utf8').decode('ascii', 'ignore')
    s = re.sub('[^a-z0-9 ]','', s.lower())
    return s

In [46]:
X_data = [preprocessing(sent) for sent in x]
X_data[1]

'ok lar joking wif u oni'

- Train/test

In [47]:
import tensorflow as tf
import numpy as np
seed = 2022
np.random.seed(seed)
tf.random.set_seed(seed)

In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x, y, stratify=y, random_state=seed
) 

### 1. Pipeline: TfidVectorizer + LogisticRegression
- 모델 생성/학습/평가

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [50]:
pipeline = Pipeline([ 
    ('tvect', TfidfVectorizer(stop_words='english', ngram_range=(1,2))),
    ('lr', LogisticRegression(random_state=seed))
])

In [51]:
%time pipeline.fit(X_train, y_train)

CPU times: user 429 ms, sys: 200 ms, total: 628 ms
Wall time: 594 ms


Pipeline(steps=[('tvect',
                 TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('lr', LogisticRegression(random_state=2022))])

In [52]:
pipeline.score(X_test, y_test)

0.9497293116782676

- 최적의 parameter

In [53]:
from sklearn.model_selection import GridSearchCV
params = {
    'tvect__max_df': [100,500],
    'lr__C': [1, 10]
}

In [54]:
grid_pipe = GridSearchCV(
    pipeline, param_grid=params, scoring='accuracy', cv=3, n_jobs=-1
)
%time grid_pipe.fit(X_train, y_train)

CPU times: user 719 ms, sys: 511 ms, total: 1.23 s
Wall time: 4.93 s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tvect',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('lr',
                                        LogisticRegression(random_state=2022))]),
             n_jobs=-1,
             param_grid={'lr__C': [1, 10], 'tvect__max_df': [100, 500]},
             scoring='accuracy')

In [55]:
grid_pipe.best_params_

{'lr__C': 10, 'tvect__max_df': 500}

In [56]:
grid_pipe.best_estimator_.score(X_test, y_test)

0.9767981438515081

### 2. CountVectorizer + NaiveBayes
- 모델 생성/학습/평가

In [61]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [62]:
from sklearn.feature_extraction.text import CountVectorizer

cvect = CountVectorizer(min_df=1, max_df=0.9, ngram_range=(1,2))
cvect.fit(X_train)
X_train_cv = cvect.transform(X_train)
X_test_cv = cvect.transform(X_test)

In [63]:
nb.fit(X_train_cv, y_train)
nb.score(X_test_cv, y_test)

0.9853054911059551