### SMS Spam 분류

In [2]:
import pandas as pd
url = 'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/10.%20RNN%20Text%20Classification/dataset/spam.csv'

In [3]:
df = pd.read_csv(url, encoding='latin1')
df.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


- 데이터 전처리

In [4]:
# Selection
df = df[['v1', 'v2']]
df.head(3)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [5]:
# 결측치 확인
df.isna().sum()

v1    0
v2    0
dtype: int64

In [6]:
# 중복 데이터 확인
df.shape, df.v2.nunique()

((5572, 2), 5169)

In [8]:
# 중복 데이터 제거 
df.drop_duplicates(subset=['v2'], inplace=True)
df.shape

(5169, 2)

In [9]:
# ['ham', 'spam'] --> [0, 1]
df.v1 = df.v1.replace(['ham', 'spam'], [0, 1])
df.head(3)

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [10]:
# Ham/Spam 분포
df.v1.value_counts()

v1
0    4516
1     653
Name: count, dtype: int64

- 텍스트 전처리

In [12]:
# 구둣점, 숫자 제거
df.v2 = df.v2.str.replace('[^A-Za-z]', ' ', regex=True)

In [13]:
df.v2[0]

'Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   '

- 데이터셋 분리

In [16]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(
    df.v2.values, df.v1.values, stratify=df.v1.values, test_size=0.2, random_state=2023
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4135,), (1034,), (4135,), (1034,))

- Pipeline으로 베스트 파라메터 찾기 
    - CountVectorizer + RandomForestClassifier

In [17]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.pipeline import Pipeline 
from sklearn.ensemble import RandomForestClassifier

In [21]:
from sklearn.model_selection import GridSearchCV 
params = {
    'CVECT__ngram_range': [(1,1), (1,2)],
    'RFC__max_depth': [2,5,8]
}

In [22]:
cvect = CountVectorizer(stop_words='english')
rfc = RandomForestClassifier(random_state=2023)
pipeline = Pipeline([('CVECT', cvect), ('RFC', rfc)])
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
%time grid_pipe.fit(X_train, y_train)

CPU times: total: 7.41 s
Wall time: 7.55 s


In [23]:
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 1), 'RFC__max_depth': 8}

In [32]:
params = {'RFC__max_depth': [120,180,200]}
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
%time grid_pipe.fit(X_train, y_train)
grid_pipe.best_params_

CPU times: total: 20.4 s
Wall time: 21 s


{'RFC__max_depth': 120}

In [33]:
grid_pipe.best_estimator_.score(X_test, y_test)

0.9680851063829787

In [None]:
cvect = CountVectorizer(stop_words='english')
rfc = RandomForestClassifier(random_state=2023)
pipeline = Pipeline([('CVECT', cvect), ('RFC', rfc)])
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
%time grid_pipe.fit(X_train, y_train)

> 다른estimator

In [74]:
from sklearn.linear_model import LogisticRegression 
lrc = LogisticRegression(random_state=2023)
pipeline = Pipeline([('CVECT', cvect), ('LRC', lrc)]) 
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.9709864603481625

In [61]:
# Linear
cvect = CountVectorizer(stop_words='english')
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
params = {
    'CVECT__ngram_range': [(1, 1), (1, 2)],
}
pipeline = Pipeline([('CVECT', cvect), ('LR', lr)])
grid_pipe = GridSearchCV(pipeline, params, scoring='neg_mean_squared_error', cv=3)
grid_pipe.fit(X_train, y_train)

In [62]:
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 2)}

In [63]:
grid_pipe.best_estimator_.score(X_test, y_test)

0.7361763141088834

In [47]:
pipeline.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', LogisticRegression())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': LogisticRegression(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__C': 1.0,
 'classifier__class_weight': None,
 'classifier__dual': False,
 'classifier__fit_intercept': True,
 'classifier__intercept_scaling': 1,
 'classifier__l1_ratio': None,
 'classifier__max_iter': 100,
 'classifier__multi_class': 'auto',
 'classifier__n_jobs': None,
 'classifier__penalty': 'l2',
 'classifier__random_state': None,
 'classifier__solver': 'lbfgs',
 'classifier__tol': 0.0001,
 'classifier__verbose': 0,
 'classifier__warm_start': False}

In [76]:
# Ridge 
from sklearn.linear_model import Ridge
ridge = Ridge(random_state=2023)
pipeline = Pipeline([('CVECT', cvect), ('Ridge', ridge)])
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.7242716281928175

In [53]:
# Ridge 
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import Ridge
ridge = Ridge(random_state=2023)
params = {
    'CVECT__ngram_range': [(1, 1), (1, 2)],
    'ridge__alpha': [0.1, 1.0, 10.0],
    'ridge__random_state': [2023]
}
pipeline = Pipeline([('CVECT', cvect), ('ridge', ridge)])
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
grid_pipe.fit(X_train, y_train)

In [54]:
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 1),
 'ridge__alpha': 0.1,
 'ridge__random_state': 2023}

In [55]:
grid_pipe.best_estimator_.score(X_test, y_test)

0.6180610437145897

In [59]:
# Lasso
cvect = CountVectorizer(stop_words='english')
from sklearn.linear_model import Lasso 
ls = Lasso(random_state=2023)
params = {
    'CVECT__ngram_range': [(1, 1), (1, 2)],
    'Lasso__random_state': [2023]
}
pipeline = Pipeline([('CVECT', cvect), ('Lasso', ls)])
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
grid_pipe.fit(X_train, y_train)
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 1), 'Lasso__random_state': 2023}

In [60]:
grid_pipe.best_estimator_.score(X_test, y_test)

-1.8550224403224291e-06

In [78]:
from sklearn.svm import SVR 
sv = SVR()
pipeline = Pipeline([('CVECT', cvect), ('SVR', sv)])
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.6757701745467213

In [64]:
# SV
cvect = CountVectorizer(stop_words='english')
from sklearn.svm import SVR 
sv = SVR()
params = {
    'CVECT__ngram_range': [(1, 1), (1, 2)],
}
pipeline = Pipeline([('CVECT', cvect), ('sv', sv)])
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
grid_pipe.fit(X_train, y_train)
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 1)}

In [65]:
grid_pipe.best_estimator_.score(X_test, y_test)

0.6757701745467213

In [None]:
# RF
cvect = CountVectorizer(stop_words='english')
from sklearn.ensemble import RandomForestRegressor 
rf = RandomForestRegressor()
params = {
    'CVECT__ngram_range': [(1, 1), (1, 2)],
}
pipeline = Pipeline([('CVECT', cvect), ('RF', rf)])
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
grid_pipe.fit(X_train, y_train)
grid_pipe.best_params_

In [None]:
grid_pipe.best_estimator_.score(X_test, y_test)

In [69]:
# XGB
cvect = CountVectorizer(stop_words='english')
from xgboost import XGBRegressor
xgr = XGBRegressor()
params = {
    'CVECT__ngram_range': [(1, 1), (1, 2)],
}
pipeline = Pipeline([('CVECT', cvect), ('xgr', xgr)])
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
grid_pipe.fit(X_train, y_train)
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 1)}

In [73]:
from sklearn.linear_model import LogisticRegression 
lrc = LogisticRegression(random_state=2023)
pipeline = Pipeline([('CVECT', cvect), ('LRC', lrc)]) 
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.9709864603481625

In [71]:
xgr.get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [70]:
grid_pipe.best_estimator_.score(X_test, y_test)

0.7433677936679532