In [9]:
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
sms = pd.read_table("./sms.tsv", header=None, names=["label", "message"])
sms["label"] = sms.label.map({"ham":0, "spam":1})
X = sms.message
y = sms.label
X_test, X_train, y_test, y_train = train_test_split(X,y, test_size=1000,random_state=123)

In [28]:
pipe_svc = Pipeline([
    ("vectorizing",CountVectorizer()),
    ("standarization",StandardScaler()),
    ("svc",SVC())])

param_grid_svc = [{"vectorizing__max_df":[0.01,0.05],
                  "vectorizing__max_features":[3000],
                  "vectorizing__max_df":[1.0,0.9],
                  "standarization__with_mean":[False],
                  "svc__kernel":['linear'],
                  "svc__C": [1, 10, 100, 1000]},
                 {"vectorizing__max_df":[0.01,0.05],
                  "vectorizing__max_features":[3000],
                  "vectorizing__max_df":[1.0,0.9],
                  "standarization__with_mean":[False],
                  "svc__kernel":['rbf'],
                  "svc__C": [1, 10, 100, 1000],
                  "svc__gamma": [0.001, 0.0001]},
                 {"vectorizing__max_df":[0.01,0.05],
                  "vectorizing__max_features":[3000],
                  "vectorizing__max_df":[1.0,0.9],
                  "standarization__with_mean":[False],
                  "svc__kernel":['poly'],
                  "svc__C": [1, 10, 100, 1000],
                  "svc__degree": [1, 2,3,4,5,8,12,20]},
                 {"vectorizing__max_df":[0.01,0.05],
                  "vectorizing__max_features":[3000],
                  "vectorizing__max_df":[1.0,0.9],
                  "standarization__with_mean":[False],
                  "svc__kernel":['sigmoid'],
                  "svc__C": [1, 10, 100, 1000]}]
            

gs = GridSearchCV(pipe_svc, param_grid_svc, cv = 5)

gs.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorizing', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
   ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'standarization__with_mean': [False], 'svc__kernel': ['linear'], 'svc__C': [1, 10, 100, 1000], 'vectorizing__max_df': [1.0, 0.9], 'vectorizing__max_features': [3000]}, {'standarization__with_mean': [False], 'svc__C': [1, 10, 100, 1000], 'svc__kernel': ['rbf'], 'svc__gamma': [0.001, 0.00...vc__C': [1, 10, 100, 1000], 'vectorizing__max_df': [1.0, 0.9], 'vectorizing__max_features': [3000]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [30]:
print(classification_report(y_pred=gs.best_estimator_.predict(X_test), y_true=y_test))

             precision    recall  f1-score   support

          0       0.97      1.00      0.98      3965
          1       0.97      0.78      0.86       607

avg / total       0.97      0.97      0.97      4572



In [31]:
gs.best_score_

0.973

In [32]:
gs.best_estimator_

Pipeline(memory=None,
     steps=[('vectorizing', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=3000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
   ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [35]:
gs.best_params_

{'standarization__with_mean': False,
 'svc__C': 10,
 'svc__kernel': 'sigmoid',
 'vectorizing__max_df': 1.0,
 'vectorizing__max_features': 3000}

In [36]:
accuracy_score(y_pred=gs.best_estimator_.predict(X_test), y_true=y_test)

0.9674103237095363