In [90]:
import pandas as pd

In [91]:
sms = pd.read_table("./sms.tsv", header=None, names=["label", "message"])

In [92]:
sms["label"] = sms.label.map({"ham":0, "spam":1})

In [93]:
sms.label.mean()

0.13406317300789664

In [94]:
X = sms.message
y = sms.label

## All three methods of classification (LogisticRegression, DecisionTreeClasifier, NaiveBayes)

In [95]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [96]:
X_test, X_train, y_test, y_train = train_test_split(X,y, test_size=1000,random_state=123)

#### LogisticRegression Optimal

In [97]:
pipe_logreg = Pipeline([
    ("vectorizing",CountVectorizer()),
    ("standarization",StandardScaler()),
    ("logreg",LogisticRegression())])

param_grid_logreg = {"vectorizing__max_df":[0.01,0.05],
              "vectorizing__max_features":[3000],
              "vectorizing__max_df":[1.0,0.9],
              "standarization__with_mean":[False],
              "logreg__penalty":["l1","l2"],
              "logreg__C":[10,1,0.1,0.01]}

gs = GridSearchCV(pipe_logreg, param_grid_logreg, cv = 5)

gs.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorizing', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
   ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'standarization__with_mean': [False], 'logreg__C': [10, 1, 0.1, 0.01], 'logreg__penalty': ['l1', 'l2'], 'vectorizing__max_df': [1.0, 0.9], 'vectorizing__max_features': [3000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [98]:
accuracy_score(gs.best_estimator_.predict(X_test),y_test)

0.9590988626421697

In [99]:
gs.best_params_

{'logreg__C': 1,
 'logreg__penalty': 'l1',
 'standarization__with_mean': False,
 'vectorizing__max_df': 1.0,
 'vectorizing__max_features': 3000}

#### DecisionTreeClasifier Optimal

In [100]:
pipe_tree = Pipeline([
    ("vectorizing",CountVectorizer()),
    ("tree",DecisionTreeClassifier())])
param_grid_tree = {"vectorizing__max_df":[0.01,0.05],
                   "vectorizing__max_features":[3000],
                   "vectorizing__max_df":[1.0,0.9],
                   "tree__max_depth":[3,5,7,10],
                   "tree__min_samples_leaf":[3,5,10,15]}

gs = GridSearchCV(pipe_tree,param_grid_tree, cv = 5)
gs.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorizing', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
   ...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tree__max_depth': [3, 5, 7, 10], 'tree__min_samples_leaf': [3, 5, 10, 15], 'vectorizing__max_df': [1.0, 0.9], 'vectorizing__max_features': [3000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [101]:
accuracy_score(gs.best_estimator_.predict(X_test),y_test)

0.9444444444444444

In [102]:
gs.best_params_

{'tree__max_depth': 7,
 'tree__min_samples_leaf': 5,
 'vectorizing__max_df': 0.9,
 'vectorizing__max_features': 3000}

#### NaiveBayesClassifier

In [106]:
pipe_bayes = Pipeline([
    ("vectorizing",CountVectorizer()),
    ("bayes",MultinomialNB())])

param_grid_bayes = {"vectorizing__max_df":[0.01,0.05],
              "vectorizing__max_features":[3000],
              "vectorizing__max_df":[1.0,0.9]}

gs = GridSearchCV(pipe_bayes, param_grid_bayes, cv = 5)

gs.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorizing', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), pr...zer=None, vocabulary=None)), ('bayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vectorizing__max_df': [1.0, 0.9], 'vectorizing__max_features': [3000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [107]:
accuracy_score(gs.best_estimator_.predict(X_test),y_test)

0.976159230096238

In [109]:
gs.best_params_

{'vectorizing__max_df': 1.0, 'vectorizing__max_features': 3000}

## Automation 

In [130]:
pipes = {
    "logreg": Pipeline([
    ("vectorizing",CountVectorizer()),
    ("standarization",StandardScaler()),
    ("logreg",LogisticRegression())])
    ,
    "tree":Pipeline([
    ("vectorizing",CountVectorizer()),
    ("tree",DecisionTreeClassifier())])
    ,
    "bayes":Pipeline([
    ("vectorizing",CountVectorizer()),
    ("bayes",MultinomialNB())])
   
}

param_grids = {
    "logreg":{"vectorizing__min_df":[0.01,0.05],
              "vectorizing__max_features":[3000],
              "vectorizing__max_df":[1.0,0.9],
              "standarization__with_mean":[False],
              "logreg__penalty":["l1","l2"],
              "logreg__C":[10,1,0.1,0.01]}
    ,
    "tree":{"vectorizing__min_df":[0.01,0.05],
                   "vectorizing__max_features":[3000],
                   "vectorizing__max_df":[1.0,0.9],
                   "tree__max_depth":[3,5,7,10],
                   "tree__min_samples_leaf":[3,5,10,15]}
    ,
    "bayes":{"vectorizing__min_df":[0.01,0.05],
              "vectorizing__max_features":[3000],
              "vectorizing__max_df":[1.0,0.9],
              "bayes__alpha":[3000],
            }
}

In [131]:
steps = [pipes, param_grids]
results = {}

In [132]:
for key in steps[0]:
    pipe = steps[0][key]
    param_grid = steps[1][key]
    gs = GridSearchCV(pipe, param_grid, cv = 5)
    gs.fit(X_train,y_train)
    accuracy = accuracy_score(gs.best_estimator_.predict(X_test),y_test)
    best_params = gs.best_params_
    results[key] = {"accuracy":accuracy,"best_params":best_params}

In [133]:
import json
print(json.dumps(results, indent=4))

{
    "tree": {
        "best_params": {
            "tree__max_depth": 10, 
            "vectorizing__max_features": 3000, 
            "vectorizing__min_df": 0.01, 
            "vectorizing__max_df": 1.0, 
            "tree__min_samples_leaf": 3
        }, 
        "accuracy": 0.941819772528434
    }, 
    "bayes": {
        "best_params": {
            "vectorizing__min_df": 0.01, 
            "vectorizing__max_df": 1.0, 
            "vectorizing__max_features": 3000
        }, 
        "accuracy": 0.9717847769028871
    }, 
    "logreg": {
        "best_params": {
            "standarization__with_mean": false, 
            "vectorizing__min_df": 0.01, 
            "logreg__penalty": "l2", 
            "logreg__C": 0.01, 
            "vectorizing__max_df": 1.0, 
            "vectorizing__max_features": 3000
        }, 
        "accuracy": 0.9698162729658792
    }
}
