In [2]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.model_selection import train_test_split
except:
    from sklearn.cross_validation import train_test_split
from sklearn import metrics

from sklearn.pipeline import Pipeline, FeatureUnion, make_union, make_pipeline

In [3]:
df = pd.read_csv("data/smsspamcollection/SMSSpamCollection", sep="\t", 
                 header=None, 
                 names=["target", "text"])

In [4]:
X = df["text"]
y = df["target"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
X_train[:4]

708     Quite late lar... Ard 12 anyway i wun b drivin...
4338                        on a Tuesday night r u 4 real
5029    Go chase after her and run her over while she'...
4921     G says you never answer your texts, confirm/deny
Name: text, dtype: object

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class CapitalDocTransfomer(BaseEstimator, TransformerMixin):
    """
    Transforms the input document to either 1 or 0.
    1 if all words in the document are Capital else  0.
    """
        
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # if all the words in each line of X is A-Z only, then return 1 else retun 0         
        X_new = np.array([line == line.upper() for line in X]).astype(int)
        return X_new.reshape(-1, 1)

In [8]:
# Name the Steps with Pipeline and FeatureUnion

log_reg_model = Pipeline(steps=[
        ("features", FeatureUnion([
                    ("iscap", CapitalDocTransfomer()), 
                    ("count", CountVectorizer())])),
        ("model", LogisticRegression())
        ])

In [9]:
# Since we are not doing any parameter tuning of Features, convert to make_union
log_reg_model = Pipeline(steps=[
        ("features", make_union(CapitalDocTransfomer(), CountVectorizer())),
        ("model", LogisticRegression())
        ])

In [10]:
# Using Make Pipeline and Make Union
log_reg_model = make_pipeline(make_union(CapitalDocTransfomer(), CountVectorizer()),
        LogisticRegression())

In [11]:
# All three of the above ways are the same

In [12]:
log_reg_model.fit(X_train, y_train)
# Accuracy
log_reg_model.score(X_test, y_test)

0.98504784688995217

In [13]:
from sklearn.externals import joblib

In [14]:
joblib.dump(log_reg_model, "models/spam_ham.pkl")

['spam_ham.pkl',
 'spam_ham.pkl_01.npy',
 'spam_ham.pkl_02.npy',
 'spam_ham.pkl_03.npy']

In [15]:
!ls

[34mdata[m[m                                    spam_ham.pkl_02.npy
feature-pipeline-example-spam-ham.ipynb spam_ham.pkl_03.npy
spam_ham.pkl                            spamham.pkl
spam_ham.pkl_01.npy
