In [4]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# Prefer the `model_selection` location of train_test_split and fall back to
# the `cross_validation` module only when that import is unavailable.
# Catching ImportError (instead of a bare `except:`) keeps unrelated errors
# such as KeyboardInterrupt from being silently swallowed.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import metrics

from sklearn.pipeline import Pipeline, FeatureUnion

In [21]:
# Read the tab-separated SMS file. It is parsed without a header row, so the
# two columns are named explicitly.
df = pd.read_csv(
    "data/smsspamcollection/SMSSpamCollection",
    sep="\t",
    header=None,
    names=["target", "text"],
)

In [22]:
# Model input is the "text" column; the label is the "target" column.
X, y = df["text"], df["target"]

In [23]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [24]:
import re

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class CapitalDocTransfomer(BaseEstimator, TransformerMixin):
    """
    Flag documents that are written entirely in capital letters.

    Each input document is mapped to 1 when it contains at least one cased
    character and every cased character is uppercase, and to 0 otherwise.
    Documents without any letters (digits only, punctuation only, empty
    strings) are mapped to 0.
    """
    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """
        Build the all-capitals indicator column.

        Parameters
        ----------
        X : iterable of str
            Documents to inspect.
        y : ignored
            Not used by this method.

        Returns
        -------
        numpy.ndarray of int, shape (n_documents, 1)
            1 for documents that are entirely uppercase, 0 otherwise.
        """
        # str.isupper() is used instead of `line == line.upper()` because the
        # comparison is also True for text with no letters ("123", "", "!!!"),
        # which would wrongly be reported as an all-capitals document.
        flags = np.array([line.isupper() for line in X], dtype=int)
        return flags.reshape(-1, 1)

In [26]:
# Quick check of the custom transformer on the first five messages.
cp_transformer = CapitalDocTransfomer()
cp_transformer.fit_transform(X[:5])

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [27]:
# Two-step pipeline: token counts from CountVectorizer feed a
# LogisticRegression classifier.
log_reg_model = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("model", LogisticRegression()),
    ]
)

In [28]:
# Fit on the training split, then report accuracy on the held-out split.
# Pipeline.fit returns the pipeline itself, so the two calls can be chained.
log_reg_model.fit(X_train, y_train).score(X_test, y_test)

0.98504784688995217

In [29]:
import pickle

In [11]:
# Persist the fitted pipeline with the standard-library pickle module.
# A project-relative path is used (the same `models/` directory that the
# joblib dump in this notebook writes to) instead of an absolute path inside
# one user's home directory, so the cell also runs on other machines.
with open('models/spamham.pkl', 'wb') as picklefile:
    pickle.dump(log_reg_model, picklefile)

In [30]:
# `sklearn.externals.joblib` is deprecated and absent from newer scikit-learn
# releases; import the standalone joblib package first and fall back to the
# vendored copy only when it is not installed.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [31]:
# Serialize the fitted pipeline with joblib; the call returns the list of
# files it wrote.
joblib.dump(value=log_reg_model, filename='models/spam_ham.pkl')

['models/spam_ham.pkl',
 'models/spam_ham.pkl_01.npy',
 'models/spam_ham.pkl_02.npy',
 'models/spam_ham.pkl_03.npy']

In [32]:
# Reload the pipeline saved by joblib.dump. Only load model files from a
# trusted source: unpickling can execute arbitrary code.
new_model = joblib.load(filename='models/spam_ham.pkl')

In [34]:
# Smoke-test the reloaded pipeline on one hand-written message.
sample_messages = ["CLAIM YOUR PRIZE"]
new_model.predict(sample_messages)

array(['ham'], dtype=object)