In [1]:
import pandas as pd
import numpy as np
data_file=r"SMSSpamCollection.txt"

In [2]:
sd=pd.read_csv(data_file,delimiter='\t',header=None,names=['target','message'])

In [3]:
sd.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Here we have text data from the SMS spam collection, a dataset we have already worked with.

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()
stop = set(stopwords.words('english'))
# stop

In [6]:
def split_into_lemmas(message):
    """Turn a raw message into a list of lemmatized, non-stopword tokens.

    The message is lower-cased and tokenized with ``word_tokenize``; tokens
    found in the module-level ``stop`` set are discarded and every remaining
    token is passed through ``lemma.lemmatize``.
    """
    tokens = word_tokenize(message.lower())
    return [lemma.lemmatize(token) for token in tokens if token not in stop]

In [7]:
sd_train,sd_test=train_test_split(sd,test_size=0.2,random_state=2)

In [8]:
tfidf= TfidfVectorizer(analyzer=split_into_lemmas,min_df=20,max_df=3000)

In [9]:
tfidf.fit(sd_train['message'])

TfidfVectorizer(analyzer=<function split_into_lemmas at 0x00000161530657B8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.float64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=3000,
        max_features=None, min_df=20, ngram_range=(1, 1), norm='l2',
        preprocessor=None, smooth_idf=True, stop_words=None,
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [10]:
train_data=tfidf.transform(sd_train['message'])

In [11]:
test_data=tfidf.transform(sd_test['message'])

In [12]:
clf=MultinomialNB()

In [13]:
clf.fit(train_data,sd_train['target'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
clf.predict_proba(test_data[6,:])

array([[0.97932328, 0.02067672]])

In [15]:
clf.classes_

array(['ham', 'spam'], dtype='<U4')

In [16]:
# Positional lookup of the 7th test message (same row scored with predict_proba above).
sd_test['message'].iloc[6]

'ELLO BABE U OK?'

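This is how we used to do it: fit the vectorizer, transform the train and test data, and fit the classifier as separate steps.

Now that we know about pipelines, we will chain these steps into a single pipeline object.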

## With Python pipeline

In [17]:
from sklearn.pipeline import Pipeline

To create a pipeline, we first import `Pipeline` from `sklearn.pipeline`.


In [18]:
# Two-step text pipeline: TF-IDF features built with the custom
# split_into_lemmas analyzer, followed by a multinomial naive Bayes classifier.
# The settings match the stand-alone tfidf / clf objects fitted earlier.
pipe1=Pipeline([
    ('tfidf',TfidfVectorizer(analyzer=split_into_lemmas,min_df=20,max_df=3000)),
    # NOTE(review): the step name is spelled 'classfier'. It is a runtime key of
    # the pipeline, so it is left unchanged here; rename deliberately if desired.
    ('classfier',MultinomialNB())
])


`Pipeline` takes a list of tuples. Each tuple is a `(name, estimator)` pair describing one step, and the steps run in the order in which they are listed.

In [19]:
pipe1.fit(sd_train['message'],sd_train['target'])

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer=<function split_into_lemmas at 0x00000161530657B8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.float64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=3000,
        max_features=None, min_df=20, ngram_range=(1, 1), ...       vocabulary=None)), ('classfier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

We use the pipeline by calling `pipe1.fit` with the training data.

Calling `pipe1.fit` fits the TF-IDF vectorizer on the messages, transforms them, and then fits the classifier on the transformed data.

In [20]:
pipe1.predict_proba(sd_test['message'])

array([[0.95823681, 0.04176319],
       [0.99030404, 0.00969596],
       [0.99119974, 0.00880026],
       ...,
       [0.94020892, 0.05979108],
       [0.97748664, 0.02251336],
       [0.0141158 , 0.9858842 ]])

This pipeline handles only one type of feature: text data.

Now suppose we have a problem with numerical and categorical data.

In that case we cannot directly apply the TF-IDF vectorizer.

# Pipeline with Feature Union

In [23]:
file=r'Existing Base.csv'

bd=pd.read_csv(file)

In [24]:
bd.head()

Unnamed: 0,REF_NO,children,age_band,status,occupation,occupation_partner,home_status,family_income,self_employed,self_employed_partner,...,Investment Tax Saving Bond,Home Loan,Online Purchase Amount,Revenue Grid,gender,region,Investment in Commudity,Investment in Equity,Investment in Derivative,Portfolio Balance
0,1,Zero,51-55,Partner,Manual Worker,Secretarial/Admin,Own Home,"<17,500, >=15,000",No,No,...,19.99,0.0,0.0,1,Female,Wales,74.67,18.66,32.32,89.43
1,2,Zero,55-60,Single/Never Married,Retired,Retired,Own Home,"<27,500, >=25,000",No,No,...,0.0,0.0,0.0,2,Female,North West,20.19,0.0,4.33,22.78
2,3,Zero,26-30,Single/Never Married,Professional,Other,Own Home,"<30,000, >=27,500",Yes,No,...,0.0,3.49,0.0,2,Male,North,98.06,31.07,80.96,171.78
3,5,Zero,18-21,Single/Never Married,Professional,Manual Worker,Own Home,"<15,000, >=12,500",No,No,...,0.0,0.0,0.0,2,Female,West Midlands,4.1,14.15,17.57,-41.7
4,6,Zero,45-50,Partner,Business Manager,Unknown,Own Home,"<30,000, >=27,500",No,No,...,0.0,45.91,25.98,2,Female,Scotland,70.16,55.86,80.44,235.02


In [25]:
bd.nunique()

REF_NO                             10155
children                               5
age_band                              13
status                                 5
occupation                             9
occupation_partner                     9
home_status                            5
family_income                         13
self_employed                          2
self_employed_partner                  2
year_last_moved                       95
TVarea                                14
post_code                          10040
post_area                           2039
Average Credit Card Transaction     1411
Balance Transfer                    2183
Term Deposit                        1419
Life Insurance                      3111
Medical Insurance                   1589
Average A/C Balance                 2223
Personal Loan                       1760
Investment in Mutual Fund           2470
Investment Tax Saving Bond           832
Home Loan                            884
Online Purchase 

In [26]:
bd.dtypes

REF_NO                               int64
children                            object
age_band                            object
status                              object
occupation                          object
occupation_partner                  object
home_status                         object
family_income                       object
self_employed                       object
self_employed_partner               object
year_last_moved                      int64
TVarea                              object
post_code                           object
post_area                           object
Average Credit Card Transaction    float64
Balance Transfer                   float64
Term Deposit                       float64
Life Insurance                     float64
Medical Insurance                  float64
Average A/C Balance                float64
Personal Loan                      float64
Investment in Mutual Fund          float64
Investment Tax Saving Bond         float64
Home Loan  

In [27]:
from sklearn.base import BaseEstimator, TransformerMixin

Here we import two base classes, `BaseEstimator` and `TransformerMixin`, which our custom transformers will inherit from.

In [28]:
class VarTypeSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps columns of given dtypes and drops an ignore list.

    Parameters
    ----------
    vartype : dtype selector passed to ``DataFrame.select_dtypes`` as ``include``
        (e.g. ``['object']`` or ``['int64', 'float64']``).
    ignore_var : list of column labels removed from the selected columns.
    """

    def __init__(self,vartype,ignore_var):
        self.ignore_var=ignore_var
        self.vartype=vartype

    def fit(self,x,y=None):
        # Nothing is learned from the data; the selection depends only on
        # the constructor arguments.
        return self

    def transform(self,X):
        """Return the columns of ``X`` matching ``vartype``, minus ``ignore_var``."""
        typed_columns=X.select_dtypes(include=self.vartype)
        return typed_columns.drop(columns=self.ignore_var)

Here we create a class for selecting columns of a particular variable type.
It lets us pick out the columns that have a given dtype.
It inherits from `BaseEstimator` and `TransformerMixin`.

It needs a little initialization. `vartype` is the list of dtypes to keep; in this notebook we only use two kinds, numeric (`int64`/`float64`) and `object`,
and these are the only two kinds we will have,
so we pass either the numeric dtypes or `object`.

Out of the selected columns, some may need to be ignored.

`ignore_var` lists the columns we want to get rid of.

Any column we want to drop up front can be provided there, for example `REF_NO`.

Next we define what `transform` returns.

`transform` returns the columns of the requested dtypes with the ignored columns dropped.

In [2]:
class get_dummies_PipeLineFriendly(BaseEstimator, TransformerMixin):
    """Pipeline-friendly replacement for ``pd.get_dummies``.

    ``fit`` records, for every column of the training frame, which categories
    receive a dummy column; ``transform`` then creates exactly those dummy
    columns, so train and test data always end up with the same layout.

    Parameters
    ----------
    freq_cutoff : int, default 0
        A category gets a dummy column only if it occurs more than
        ``freq_cutoff`` times in the data passed to ``fit``.

    Attributes
    ----------
    var_cat_dict : dict
        Maps each fitted column label to the index of categories that get a
        dummy column. Empty until ``fit`` is called.
    """

    def __init__(self,freq_cutoff=0):
        self.freq_cutoff=freq_cutoff
        self.var_cat_dict={}

    def fit(self,x,y=None):
        """Learn the categories to encode from the columns of ``x``.

        Returns ``self`` so the call can be chained.
        """
        # Build a fresh mapping on every fit. Mutating the dict created in
        # __init__ would keep columns from a previous fit around, and
        # transform would then fail on columns that no longer exist.
        learned={}
        for col in x.columns:
            counts=x[col].value_counts()
            # value_counts is sorted by descending frequency, so [:-1] drops
            # the least frequent of the categories that pass the cutoff;
            # that category is left without a dummy column.
            learned[col]=counts.index[counts>self.freq_cutoff][:-1]
        self.var_cat_dict=learned
        return self

    def transform(self,x,y=None):
        """Return a copy of ``x`` with each fitted column replaced by 0/1 dummies.

        Dummy columns are named ``<column>_<category>``. Categories not
        recorded during ``fit`` get no column of their own, so rows holding
        them are 0 in every dummy of that column.
        """
        dummy_data=x.copy()
        for col,cats in self.var_cat_dict.items():
            for cat in cats:
                # str() keeps the naming working for non-string categories
                # or column labels, where plain concatenation raises TypeError.
                name=str(col)+'_'+str(cat)
                dummy_data[name]=(dummy_data[col]==cat).astype(int)
            del dummy_data[col]
        return dummy_data

NameError: name 'BaseEstimator' is not defined

Now we create a get-dummies class that builds the dummy variables.

`freq_cutoff` is the frequency a category must exceed to get its own dummy column.

In `fit` we go through the columns one by one

and count how often each category occurs.

Based on these counts, we keep the categories whose frequency is greater than `freq_cutoff`

and then drop the last (least frequent) of those kept categories, which is left without a dummy column. The remaining categories are stored in `var_cat_dict`; all others are not.

In `transform` we go through the keys of `var_cat_dict`.
For each column we take its stored categories,
create one 0/1 dummy column per category named `<column>_<category>`, and then delete the original column.

In [30]:
from sklearn.pipeline import Pipeline,FeatureUnion

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [32]:
bd_train,bd_test=train_test_split(bd,test_size=0.2,random_state=2)

In [33]:
x_train=bd_train.drop('Revenue Grid',axis=1)
x_test=bd_test.drop('Revenue Grid',axis=1)
y_train=bd_train['Revenue Grid']
y_test=bd_test['Revenue Grid']

In [34]:
bd.dtypes

REF_NO                               int64
children                            object
age_band                            object
status                              object
occupation                          object
occupation_partner                  object
home_status                         object
family_income                       object
self_employed                       object
self_employed_partner               object
year_last_moved                      int64
TVarea                              object
post_code                           object
post_area                           object
Average Credit Card Transaction    float64
Balance Transfer                   float64
Term Deposit                       float64
Life Insurance                     float64
Medical Insurance                  float64
Average A/C Balance                float64
Personal Loan                      float64
Investment in Mutual Fund          float64
Investment Tax Saving Bond         float64
Home Loan  

In [35]:
# Categorical branch: select the object-dtype columns (dropping post_code and
# post_area), then expand them into dummy columns for categories that occur
# more than 100 times in the fitted data.
cat_pipe=Pipeline([
    ('cat_var',VarTypeSelector(['object'],ignore_var=['post_code','post_area'])),
    ('dummies',get_dummies_PipeLineFriendly(100))
])

Now we create the pipelines.

The first pipeline is the categorical pipeline.

Its `cat_var` step is a `VarTypeSelector` that selects the `object` columns and ignores `post_code` and `post_area`.

Next come the dummies, where we fix the frequency cutoff at 100.

So it first selects the categorical variables and then passes them to the dummy-variable step, which generates the dummy variables.

In [36]:
# Full model: a FeatureUnion combines the dummy-encoded categorical branch
# with the int64/float64 columns (REF_NO dropped), and the combined features
# are fed to a logistic regression.
pipe2=Pipeline([
    ('features',FeatureUnion([
        ('cat_pipe',cat_pipe),
        ('num_var',VarTypeSelector(['int64','float64'],ignore_var=['REF_NO']))
        
    ])),
    # LogisticRegression is used with its default settings.
    ('clf',LogisticRegression())
])

Then we create the next pipeline.

It applies `FeatureUnion`.

Within the feature union, the categorical pipeline runs first;
it creates the dummy variables for the categorical columns.
Then `num_var` selects the numeric columns.

Once we have these two results, the feature union combines them side by side,

and the combined features are passed to the logistic regression.

In [37]:
pipe2.fit(x_train,y_train)



Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('cat_pipe', Pipeline(memory=None,
     steps=[('cat_var', VarTypeSelector(ignore_var=['post_code', 'post_area'], vartype=['object'])), ('dummies', get_dummies_PipeLineFriendly(freq_cutoff=100))])), ('num_var', VarTypeSelector(ig...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [38]:
pipe2.predict_proba(x_test)

array([[0.00352334, 0.99647666],
       [0.00623159, 0.99376841],
       [0.00768033, 0.99231967],
       ...,
       [0.43372072, 0.56627928],
       [0.17600561, 0.82399439],
       [0.01433096, 0.98566904]])

## Save python objects to use later

In [39]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# prefer the standalone joblib package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [40]:
joblib.dump(pipe1,'my_model_pipeline.pkl')

['my_model_pipeline.pkl']

## Loading models

In [41]:
import pandas as pd
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# prefer the standalone joblib package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()

In [42]:
def split_into_lemmas(message):
    """Lower-case, tokenize, drop stopwords and lemmatize a message.

    Returns the list of lemmatized tokens that are not in ``stop``.

    NOTE(review): this repeats the definition used when the pipeline was
    trained. Presumably it is redefined here because the saved pipeline refers
    to its analyzer by name and needs it to exist before loading. Confirm,
    and keep both definitions in sync.
    """
    message=message.lower()
    words = word_tokenize(message)
    words_sans_stop=[]
    for word in words :
        # skip stopwords; everything else is kept for lemmatization
        if word in stop:continue
        words_sans_stop.append(word)
    return [lemma.lemmatize(word) for word in words_sans_stop]

In [43]:
# Keep only the path: joblib.load accepts a file name and opens and closes the
# file itself, whereas the handle from a bare open() was never closed.
mymodel='my_model_pipeline.pkl'

In [44]:
pipe=joblib.load(mymodel)

In [45]:
my_msg=['I‘m going to try for 2 months ha ha only joking',
        '''Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. 
        Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's''']
my_df=pd.DataFrame({'message':my_msg})

In [46]:
my_df

Unnamed: 0,message
0,I‘m going to try for 2 months ha ha only joking
1,Free entry in 2 a wkly comp to win FA Cup fina...


In [47]:
pipe.predict_proba(my_df['message'])

array([[0.95929988, 0.04070012],
       [0.01743449, 0.98256551]])

In [48]:
pipe.classes_

array(['ham', 'spam'], dtype='<U4')