# Урок 9. Интеграция. Итоговый проект

### 0. Выбранный датасет: [SMS Spam Collection Dataset](https://www.kaggle.com/uciml/sms-spam-collection-dataset)

In [1]:
import os

import dill
import numpy as np
import pandas as pd
import sklearn.datasets
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score,recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

#### Приведение данных

In [2]:
# Read the raw CSV, keep the text (v2) and label (v1) columns in that order,
# and give them descriptive names.
df_raw = (
    pd.read_csv('./spam.csv', encoding='ISO-8859-1')[['v2', 'v1']]
    .rename(columns={'v2': 'sms', 'v1': 'is_spam'})
)

# Encode the textual label as an integer: spam -> 1, ham -> 0.
mapping = {'spam': 1, 'ham': 0}
df = pd.DataFrame({
    'sms': df_raw['sms'],
    'is_spam': df_raw['is_spam'].map(mapping),
})

df.head(5)

Unnamed: 0,sms,is_spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [3]:
# Class balance of the encoded label column.
df['is_spam'].value_counts()

0    4825
1     747
Name: is_spam, dtype: int64

#### Деление на train/test, сохранение на диск 

In [4]:
# Hold out 33% of the rows for testing; the seed is fixed so the split is reproducible.
# NOTE(review): the whole `df` is passed as the feature frame, so X_train/X_test
# still contain the 'is_spam' label column alongside 'sms'.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['is_spam'],
    test_size=0.33,
    random_state=42,
)

# Write the four parts to disk without the index column: test files first, then train.
for csv_name, part in (
    ('X_test.csv', X_test),
    ('y_test.csv', y_test),
    ('X_train.csv', X_train),
    ('y_train.csv', y_train),
):
    part.to_csv(csv_name, index=None)

In [5]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that pulls one column out of its input by label.

    Parameters
    ----------
    key
        Label used to index the input in ``transform`` (``X[key]``).
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; the transformer is returned as is."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``."""
        column = self.key
        return X[column]
    
class TextImputer(BaseEstimator, TransformerMixin):
    """
    Transformer that fills missing values in one column with a fixed value.

    Parameters
    ----------
    key
        Label of the column whose missing values are filled.
    value
        Replacement passed to ``fillna`` for that column.
    """
    def __init__(self, key, value):
        self.key = key
        self.value = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; the transformer is returned as is."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which column ``key`` has its gaps filled with ``value``."""
        # Work on a copy: assigning into the caller's frame would silently modify
        # their data and triggers SettingWithCopyWarning when X is a slice of
        # another frame (as the output of train_test_split is).
        X = X.copy()
        X[self.key] = X[self.key].fillna(self.value)
        return X

In [6]:
# Column names used in this notebook: the SMS text column and the label column.
features, target = 'sms', 'is_spam'

#### Feature engineering

In [7]:
# Text branch: fill missing messages with '', pick the 'sms' column, then
# vectorize it with TF-IDF (max_df=0.9, min_df=10).
sms = Pipeline(steps=[
    ('imputer', TextImputer('sms', '')),
    ('selector', ColumnSelector(key='sms')),
    ('tfidf', TfidfVectorizer(max_df=0.9, min_df=10)),
])

# Combine all feature branches; currently there is only the text branch.
feats = FeatureUnion(transformer_list=[('sms', sms)])

#### Выбранный классификатор: MultinomialNB

In [8]:
from sklearn.naive_bayes import MultinomialNB

In [9]:
# Full model: feature extraction followed by a multinomial naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('features', feats),
    ('classifier', MultinomialNB(alpha=0.2)),
])
pipeline.fit(X_train, y_train)

# Show the steps of the fitted pipeline.
pipeline.steps

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[self.key] = X[self.key].fillna(self.value)


[('features',
  FeatureUnion(transformer_list=[('sms',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='sms',
                                                               value='')),
                                                  ('selector',
                                                   ColumnSelector(key='sms')),
                                                  ('tfidf',
                                                   TfidfVectorizer(max_df=0.9,
                                                                   min_df=10))]))])),
 ('classifier', MultinomialNB(alpha=0.2))]

#### Сохраняем пайплайн

In [10]:
# Serialize the fitted pipeline to disk with dill.
model_path = 'app/models/pipeline.dill'
# On a fresh checkout the target folder may be missing, in which case open()
# would raise FileNotFoundError; create it first.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
    dill.dump(pipeline, f)

#### Проверка пайплайна

In [11]:
import pandas as pd
from sklearn.metrics import roc_auc_score,roc_curve
import dill
# Adds the key 'ClassType' to dill's private reverse type map, pointing it at `type`.
# NOTE(review): presumably a compatibility shim so that dill.load can resolve
# pickles referring to the Python 2 'ClassType' name — confirm it is still
# needed with the installed dill version, since it relies on a private attribute.
dill._dill._reverse_typemap['ClassType'] = type

In [12]:
# Reload the held-out split from the CSV files instead of reusing in-memory objects.
X_test, y_test = (pd.read_csv(csv_name) for csv_name in ('X_test.csv', 'y_test.csv'))

In [13]:
# Peek at the first three reloaded test rows.
X_test.head(n=3)

Unnamed: 0,sms,is_spam
0,"Funny fact Nobody teaches volcanoes 2 erupt, t...",0
1,I sent my scores to sophas and i had to do sec...,0
2,We know someone who you know that fancies you....,1


In [14]:
# Restore the pipeline from disk.
# NOTE: dill.load can execute arbitrary code, so only load files you created yourself.
with open('app/models/pipeline.dill', 'rb') as model_file:
    pipeline = dill.load(model_file)

# predict_proba returns one column per class; column 1 is kept as the score for label 1.
predictions = pipeline.predict_proba(X_test)
spam_scores = pd.DataFrame({'preds': predictions[:, 1]})
spam_scores.to_csv('test_predictions.csv', index=None)

In [15]:
# ROC AUC of the class-1 scores produced by the reloaded pipeline on the test split.
roc_auc_score(y_true=y_test, y_score=predictions[:, 1])

0.9796411318150448

In [17]:
import pandas as pd
from sklearn.metrics import roc_auc_score,roc_curve
from urllib import request, parse

# Read the held-out split from disk again for the checks that follow.
X_test, y_test = (pd.read_csv(csv_name) for csv_name in ('X_test.csv', 'y_test.csv'))

In [18]:
# First three test rows, restricted to the message text column.
X_test.head(3)[['sms']]

Unnamed: 0,sms
0,"Funny fact Nobody teaches volcanoes 2 erupt, t..."
1,I sent my scores to sophas and i had to do sec...
2,We know someone who you know that fancies you....


In [19]:
import urllib.request
import json      


def get_prediction(x, url='http://0.0.0.0:8180/predict'):
    """Request a spam score for one SMS text from the prediction service.

    The text is sent as a UTF-8 encoded JSON body ``{"sms": x}`` in a POST
    request, and the ``is_spam`` field of the JSON reply is returned.

    Parameters
    ----------
    x : str
        SMS text to score.
    url : str, optional
        Endpoint of the prediction service. Defaults to the address this
        notebook was written against.

    Returns
    -------
    The value stored under ``is_spam`` in the service's JSON response.

    Raises
    ------
    urllib.error.URLError
        If the service cannot be reached or answers with an HTTP error status.
    KeyError
        If the JSON response has no ``is_spam`` field.
    """
    payload = json.dumps({'sms': x}).encode('utf-8')  # request body must be bytes
    req = urllib.request.Request(
        url,
        data=payload,
        headers={'Content-Type': 'application/json; charset=utf-8'},
    )
    # urllib fills in Content-Length for a bytes body, so it is not set by hand.
    # The context manager closes the HTTP response even if decoding the reply fails.
    with urllib.request.urlopen(req) as response:
        return json.loads(response.read())['is_spam']

In [20]:
# Smoke test with a single message; get_prediction sends an HTTP request, so the
# prediction service must be reachable for this cell to succeed.
get_prediction('Your question this week will enter u in our draw 4 cash.')

0.7948225582927236

In [21]:
# Score every test message through the service, one HTTP request per row.
# The function is passed directly: Series.apply has no axis argument, and the
# former stray positional `1` only landed in a parameter that newer pandas deprecates.
predictions = X_test['sms'].apply(get_prediction)

In [22]:
# Display the scores returned by the service (pandas truncates the long Series).
predictions

0       0.312016
1       0.005498
2       0.816170
3       0.003577
4       0.999995
          ...   
1834    0.000263
1835    0.001471
1836    0.013453
1837    0.372728
1838    0.965014
Name: sms, Length: 1839, dtype: float64

In [23]:
# ROC AUC of the scores obtained through the service on the same test split.
roc_auc_score(y_true=y_test, y_score=predictions.values)

0.9796411318150448