In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.mixture import GaussianMixture as GMM
from sklearn.base import BaseEstimator,TransformerMixin, ClusterMixin ,RegressorMixin
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
# scikit-learn 0.23 removed the vendored sklearn.externals.joblib; prefer the
# standalone package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from tkinter import Tk
from tkinter import filedialog as fd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline
#%matplotlib notebook
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 2) 
from pivottablejs import pivot_ui
import plotly.offline as py
py.init_notebook_mode()
import plotly.graph_objs as go
from plotly import tools
from ipywidgets import *
from ipywidgets import widget


# importing data

In [None]:
def read_file(fileType,encode="ISO-8859-2"):
    """Let the user pick a file in a dialog and load it into a DataFrame.

    Parameters
    ----------
    fileType : str
        "csv" (case-insensitive, surrounding whitespace ignored) reads the
        file with ``pd.read_csv``; any other value reads it with
        ``pd.read_excel``.
    encode : str, default "ISO-8859-2"
        Text encoding passed to ``pd.read_csv``; not used for Excel files.

    Returns
    -------
    pandas.DataFrame
        The loaded table, with the first row of the file used as the header.

    Raises
    ------
    FileNotFoundError
        If the dialog is closed without selecting a file.
    """
    # A Tk root must exist to host the dialog. Hide it, and destroy it once
    # the dialog closes so no orphan window is left behind in the kernel.
    root = Tk()
    root.withdraw()
    try:
        fileName = fd.askopenfilename()
    finally:
        root.destroy()

    # A cancelled dialog yields an empty value; fail with a clear message
    # instead of handing an empty path to pandas.
    if not fileName:
        raise FileNotFoundError("No file was selected.")

    if fileType.strip().lower() == "csv":
        # low_memory=False was previously assigned to an unused local; pass it
        # through so pandas infers each column's dtype from the whole file
        # rather than chunk by chunk (which can yield mixed-type columns).
        return pd.read_csv(fileName, header=0, encoding=encode,
                           low_memory=False)

    return pd.read_excel(fileName, header=0)

# Reviewing data table

In [None]:
# Ask which reader to use, then load the file chosen in the dialog.
# The aim is to inspect the table for mixed column types and NaNs.
filetype = input("Enter excel or CSV: ")
newTable = read_file(filetype.lower())

In [None]:
# Column names used throughout the notebook: `cit` goes on the x-axis and
# `ctmw` on the y-axis of the scatter and regression plots.
ctmw, cit = "Output", "Cit"

In [None]:
# Keep only rows whose `ctmw` value is above 50 and renumber them from 0,
# so that later index-based joins with freshly built frames line up.
above_threshold = newTable[ctmw] > 50
newTable_proc = newTable.loc[above_threshold].reset_index(drop=True)
base_df = newTable_proc

<h3 align="center"> Python Meetup Demo </h3>

<img src="images/foundation-3_small.jpg">
**We're a Houston startup with the goal of advancing cutting-edge data science through high-quality, practical educational training.**


# IoT anomaly detection using Scikit Learn Pipelines

<img align="center" src="images/Good news Everyone .jpg" alt="Drawing" style="width: 400px;"/>
<h3 align="center">Rise package can make your Jupyter slides dynamic</h3>

# Goals:
- Introduce anomaly detection for IoT data and demonstrate a few methods
- Introduce Machine Learning pipelines for production work
- Create a custom transformer/estimator and call it from a scikit-learn pipeline

In [None]:
# Explore the filtered table interactively with pivottablejs.
pivot_ui(base_df)

**Assessing several outlier detection methods from scikit-learn**
```python
# Support Vector Machines
clfsvm = svm.OneClassSVM(kernel="rbf",nu=0.2) 
# Kmeans with 2 clusters
clfKM = KMeans(n_clusters=2)
# KNN one class classification
y_predilof = LocalOutlierFactor(contamination=0.4).fit_predict(xstd)
# Gaussian Mixture Models
y_gmm=(GMM(2, max_iter=1000, random_state=0).fit(xstd)).predict(xstd) 
y_predsvm = (clfsvm.fit(xstd)).predict(xstd)
y_kmeans = clfKM.fit(xstd).predict(xstd)
```

In [None]:
# Two-column feature frame (cit, ctmw) and a standardized copy of it for
# the outlier detectors below.
X = base_df[[cit, ctmw]]
scaler = StandardScaler()
xstd = scaler.fit_transform(X)

In [None]:
# One-class SVM with an RBF kernel, fitted and scored on the same
# standardized data.
clfsvm = svm.OneClassSVM(
    kernel="rbf",
    nu=0.2,
)
y_predsvm = clfsvm.fit(xstd).predict(xstd)

In [None]:
# KMeans starts from randomly chosen centroids. Pin the seed so the cluster
# labels (and therefore the colours in the comparison plots) are identical
# on every Restart & Run All.
clfKM = KMeans(n_clusters=2, random_state=0)
y_kmeans = clfKM.fit(xstd).predict(xstd)

In [None]:
# Local Outlier Factor, configured to treat 40% of the samples as outliers.
lof = LocalOutlierFactor(contamination=0.4)
y_predilof = lof.fit_predict(xstd)

In [None]:
# Two-component Gaussian mixture, fitted and scored on the standardized data.
gmm = GMM(2, max_iter=1000, random_state=0)
y_gmm = gmm.fit(xstd).predict(xstd)

In [None]:
# One label column per detector, joined to the data on the row index.
outlier_df = pd.DataFrame({
    "outliergmm": y_gmm,
    "outlierKmeans": y_kmeans,
    "outliersvm": y_predsvm,
    "outlierlof": y_predilof,
})
baseo_df = base_df.join(outlier_df)

In [None]:
def plotall():
    """Draw one 'Temp vs Power' scatter plot per outlier-detection method.

    Reads the notebook-level ``outlier_df`` (one label column per method),
    ``baseo_df`` (the data joined with those label columns) and the column
    names ``cit`` / ``ctmw``. Points are coloured by the method's labels.
    Returns nothing; each figure is shown as soon as it is drawn.
    """
    for out_type in outlier_df.columns:
        # `height` is the current name of FacetGrid's former `size`
        # argument (renamed in seaborn 0.9, later removed).
        grid = sns.FacetGrid(baseo_df, hue=out_type, height=7)
        grid.map(plt.scatter, cit, ctmw).add_legend()
        grid.set(title='Temp vs Power', xlabel='Temp', ylabel='Power')
        plt.show()

In [None]:
# Render the comparison plots, one per outlier-detection method.
plotall()

# Show the error around the fitted line using a regression plot
### Then demonstrate how to do the same thing manually with OLS

In [None]:
# Scatter of the data with the fitted regression line in red and no
# confidence band. `height` replaces lmplot's former `size` argument
# (renamed in seaborn 0.9, later removed).
sns.lmplot(x=cit, y=ctmw, data=X, height=12,
           scatter_kws={"s": 10},
           line_kws={"color": "red", "linewidth": 4},
           ci=None);

In [None]:
# Residuals of the cit -> ctmw regression, drawn on a 10x10 inch figure.
plt.figure(figsize=(10, 10))
sns.residplot(
    x=cit,
    y=ctmw,
    data=X,
    scatter_kws={"s": 10},
    ax=plt.gca(),
);

```python
xarr=X[cit].values.reshape(-1,1)
yarr=X[ctmw].values.reshape(-1,1)
linreg=Pipeline([(
'std_scaler', StandardScaler()),
('LinReg',LinearRegression(fit_intercept=True))]).fit(X=xarr,y=yarr)            
```

In [None]:
# Regress ctmw on standardized cit; both are passed as column vectors.
xarr = X[cit].values.reshape(-1, 1)
yarr = X[ctmw].values.reshape(-1, 1)
linreg = Pipeline([
    ('std_scaler', StandardScaler()),
    ('LinReg', LinearRegression(fit_intercept=True)),
]).fit(X=xarr, y=yarr)

```python
resid=(X[ctmw].values.reshape(-1,1))-linreg.predict(X[cit].values.reshape(-1,1))
clf = GMM(2, max_iter=500, random_state=23).fit(resid)
y_newgmm=clf.predict(resid)
outlier_new=pd.DataFrame({"outlierresg":y_newgmm})
datanew_df=X.join(outlier_new)
```

In [None]:
# Residuals of the regression (observed minus predicted), then a
# two-component Gaussian mixture fitted on those residuals alone.
observed = X[ctmw].values.reshape(-1, 1)
predicted = linreg.predict(X[cit].values.reshape(-1, 1))
resid = observed - predicted

clf = GMM(2, max_iter=500, random_state=23).fit(resid)
y_newgmm = clf.predict(resid)

# Attach the mixture labels to the data by row index.
outlier_new = pd.DataFrame({"outlierresg": y_newgmm})
datanew_df = X.join(outlier_new);

In [None]:
# Scatter of the data coloured by the residual-based mixture label, with no
# regression line. `height` replaces lmplot's former `size` argument
# (renamed in seaborn 0.9, later removed).
sns.lmplot(x=cit, y=ctmw, data=datanew_df, height=8,
           hue="outlierresg", fit_reg=False,
           scatter_kws={"s": 25});

In [None]:
# Histogram of the residuals in 50 bins, normalized to unit area.
# `density=True` replaces the `normed` argument, which matplotlib deprecated
# in 2.1 and removed in 3.1.
plt.hist(resid, 50, density=True)
plt.xlim(-10, 10)
plt.show();

**Gaussian Mixture Models allow you to fit partial and full normal (i.e. Gaussian) distributions**

## Making custom transformers and adding them to pipelines
- The transformer should work seamlessly with Scikit-Learn pipelines

- Scikit-Learn uses duck typing (not inheritance), so all you need to do is follow the API

- No need to inherit from scikit-learn classes

- Create a class and implement three methods: fit() (returning self), transform(), and fit_transform(). 

- fit_transform() comes for free when you add TransformerMixin as a base class

- Adding BaseEstimator as a base class will allow two extra methods (get_params() and set_params()). Those can be used for hyperparameter tuning

http://scikit-learn.org/stable/developers/contributing.html#rolling-your-own-estimator

``` python
class BinOnResiduals(BaseEstimator, TransformerMixin):
    def __init__(self,param=True): # no *args or **kargs
        self.param = param
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        exog=X[:,0].reshape(-1,1) # creating input for Linear Reg
        endog=X[:,1].reshape(-1,1) # creating the output
        resid=endog - linreg.predict(exog) # setting the residuals
        return np.c_[resid]
```

In [None]:
class BinOnResiduals(BaseEstimator, TransformerMixin):
    """Pipeline step that turns two-column data into regression residuals.

    Column 0 of the input is fed to the regression model and its prediction
    is subtracted from column 1. The model is the notebook-level ``linreg``.
    ``fit`` stores a reference to it on the instance, so a fitted pipeline
    that is pickled and loaded elsewhere does not need ``linreg`` to be
    defined in that session.

    Parameters
    ----------
    param : bool, default True
        Placeholder hyperparameter; it is stored but not used.
    """

    def __init__(self,param=True): # no *args or **kargs
        self.param = param

    def fit(self, X, y=None):
        """Remember the regression model used for the residuals; return self."""
        # Captured here (not looked up at transform time) so the model is
        # part of the fitted object and is saved together with it.
        self.regressor_ = linreg
        return self

    def transform(self, X, y=None):
        """Return an (n_samples, 1) array: column 1 minus the prediction from column 0."""
        # Prefer the model captured by fit(); fall back to the global so that
        # calling transform() on an unfitted instance behaves as before.
        regressor = getattr(self, "regressor_", None)
        if regressor is None:
            regressor = linreg

        X = np.asarray(X)  # accept DataFrames as well as arrays
        exog = X[:, 0].reshape(-1, 1)   # regression input
        endog = X[:, 1].reshape(-1, 1)  # observed output

        # Force a column vector so a 1-D prediction cannot broadcast the
        # subtraction into an (n, n) matrix.
        predicted = np.asarray(regressor.predict(exog)).reshape(-1, 1)
        return endog - predicted

``` python
transf_pipeline=Pipeline([('resid_attrib_adder', BinOnResiduals()),
                          ("GMM_Estimator",GMM(2, max_iter=500, random_state=23))])


```

In [None]:
# Residual transformer followed by a two-component Gaussian mixture.
transf_pipeline = Pipeline(steps=[
    ('resid_attrib_adder', BinOnResiduals()),
    ("GMM_Estimator", GMM(n_components=2, max_iter=500, random_state=23)),
])

``` python
def trans_run(gmm_n_estimators=2):
    transf_pln=Pipeline([('resid_attrib_adder', BinOnResiduals()),
                         ("GMM_Estimator",GMM(n_components=gmm_n_estimators, 
                                          max_iter=500, random_state=23))])    
    y_newg=transf_pln.fit(X.values).predict(X.values)
    datanew_df=X.join(pd.DataFrame({"outlierresg":y_newg}))    
    sns.lmplot(x=datanew_df.columns[0], 
               y=datanew_df.columns[1], data=datanew_df,
               size=10,hue="outlierresg",fit_reg=False,scatter_kws={"s": 25})
    plt.show()    
```

In [None]:
def trans_run(gmm_n_estimators=2):
    """Fit the residual -> Gaussian-mixture pipeline on ``X`` and plot the labels.

    Parameters
    ----------
    gmm_n_estimators : int, default 2
        Number of components for the Gaussian mixture step.

    Reads the notebook-level ``X``. Shows a scatter of its first two columns
    coloured by the predicted mixture component and returns nothing.
    """
    transf_pln = Pipeline([
        ('resid_attrib_adder', BinOnResiduals()),
        ("GMM_Estimator", GMM(n_components=gmm_n_estimators,
                              max_iter=500, random_state=23)),
    ])

    features = X.values
    y_newg = transf_pln.fit(features).predict(features)
    labelled_df = X.join(pd.DataFrame({"outlierresg": y_newg}))

    # `height` replaces lmplot's former `size` argument (renamed in
    # seaborn 0.9, later removed).
    sns.lmplot(x=labelled_df.columns[0],
               y=labelled_df.columns[1], data=labelled_df,
               height=10, hue="outlierresg", fit_reg=False,
               scatter_kws={"s": 25})
    plt.show()
    

In [None]:
# Run once with the default of two mixture components.
trans_run();

In [None]:
# Fit the residual -> GMM pipeline on the full data. Note that this rebinds
# `clf`, which earlier held the mixture model fitted directly on `resid`.
clf=transf_pipeline.fit(X.values)

In [None]:
# Score one new sample: a single row with two features, in the same column
# order as X (cit, ctmw).
clf.predict(np.array([[60, 80.5]]))

In [None]:
# Persist the fitted pipeline to demo.pkl in the working directory.
joblib.dump(clf,'demo.pkl');

In [None]:
# Load the pipeline back from disk. joblib files are pickles, so only load
# files that come from a trusted source.
clf2=joblib.load('demo.pkl')

In [None]:
# Score the same single sample (cit, ctmw) with the reloaded pipeline.
clf2.predict(np.array([[60, 80.5]]))

In [None]:
# Slider from 2 to 12 (step 1) that re-runs trans_run with the chosen
# number of mixture components.
widgets.interact(trans_run,gmm_n_estimators=(2,12,1));

In [None]:
# Residuals produced by the pipeline's transformer step.
resid_step = transf_pipeline.named_steps['resid_attrib_adder']
x = resid_step.fit_transform(X.values)

# Fit one mixture per component count (1..11) and record each model's BIC.
n_estimators = np.arange(1, 12)
clfs = [
    GMM(k, max_iter=1000, random_state=23).fit(x)
    for k in n_estimators
]
bics = [model.bic(x) for model in clfs]

plt.plot(n_estimators, bics, label='BIC')
plt.legend();

**The BIC plot and code above are courtesy of Jake VanderPlas, Python Data Science Handbook**