In [2]:
import numpy as np
import os
import datetime
import pandas as pd
import random
from scipy import stats
# from tqdm import tqdm
from matplotlib import pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from sklearn import preprocessing
from sklearn import datasets
from matplotlib.colors import ListedColormap

In [1]:
from sklearn import model_selection
from sklearn import pipeline
from sklearn import feature_extraction
from sklearn import dummy
from sklearn import naive_bayes
from sklearn import calibration

### Naive Bayes
- P(A | B) = ( P(B | A) * P(A) ) / P(B)
- Posterior = Likelihood * prior / marginal probability
- Probability that an observation belongs to class Y given its features X = (likelihood of the observation's feature values given class Y) * (belief in class Y before looking at the data) / (marginal probability of the features)
- Assume likelihood is a certain distribution: gaussian, bernoulli or multinomial, etc
- Assume each feature, and its resulting likelihood, is independent of the other features given the class

### Continuous Features

In [3]:
# Load the iris data set and unpack its feature matrix and class labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

In [4]:
# Train a Gaussian naive Bayes classifier on the iris features.
# fit() returns the estimator, so `clf` and `model` name the same fitted object.
model = clf = naive_bayes.GaussianNB().fit(features, target)

In [5]:
# Classify one hand-made observation and show its predicted class together
# with the per-class probabilities.
observation = [[4, 4, 4, 0.4]]
model.predict(observation), model.predict_proba(observation)

(array([1]), array([[1.34715602e-38, 9.99949727e-01, 5.02727760e-05]]))

The prior is estimated from the data unless one is passed explicitly. The raw predicted probabilities are not calibrated; if they are to be used as probabilities, calibrate them first, for example with an isotonic regression.

P(H | E) = P(E | H) * P(H) / P(E)
- https://dataaspirant.com/naive-bayes-classifier-machine-learning/
- P(H) is the probability of hypothesis H being true. This is known as the prior probability.
- P(E) is the probability of the evidence(regardless of the hypothesis).
- P(E|H) is the probability of the evidence given that hypothesis is true.
- P(H|E) is the probability of the hypothesis given that the evidence is there.


### Discrete & Count Features

In [9]:
# Turn three short documents into a dense bag-of-words count matrix
# (one row per document, one column per vocabulary word).
word_count = feature_extraction.text.CountVectorizer()
text_data = np.array([
    'all the same',
    'same old same old',
    'old is good',
])
bag_of_words = word_count.fit_transform(text_data)
features = bag_of_words.toarray()
features

array([[1, 0, 0, 0, 1, 1],
       [0, 0, 0, 2, 2, 0],
       [0, 1, 1, 1, 0, 0]], dtype=int64)

In [11]:
# Class label for each of the three documents.
target = np.array([0, 0, 1])

# Relative prior weights for class 0 and class 1. On their own they sum to
# 0.75, which is not a probability distribution, so normalise them to sum to 1.
# The 1:2 ratio between the classes is kept, so predictions do not change.
prior_weights = np.array([0.25, 0.50])
class_prior = prior_weights / prior_weights.sum()

# Multinomial naive Bayes suits count features such as word frequencies.
clf = naive_bayes.MultinomialNB(class_prior=class_prior)
model = clf.fit(features, target)

### Binary Features

In [12]:
# Simulate 100 observations with three binary features and a binary target.
# A locally seeded generator keeps the cell reproducible under
# Restart & Run All without touching NumPy's global random state.
rng = np.random.RandomState(0)
features = rng.randint(2, size=(100, 3))
target = rng.randint(2, size=100)

# Relative prior weights for class 0 and class 1. On their own they sum to
# 0.75, which is not a probability distribution, so normalise them to sum to 1
# while keeping the 1:2 ratio between the classes.
prior_weights = np.array([0.25, 0.5])
class_prior = prior_weights / prior_weights.sum()

# Bernoulli naive Bayes models each feature as a binary (0/1) variable.
clf = naive_bayes.BernoulliNB(class_prior=class_prior)
model = clf.fit(features, target)

### Calibrating Probabilities

In [13]:
# Reload iris so this section does not depend on the toy data above.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Wrap Gaussian naive Bayes in a 2-fold cross-validated sigmoid calibrator.
clf = naive_bayes.GaussianNB()
sig_clf = calibration.CalibratedClassifierCV(
    clf,
    cv=2,
    method='sigmoid',
)

# Leave fit() as the last expression so the fitted estimator is displayed.
sig_clf.fit(features, target)

CalibratedClassifierCV(base_estimator=GaussianNB(priors=None, var_smoothing=1e-09),
            cv=2, method='sigmoid')

In [14]:
# A single unseen observation (one row, four features) scored with the
# calibrated classifier; the output has one probability per class.
new_obs = [
    [2.6, 2.6, 2.6, 0.4],
]
sig_clf.predict_proba(X=new_obs)

array([[0.31859969, 0.63663466, 0.04476565]])

# Clustering

In [15]:
from sklearn import cluster
from sklearn import pipeline
from sklearn import feature_extraction

### KMeans
- K clusters at random locations, obs assigned to nearest center
- each center is moved to the average of those observations assigned to it
- repeated until no changes

In [16]:
# Load the iris features and standardize them before clustering.
iris = datasets.load_iris()
features = iris.data
my_scaler = preprocessing.StandardScaler().fit(features)
features_stand = my_scaler.transform(features)

In [17]:
# Partition the standardized iris features into three clusters.
# `n_jobs` is intentionally not passed: KMeans deprecated it in scikit-learn
# 0.23 and removed it in 1.0, where passing it raises a TypeError.
# `n_init=10` pins the number of random restarts to the historical default,
# so the result does not depend on the installed version's default.
clust = cluster.KMeans(n_clusters=3, random_state=0, n_init=10)
model = clust.fit(features_stand)

- assumes clusters are convex shaped (circle / sphere)
- features are equally scaled
- groups are balanced

In [18]:
# View the cluster assignment of each observation
model.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2,
       2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0])

In [19]:
# View the fitted cluster centers (one row per cluster, one column per feature)
model.cluster_centers_

array([[-0.05021989, -0.88337647,  0.34773781,  0.2815273 ],
       [-1.01457897,  0.85326268, -1.30498732, -1.25489349],
       [ 1.13597027,  0.08842168,  0.99615451,  1.01752612]])

### Speed Up KMeans (Mini batches)

In [20]:
# Rebuild the standardized iris feature matrix for the mini-batch example.
iris = datasets.load_iris()
my_scaler = preprocessing.StandardScaler()
features = iris.data
features_stand = my_scaler.fit(features).transform(features)

In [21]:
# Mini-batch k-means: each update uses a batch of 100 observations rather
# than the full data set.
clu = cluster.MiniBatchKMeans(
    n_clusters=3,
    batch_size=100,
    random_state=0,
)
clu.fit(features_stand)
model = clu  # fit() returns the estimator itself

# Cluster assignment of each observation.
model.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2,
       2, 0, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0])

### Meanshift
- bandwidth: radius of the area an observation uses to shift
- cluster_all: when False, orphan observations are given the label -1 instead of being assigned to the closest kernel
- https://www.efavdb.com/mean-shift

In [22]:
# Rebuild the standardized iris feature matrix for the mean-shift example.
iris = datasets.load_iris()
features = iris.data
my_scaler = preprocessing.StandardScaler()
my_scaler.fit(features)
features_stand = my_scaler.transform(features)

In [23]:
# Mean shift with no bandwidth passed, run on all available cores.
# fit() returns the estimator, so `clu` and `model` name the same object.
model = clu = cluster.MeanShift(n_jobs=-1).fit(features_stand)

# Cluster assignment of each observation.
model.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

### DBScan
- pick a point at random; if it has at least a minimum number of close neighbors, it becomes part of a cluster
- repeat the previous step for all of its neighbors, and their neighbors, etc
- once there are no more close neighbors, pick a new random point and start over
- points that are not core but are close to a cluster are assigned to it; otherwise they are labeled as outliers (-1)

- eps: maximum distance to be considered a neighbor
- min_samples: min number of observations to be considered core
- metric: distance metric used for calculating distance

In [25]:
# Rebuild the standardized iris feature matrix for the DBSCAN example.
iris = datasets.load_iris()
features, my_scaler = iris.data, preprocessing.StandardScaler()
features_stand = my_scaler.fit(features).transform(features)

In [26]:
# DBSCAN with default eps / min_samples, run on all available cores.
clu = cluster.DBSCAN(n_jobs=-1)
clu.fit(features_stand)
model = clu  # fit() returns the estimator itself

# Cluster assignment of each observation; -1 marks outliers.
model.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,
       -1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,
        1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1, -1,  1,  1, -1, -1,
       -1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1, -1,  1,  1,  1, -1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1],
      dtype=int64)

### Hierarchical Merging
- all observations start as their own cluster and are then merged if they meet some criteria
- linkage: determines the merging strategy:
    - ward: minimizes the variance of merged clusters
    - average: average distance between observations from pairs of clusters
    - complete: max distance between observations from pairs of clusters
- affinity: determines the distance metric
- n_clusters: number of clusters to attempt to find

In [None]:
# Rebuild the standardized iris feature matrix for the hierarchical example.
iris = datasets.load_iris()
features = iris.data
my_scaler = preprocessing.StandardScaler().fit(features)
features_stand = my_scaler.transform(features)

In [27]:
# Agglomerative (bottom-up) clustering into three groups with the default linkage.
# fit() returns the estimator, so `clu` and `model` name the same object.
model = clu = cluster.AgglomerativeClustering(n_clusters=3).fit(features_stand)

# Cluster assignment of each observation.
model.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0,
       2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 0, 2,
       2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)