In [36]:
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn
import sklearn.datasets
from sklearn.metrics import accuracy_score
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

# The Iris Dataset

Here are some pictures of the iris subspecies that we will be working with!

### Iris Setosa
![alt text](images/irissetosa2.jpg "setosa")
### Iris Versicolor
![alt text](images/versicolor.jpg "versicolor")
### Iris Virginica
![alt text](images/virginica.jpg "virginica")

## About the Iris Dataset

In [None]:
# scikit-learn returns the dataset as a Bunch: the measurements live under
# 'data' and the matching column names under 'feature_names'.
iris = sklearn.datasets.load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df.describe()

In [38]:
# Keep a labelled copy of the data so we can later check how well the
# unsupervised cluster assignments line up with the true species.
iris_labeled_df = iris_df.copy(deep=True)

# 'target' holds the coded species label (0, 1, 2); 'target_names' is the lookup.
iris_labeled_df['species_index'] = iris['target']
iris_names = iris['target_names']

# Build the code -> name lookup directly from the names themselves.  The
# previous `zip(range(4), ...)` only worked because zip() silently truncates
# to the three available names.
iris_name_dict = dict(enumerate(iris_names))
iris_labeled_df['species'] = iris_labeled_df['species_index'].map(iris_name_dict)

In [None]:
# Pairplot draws a scatterplot for every pair of measurements, coloured by
# species; the diagonal shows each measurement's per-species distribution.
cols = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
sns.pairplot(data=iris_labeled_df, vars=cols, hue='species')

## K-Means

In [40]:
from sklearn.cluster import KMeans

In [41]:
# Three clusters (one per iris species); the fixed seed keeps the
# centroid initialisation repeatable between runs.
kmeans = KMeans(random_state=0, n_clusters=3)

In [42]:
# Cluster on the four measurement columns only.  Fitting on the whole frame
# makes this cell non-idempotent: once 'labels' (or any other derived column)
# has been added to iris_df, re-running the cell would feed those cluster ids
# back in as features.
iris_df['labels'] = kmeans.fit_predict(iris_df[iris['feature_names']])


In [None]:
# Inspect iris_df, which now carries the K-Means assignments in the 'labels' column.
iris_df

In [None]:
# Same pairwise view as before, but coloured by the K-Means cluster id.
cols = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
sns.pairplot(data=iris_df, vars=cols, hue='labels')

### Accuracy and Confusion

There are a number of accuracy metrics we can use when we have the "correct" answer to compare to. In this case, we can compare the cluster assignments to the species to see how often the clustering gets the correct answer. The *accuracy* is the ratio of correct assignments to the total number of assignments. The "confusion matrix" is an $N\times N$ matrix that counts the cases for each combination of predicted category and given (true) category.

As a check, the accuracy is the trace (sum of the diagonal values) divided by the total number of cases


In [45]:
from sklearn.metrics import confusion_matrix

In [None]:
# The cluster ids are passed first, so each row is a K-Means cluster and each
# column is a species index.
confusion_matrix(iris_df['labels'], iris_labeled_df['species_index'])

In [47]:
# K-means numbers its clusters arbitrarily, so its ids do not line up with the
# species indices.  Rather than hard-coding a permutation (which silently goes
# stale if the clustering comes out numbered differently), derive the
# cluster -> species assignment that maximises the number of agreeing samples.
from scipy.optimize import linear_sum_assignment

# Rows: cluster ids, columns: species indices (same orientation as the cell above).
_cluster_vs_species = confusion_matrix(iris_df.labels, iris_labeled_df['species_index'])
_cluster_ids, _species_ids = linear_sum_assignment(_cluster_vs_species, maximize=True)
map_cluster_dict = {int(c): int(s) for c, s in zip(_cluster_ids, _species_ids)}


In [None]:
# Better!  After relabelling the clusters with the species indices, the
# matching counts sit on the diagonal.
conf = confusion_matrix(
    iris_df['labels'].map(map_cluster_dict),
    iris_labeled_df['species_index'],
)
conf

In [None]:
# Fraction of samples whose relabelled cluster id equals the species index.
accuracy_score(
    iris_df['labels'].map(map_cluster_dict),
    iris_labeled_df['species_index'],
)

In [None]:
# The same number by hand: diagonal total divided by the grand total.
conf.trace() / conf.sum()

## Hierarchical Clustering

In [51]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

In [52]:
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted agglomerative clustering model.

    Builds a SciPy-style linkage matrix whose columns are the two merged
    children, the merge distance and the number of samples under the new
    node, then hands it to ``dendrogram``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``children_``, ``distances_`` and ``labels_``.
    **kwargs
        Forwarded unchanged to ``dendrogram``.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples below each merge node.  A child index below
    # n_leaves is a single sample; anything else refers to an earlier merge.
    subtree_sizes = np.zeros(merges.shape[0])
    for node, children in enumerate(merges):
        subtree_sizes[node] = sum(
            1 if child < n_leaves else subtree_sizes[child - n_leaves]
            for child in children
        )

    columns = [merges, model.distances_, subtree_sizes]
    linkage_matrix = np.column_stack(columns).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)


In [None]:
# Courtesy of scikit-learn.org
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram

from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import load_iris


X = iris.data

# With n_clusters=None and distance_threshold=0 the full merge tree is
# computed, which is what the dendrogram needs.
model = AgglomerativeClustering(n_clusters=None, distance_threshold=0).fit(X)

plt.title('Hierarchical Clustering Dendrogram')
# Only draw the top three levels of the tree.
plot_dendrogram(model, p=3, truncate_mode='level')
plt.xlabel('Number of points in node (or index of point if no parenthesis).')
plt.show()

In [54]:
# Cut the hierarchy at three clusters and store each sample's cluster id.
newmodel = AgglomerativeClustering(n_clusters=3)
iris_df['heirarchical_labels'] = newmodel.fit(X).labels_


In [None]:
# Pairwise view coloured by the hierarchical cluster id.
cols = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
sns.pairplot(data=iris_df, vars=cols, hue='heirarchical_labels')

In [None]:
# Rows: hierarchical cluster ids (first argument); columns: species indices.
confusion_matrix(iris_df['heirarchical_labels'], iris_labeled_df['species_index'])

In [None]:
# The cluster ids are arbitrary here too (the algorithm never saw the
# species), so relabel them before comparing.  Performance is similar.
h_map_cluster_dict = {1: 0, 0: 1, 2: 2}
confusion_matrix(
    iris_df['heirarchical_labels'].map(h_map_cluster_dict),
    iris_labeled_df['species_index'],
)

In [None]:
# Fraction of samples whose relabelled hierarchical cluster equals the species index.
accuracy_score(
    iris_df['heirarchical_labels'].map(h_map_cluster_dict),
    iris_labeled_df['species_index'],
)

## DBSCAN 

In [None]:
from sklearn import metrics
from sklearn.cluster import DBSCAN

# Run DBSCAN with its default settings on the raw measurements.
db = DBSCAN().fit(X)
labels = db.labels_

# The label -1 marks noise points, so it is not counted as a cluster.
n_noise_ = int(np.sum(labels == -1))
n_clusters_ = len(set(labels) - {-1})

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

DBSCAN doesn't do a great job here, since the measurements of two of the species overlap in feature space. As a result, it finds 2 clusters rather than 3.

# Market Analysis with APRIORI:  Coffee Shop Orders

In [60]:
# %pip install mlxtend
# %pip install networkx
import numpy as np 
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules 

In [None]:
# Load the coffee-shop transactions.
df = pd.read_csv(filepath_or_buffer='data/Coffe Shop Sales.xlsx - MBA_Master.csv')

# Preview the first 10 rows.
df.head(n=10)

In [None]:
# One row per transaction, one column per item, cells holding the summed
# amount; combinations that never occur become 0.
df_pivot = (
    df.pivot_table(
        values='amount',
        index='transaction_number',
        columns='item',
        aggfunc='sum',
    )
    .fillna(0)
)

# Report the dimensions of the pivoted table.
print('Table size: %d rows \u00d7 %d columns' % df_pivot.shape)

In [None]:
# Show, transposed, the transactions in which at least one item amount is above 4.
def is_within_threshold(value):
    """Return whether ``value`` is greater than 4 (element-wise for a Series)."""
    return value > 4

(
    df_pivot[
        df_pivot.select_dtypes(np.number)
                .apply(is_within_threshold, axis=1)
                .any(axis=1)
    ]
    .T
    .style
    .background_gradient(axis=None)
    .format('{:,.0f}')
)

In [None]:
def encode(value):
    """Return 1 when ``value`` is greater than 0, otherwise 0."""
    return 1 if value > 0 else 0

# Collapse the amounts to a 0/1 "was this item in the transaction?" flag.
df_pivot = df_pivot.map(encode)

def determine_style(condition):
    """Return a CSS string per cell: navy/white where ``condition`` is truthy, '' elsewhere."""
    highlighted = 'background: navy; color: white'
    return np.where(condition, highlighted, '')

# Show what the first 10 transactions look like, transposed.
(
    df_pivot.head(10)
            .T
            .style
            .apply(determine_style, axis=None)
)

In [None]:
# Minimum support an itemset needs in order to be kept.
support = 0.01
frequent_items = apriori(
    df_pivot,
    min_support=support,
    use_colnames=True,
)
frequent_items.sort_values(by='support', ascending=False)

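"Lift" is the ratio of the probability of the target response (the consequent) given the antecedent to its probability with no condition. For a rule $A \to B$ with antecedent $A$ and consequent $B$, in conditional-probability terms this is:
$$
L = \frac{P(B \mid A)}{P(B)} = \frac{P(A \cap B)}{P(A)\,P(B)}
$$
High lift suggests that the products tend to be purchased together.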

In [None]:
# Keep every rule whose lift is at least 1.
metric = 'lift'
min_treshold = 1

rules = association_rules(frequent_items, metric=metric, min_threshold=min_treshold)[
    ['antecedents', 'consequents', 'support', 'confidence', 'lift']
]
# NOTE: the former
#   rules.reset_index(drop=True).sort_values('confidence', ascending=False, inplace=True)
# sorted a temporary copy in place and threw it away, so it never changed
# `rules`.  It is removed rather than "fixed": a later cell takes every other
# row of `rules`, which presumably relies on the order in which
# association_rules emits the rules, so `rules` is deliberately left unsorted.

# Show the metrics.
rules.sort_values(['antecedents', 'consequents']) \
     .set_index(['antecedents', 'consequents']) \
     .style \
     .set_table_styles([{'selector': 'th.row_heading', 'props': [('vertical-align', 'text-top')]}]) \
     .background_gradient(axis=0) \
     .format_index(', '.join)

In [None]:
# First we build a network from the association rule data
import networkx as nx

# Each rule becomes a directed edge; node names are the comma-joined item sets.
from_nodes = rules['antecedents'].map(', '.join)
to_nodes = rules['consequents'].map(', '.join)
cxns = zip(from_nodes, to_nodes)

G = nx.MultiDiGraph()
G.add_edges_from(cxns)


In [None]:
# Draw the rule graph; slightly curved edges keep A->B and B->A arrows apart.
plt.figure(num=1, figsize=(8, 8))
nx.draw_networkx(
    G,
    arrows=True,
    node_size=1000,
    font_size=10,
    node_color='tab:green',
    font_color='blue',
    connectionstyle='arc3, rad=0.1',
)


Here, we see that the arrows flow both ways. This shouldn't be surprising, since the data contains purchases that happen at the same time.

In [69]:
# Keep every other rule (rows 0, 2, 4, ...), turn the frozensets into plain
# comma-joined strings, build a combined label and rank by lift.
# NOTE(review): taking every other row presumably assumes each rule is directly
# followed by its reverse in `rules` - confirm against the rule ordering.
rules_even = (
    rules.iloc[::2]
    .copy()
    .assign(
        antecedents=lambda frame: frame['antecedents'].apply(', '.join),
        consequents=lambda frame: frame['consequents'].apply(', '.join),
    )
    .assign(cxns=lambda frame: frame['antecedents'] + ' \u27f7 ' + frame['consequents'])
    .sort_values('lift', ascending=False)
)

In [None]:
# Horizontal bars: one per item pairing, bar length given by the lift.
sns.barplot(data=rules_even, x='lift', y='cxns').set_title('Coffee Shop Product Lift')

Apparently, the sweet tooth wins out!  

Let’s make some predictions!

If a person is placing an order for certain items, what are the best suggestions for additional items based on the association rules we’ve mined?

In [None]:
from IPython.display import display, display_markdown

def suggest_product_for(antecedents, limit=5):
    """Suggest additional items for an order, based on the mined ``rules``.

    Parameters
    ----------
    antecedents : iterable
        Items already in the order.
    limit : int, default 5
        Maximum number of suggestions returned.

    Returns
    -------
    DataFrame
        Rows of ``rules`` (one consequent item per row), ordered by
        descending lift, with at most one row per suggested item.
    """
    basket = frozenset(antecedents)

    # A rule applies when all of its antecedents are already in the basket.
    applicable = rules[rules['antecedents'].map(basket.issuperset)]

    # One row per individual consequent item.
    per_item = applicable.explode('consequents')

    # Drop items the customer already has, rank by lift, keep the best row per item.
    not_in_basket = ~per_item['consequents'].isin(basket)
    ranked = per_item[not_in_basket].sort_values('lift', ascending=False)
    return ranked.drop_duplicates('consequents').head(limit)

# Show the suggestions for a few example orders.
for antecedents in (
    {'Plain Croisant'},
    {'Decaf Coffee'},
    {'Plain Croisant', 'Decaf Coffee'},
):
    display_markdown(f'Suggestions for *{", ".join(antecedents)}*', raw=True)
    display(suggest_product_for(antecedents))