# Agglomerative clustering

In [None]:
import multiprocessing as mp
import pandas as pd
import numpy as np

import spacy

from cytoolz import *

%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
from scipy.cluster import hierarchy
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import *
from sklearn.preprocessing import *
from sklearn.cluster import *
from sklearn.decomposition import *
from sklearn.manifold import *
from sklearn.neighbors import *

In [None]:
# Load spaCy's English pipeline with the tagger, parser and NER components
# disabled; the visible cells of this notebook only call `nlp.tokenizer`.
# NOTE(review): the 'en' shortcut name is not available in spaCy v3+, where
# the full package name (e.g. 'en_core_web_sm') is required -- confirm which
# spaCy version this notebook targets.
nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])

## Load data

In [None]:
# Read the wine review data from the CSV file and preview the first rows.
df = pd.read_csv("../input/wine-data/wine_data.csv")
df.head()

Only keep reviews for wine variants with more than 500 reviews

In [None]:
# Look up, for every review, how many reviews its wine variant has, and keep
# only the rows whose variant appears more than 500 times.  Rows with a
# missing variant get no count and are therefore dropped as well.
df = df[
    df['wine_variant'].map(df['wine_variant'].value_counts()) > 500
].copy()
df.shape

In [None]:
# Show the dtype of each column of the filtered frame.
df.dtypes

## Tokenization

In [None]:
def tokenize(text):
    """Tokenize `text` with spaCy and return the lowercase form of each token.

    Only `nlp.tokenizer` is run on the text, not the full pipeline.

    NOTE(review): spaCy documents `Token.lower` as the integer hash ID of the
    lowercase form (`Token.lower_` is the string), so this presumably returns
    a list of ints rather than strings -- confirm that this is intended.
    """
    doc = nlp.tokenizer(text)
    return [token.lower for token in doc]

We need to apply the `tokenize` function to every review, and we've got a few options.  We've been using the pandas `apply` method:

````
df['tokens'] = df['review_text'].apply(tokenize)
````

A list comprehension would also work:

````
df['tokens'] = [tokenize(text) for text in df['review_text']]
````

Or the built-in `map` function:

````
df['tokens'] = list(map(tokenize, df['review_text']))
````

Since each review can be tokenized independently of the others, this task can be easily parallelized on a multi-core CPU:

In [None]:
%%time

with mp.Pool(4) as p: 
    df['tokens'] = p.map(tokenize, df['review_text'])

## Clustering

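First we'll construct `D`, the document/term matrix. All of the reviews for a wine variant are combined into a single "document", so `D` has one row per wine variant: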

In [None]:
# Pipeline that counts tokens and then applies TF-IDF weighting.  `identity`
# (presumably from the cytoolz star import) is used as the analyzer, so each
# input is taken as an already-tokenized sequence instead of raw text.
dtm = make_pipeline(CountVectorizer(analyzer=identity),
                    TfidfTransformer())                    

In [None]:
# Group the reviews by wine variant and combine each group's token lists with
# `concat` (presumably cytoolz's, via the star import), so that every wine
# variant becomes one input to the pipeline and one row of D.
D = dtm.fit_transform(df.groupby('wine_variant')['tokens'].apply(concat))
D.shape

Then apply agglomerative clustering:

In [None]:
# Assign each row of D to one of 4 clusters.  D is converted to a dense array
# with `toarray()` before being handed to AgglomerativeClustering.
clusters = AgglomerativeClustering(n_clusters=4).fit_predict(D.toarray())

In [None]:
# Wine-variant names, one per group.  They come from the same
# `groupby('wine_variant')` that was used to build D, so labels[i] should
# name row i of D (and entry i of `clusters`).
labels = df.groupby('wine_variant').size().index

# Visit the cluster ids in sorted order so the listing is deterministic; the
# iteration order of a plain set is not guaranteed.
for cl in sorted(set(clusters)):
    # Boolean mask selects the variants assigned to this cluster.
    print(cl, ', '.join(labels[clusters == cl]))

Draw a dendrogram

In [None]:
# Compute the hierarchical-clustering linkage matrix for the rows of D
# (converted to a dense array) using the 'ward' method.
Z = hierarchy.linkage(D.toarray(), 'ward')

In [None]:
# Tall, narrow figure: with orientation='left' the leaves are stacked
# vertically, one label per line.
plt.figure(figsize=(7,14))
# `labels` is a pandas Index; hand SciPy a plain list instead.  Some SciPy
# releases test `labels` for truthiness, which raises ValueError for an
# Index/array, whereas a list is accepted by all of them.
hierarchy.dendrogram(Z, labels=list(labels), orientation='left', leaf_font_size=10)
plt.show()

In [None]:
# Two-stage projection of D down to 2-D coordinates for plotting: truncated
# SVD to 100 components, followed by t-SNE to 2 components.
# NOTE(review): the name `reduce` may shadow a `reduce` function brought in by
# the `from cytoolz import *` star import -- confirm nothing later needs it.
# NOTE(review): recent scikit-learn releases require perplexity < n_samples;
# with one row per wine variant, confirm that perplexity=50 is still valid.
reduce = make_pipeline(
    TruncatedSVD(n_components=100),
    TSNE(n_components=2, perplexity=50, learning_rate=50, early_exaggeration=50),
)
xy = reduce.fit_transform(D.toarray())

In [None]:
# Plain scatter plot of the 2-D embedding, one point per row of `xy`.
plt.figure(figsize=(10, 10))
plt.scatter(x=xy[:, 0], y=xy[:, 1])
# The embedding axes carry no meaningful units, so hide the tick marks.
plt.xticks([], [])
plt.yticks([], [])
plt.show()

In [None]:
# Same scatter plot, with faded points and each point annotated with its
# label, centred on the point.
plt.figure(figsize=(10, 10))
plt.scatter(x=xy[:, 0], y=xy[:, 1], alpha=0.25)
for (x, y), t in zip(xy[:, :2], labels):
    plt.text(x, y, t, ha='center', va='center')
# Hide the tick marks on both axes.
plt.xticks([], [])
plt.yticks([], [])
plt.show()

In [None]:
# Labelled scatter plot again, this time keeping the created text objects in
# `text` so they can be post-processed.
plt.figure(figsize=(10, 10))
plt.scatter(x=xy[:, 0], y=xy[:, 1], alpha=0.25)
text = [
    plt.text(*point, name, ha='center', va='center')
    for point, name in zip(xy[:, :2], labels)
]
# Optional (presumably the third-party adjustText package, which is not
# imported in the visible cells): nudge overlapping labels apart.
#adjust_text(text, arrowprops=dict(arrowstyle='-', color='red'))
plt.xticks([], [])
plt.yticks([], [])
plt.show()

## Affinity propagation

In [None]:
# Fit affinity propagation with its default settings.  D is passed directly,
# without the `toarray()` conversion used in the agglomerative cells.
A = AffinityPropagation().fit(D)

In [None]:
# Map every label to the cluster id affinity propagation assigned to it.
clusters = {wine: cluster_id for wine, cluster_id in zip(labels, A.labels_)}

# Print one line per cluster id, in ascending order, listing its members.
for c in sorted(set(clusters.values())):
    members = [k for k, v in clusters.items() if v == c]
    print(c, ':', ', '.join(members))

## Nearest neighbors

In [None]:
# Fit a nearest-neighbour index on the rows of D using Euclidean distance.
neighbors = NearestNeighbors(metric='euclidean').fit(D)

In [None]:
# Plain Python list of the labels, so that `V.index(name)` can map a wine
# variant name to its position and `V[i]` maps a position back to a name.
V = list(labels)
V

In [None]:
# Position of 'Merlot' in V.
V.index('Merlot')

In [None]:
# Row 25 of D.
# NOTE(review): 25 is presumably the value of V.index('Merlot') for the
# original data -- confirm, since the index changes if the data changes.
D[25,:]

In [None]:
# Query the fitted index with row 25 of D; the result is a pair of arrays
# (distances, row indices of the nearest neighbours).
# NOTE(review): 25 is presumably the 'Merlot' row -- confirm.
neighbors.kneighbors(D[25,:])

In [None]:
def get_neighbors(wine, k=10):
    """Return the names of the `k` wine variants nearest to `wine`.

    `wine` is looked up in `V` to find its row of `D`; that row is used to
    query the fitted `neighbors` index, and the resulting row indices are
    mapped back to names through `V`.  `V.index` raises ValueError when
    `wine` is not in `V`.

    NOTE(review): the queried row is itself part of the fitted data, so the
    wine presumably appears in its own result list -- verify whether that is
    wanted.
    """
    query_row = D[V.index(wine)]
    # kneighbors returns (distances, indices); only the indices are needed.
    _, indices = neighbors.kneighbors(query_row, n_neighbors=k)
    nearest = indices[0]
    return [V[i] for i in nearest]

In [None]:
# The 10 (default k) nearest wine variants to 'Grüner Veltliner'.
get_neighbors('Grüner Veltliner')

In [None]:
# The 10 (default k) nearest wine variants to 'Pinot Noir'.
get_neighbors('Pinot Noir')