# Clustering Netflix Titles

<img src='me_hoy_medoid.png' width='30%'>

## Load

In [47]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [48]:
import pandas as pd
import numpy as np

from scipy.spatial.distance import pdist, squareform

# !pip install pyclustering
from pyclustering.cluster.kmedoids import kmedoids

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline


url = "https://raw.githubusercontent.com/AdamSpannbauer/flixable_ml_dsi/master/data/movies_2020_01_23_13_15_04.csv"
movie = pd.read_csv(url)

# Drop rows where genre is na
movie = movie.dropna(subset=["Genre"])

# Proceed with sample of rows to make things run faster for class time
movie = movie.sample(2000, random_state=42)

# Subset down to a small feature set
# fmt: off
drop_columns = ['Poster', 'flixable_url', 'Response', 
                'Awards', 'Rated', 'imdbID', 'DVD', 'Website',
                'BoxOffice', 'Released', 'added_to_netflix',
                'Writer', 'Actors', 'Plot',
                'Metascore', 'Production',
                'totalSeasons', 'Runtime', 'Director',
                'Title', 'Ratings', 'Year', 'imdbRating',
                'imdbVotes']
# fmt: on
movie = movie.drop(columns=drop_columns)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [49]:
movie.head()

Unnamed: 0,Country,Genre,Language,Type,mpaa_rating
3136,Hong Kong,"Action, Comedy","Cantonese, Mandarin",movie,TV-14
1648,Egypt,"Action, Comedy, Drama",Arabic,movie,TV-14
3641,USA,Drama,English,movie,TV-14
4221,India,Comedy,,movie,TV-PG
158,South Korea,"Comedy, Drama, Family",Korean,series,TV-14


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Preprocess

Create a copy of the dataframe to preserve this original structure for cluster analysis later.

In [50]:
og_movie = movie.copy()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Use [`pd.Series.str.get_dummies()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.get_dummies.html) to convert dummy encode `'Genre'`, `'Language'`, and `'Country'`.

In [51]:
genre_dummies = movie["Genre"].str.get_dummies(", ")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [52]:
language_dummies = movie["Language"].str.get_dummies(", ")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [53]:
country_dummies = movie["Country"].str.get_dummies(", ")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Combine all 3 dummy dataframes into a single dataframe

In [54]:
str_dummies = pd.concat((genre_dummies, language_dummies, country_dummies), axis=1)
str_dummies.head()

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Thailand,Tunisia,Turkey,UK,USA,Uganda,Ukraine,United Arab Emirates,Uruguay,Zimbabwe
3136,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1648,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3641,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4221,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
158,0,0,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

* Drop the original `'Genre'`, `'Language'`, and `'Country'` columns from the `movie` dataframe.
* Add the data from `str_dummies` to the `movie` dataframe

In [55]:
movie = movie.drop(columns=["Genre", "Language", "Country"])
movie = pd.concat((movie, str_dummies), axis=1)
movie.head()

Unnamed: 0,Type,mpaa_rating,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,Thailand,Tunisia,Turkey,UK,USA,Uganda,Ukraine,United Arab Emirates,Uruguay,Zimbabwe
3136,movie,TV-14,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1648,movie,TV-14,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3641,movie,TV-14,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4221,movie,TV-PG,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
158,series,TV-14,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Use [`pd.get_dummies()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html) to dummy encode `'Type'` and `'mpaa_rating'`.

In [56]:
movie = pd.get_dummies(movie)
movie.head()

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,mpaa_rating_PG,mpaa_rating_PG-13,mpaa_rating_R,mpaa_rating_TV-14,mpaa_rating_TV-G,mpaa_rating_TV-MA,mpaa_rating_TV-PG,mpaa_rating_TV-Y,mpaa_rating_TV-Y7,mpaa_rating_TV-Y7-FV
3136,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1648,1,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3641,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4221,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
158,0,0,0,0,1,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Calculate distances

* Use `pdist` and `squareform` to calculate the distance between each row
    * What distance metric makes the most sense here?

In [11]:
dist = pdist(movie, metric="dice")
dist_mat = squareform(dist)
dist_mat.shape

(2000, 2000)

<IPython.core.display.Javascript object>

## Cluster with K-medoids

We need to initialize the starting 'medoids' for our clusters.  To do this, `pyclustering` wants us to provide the indices of our starting points.

* Generate `k` random indices from our distance matrix

In [12]:
k = 5

<IPython.core.display.Javascript object>

In [13]:
nrows = dist_mat.shape[0]
init_medoids = np.random.randint(0, 2001, k)
init_medoids

array([ 274,  293, 1546,  413, 1125])

<IPython.core.display.Javascript object>

In [14]:
kmed = kmedoids(
    dist_mat, initial_index_medoids=init_medoids, data_type="distance_matrix"
)

kmed.process()

<pyclustering.cluster.kmedoids.kmedoids at 0x120fa5cd0>

<IPython.core.display.Javascript object>

Use the `.get_medoids()` method to find the index for each cluster center.

In [15]:
medoid_idxs = kmed.get_medoids()
medoid_idxs

[274, 195, 1712, 74, 745]

<IPython.core.display.Javascript object>

Use the `.predict()` method to output the cluster label for each record in a dataset.

In [16]:
labels = kmed.predict(dist_mat)
labels

array([1, 1, 2, ..., 0, 2, 3])

<IPython.core.display.Javascript object>

Put these labels into both the `og_movie` and `movie` dataframes.

In [58]:
og_movie["label"] = labels
movie["label"] = labels

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Explore Clusters

Use the `medoid_idxs` to pull out our cluster centers from `og_movie`.

In [18]:
medoid_idxs

[274, 195, 1712, 74, 745]

<IPython.core.display.Javascript object>

In [25]:
og_movie.iloc[medoid_idxs, :]

Unnamed: 0,Country,Genre,Language,Type,mpaa_rating,label
602,Japan,Drama,Japanese,series,TV-14,0
5463,India,"Comedy, Drama",Hindi,movie,TV-14,1
3699,USA,"Crime, Drama, Thriller",English,movie,TV-MA,2
4633,USA,"Documentary, Comedy",English,movie,TV-MA,3
344,India,"Comedy, Drama, Romance",Hindi,movie,TV-14,4


<IPython.core.display.Javascript object>

Analyze clusters

In [43]:
cluster_3 = og_movie.loc[og_movie["label"] == 3]
for col in cluster_3:
    counts_head = cluster_3[col].value_counts().head()
    display(counts_head)

USA            376
UK              27
Canada          12
USA, Canada      6
UK, USA          6
Name: Country, dtype: int64

Documentary            123
Comedy                 119
Documentary, Comedy     32
Documentary, Short      14
Comedy, Romance         12
Name: Genre, dtype: int64

English             439
English, Spanish      8
Spanish               6
English, French       6
English, German       3
Name: Language, dtype: int64

movie     481
series     37
Name: Type, dtype: int64

TV-MA    243
TV-14     78
TV-PG     67
PG        32
PG-13     31
Name: mpaa_rating, dtype: int64

3    518
Name: label, dtype: int64

<IPython.core.display.Javascript object>

In [44]:
cluster_2 = og_movie.loc[og_movie["label"] == 2]
for col in cluster_2:
    counts_head = cluster_2[col].value_counts().head()
    display(counts_head)

USA        254
UK          30
Canada      24
UK, USA     15
Spain       14
Name: Country, dtype: int64

Drama                     57
Comedy, Drama             21
Crime, Drama, Thriller    17
Horror, Thriller          16
Comedy, Drama, Romance    16
Name: Genre, dtype: int64

English             348
Spanish              19
English, Spanish     16
English, French      10
French                7
Name: Language, dtype: int64

movie     490
series     47
Name: Type, dtype: int64

TV-MA    237
R        155
PG-13     52
TV-14     43
PG        17
Name: mpaa_rating, dtype: int64

2    537
Name: label, dtype: int64

<IPython.core.display.Javascript object>

In [69]:
cluster_2 = movie[movie["label"] == 2]
cluster_3 = movie[movie["label"] == 3]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [70]:
cluster_2.drop(columns=["label"]).sum().sort_values(ascending=False).head(10)

Type_series          334
Drama                159
mpaa_rating_TV-14    154
English              136
mpaa_rating_TV-MA    135
Type_movie            94
Action                77
USA                   76
Animation             70
mpaa_rating_TV-PG     66
dtype: int64

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [67]:
cluster_3.drop(columns=["label"]).sum().sort_values(ascending=False).head(10)

English              495
Type_movie           481
USA                  435
Comedy               279
mpaa_rating_TV-MA    243
Documentary          239
mpaa_rating_TV-14     78
mpaa_rating_TV-PG     67
Family                65
UK                    46
dtype: int64

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>