<a href="https://colab.research.google.com/github/tadyoung8/Projects/blob/main/projectweek10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import nltk
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from collections import Counter


# Load the Netflix titles catalogue from a CSV file in the working directory.
df = pd.read_csv('netflix_titles.csv')
# Print the column list with non-null counts and dtypes to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [None]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [None]:
# Keep only the columns used for clustering and drop rows with missing values.
# Chaining `.dropna().copy()` builds an independent DataFrame instead of
# mutating a column slice in place, which is what raised pandas'
# SettingWithCopyWarning (and would raise it again when a new column is
# assigned to `df` later on).
df = df[['type', 'title', 'release_year', 'description']].dropna().copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8807 entries, 0 to 8806
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8807 non-null   object
 1   title         8807 non-null   object
 2   release_year  8807 non-null   int64 
 3   description   8807 non-null   object
dtypes: int64(1), object(3)
memory usage: 344.0+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Display the reduced frame; pandas abbreviates the rendering to the first and last rows.
df

Unnamed: 0,type,title,release_year,description
0,Movie,Dick Johnson Is Dead,2020,"As her father nears the end of his life, filmm..."
1,TV Show,Blood & Water,2021,"After crossing paths at a party, a Cape Town t..."
2,TV Show,Ganglands,2021,To protect his family from a powerful drug lor...
3,TV Show,Jailbirds New Orleans,2021,"Feuds, flirtations and toilet talk go down amo..."
4,TV Show,Kota Factory,2021,In a city of coaching centers known to train I...
...,...,...,...,...
8802,Movie,Zodiac,2007,"A political cartoonist, a crime reporter and a..."
8803,TV Show,Zombie Dumb,2018,"While living alone in a spooky town, a young g..."
8804,Movie,Zombieland,2009,Looking to survive in a world taken over by zo...
8805,Movie,Zoom,2006,"Dragged from civilian life, a former superhero..."


In [None]:
# Make sure the NLTK data used below is present so the notebook survives a
# fresh kernel: 'stopwords' backs the stop-word list, and the punkt models back
# nltk.word_tokenize. NOTE(review): newer NLTK releases look for 'punkt_tab'
# and older ones for 'punkt'; both are requested, and an id unknown to the
# installed version should only produce a message, not an exception - confirm.
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# English stop words as a plain list (later cells extend it with `+`).
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Normalize one text document for bag-of-words vectorization.

    Removes every character that is not an ASCII letter, digit or whitespace,
    lower-cases and trims the text, tokenizes it with ``nltk.word_tokenize``
    and drops the tokens found in the module-level ``stop_words`` list.

    Parameters
    ----------
    doc : str
        Raw text of a single document.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # remove special characters; `flags` must be passed by keyword because the
    # fourth positional parameter of re.sub is `count` - passing the flags
    # there capped the number of replacements at int(re.I | re.A) == 258 and
    # never applied the flags at all.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I | re.A)
    # lower case and trim surrounding whitespace
    doc = doc.lower().strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

# Wrap the per-document normalizer so it can be applied element-wise to a
# whole collection of documents in a single call.
normalize_corpus = np.vectorize(normalize_document)

# Normalize every description; the result is a NumPy array of cleaned strings.
raw_descriptions = df['description'].tolist()
norm_corpus = normalize_corpus(raw_descriptions)

# Sanity check: one normalized document per input row.
len(norm_corpus)

8807

In [None]:
# Extend the stop-word list with a few frequent but uninformative words.
stop_words = [*stop_words, 'one', 'two', 'get']

# Count unigrams and bigrams, keeping terms that occur in at least 10
# documents and in at most 80% of them.
cv = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 2),
    min_df=10,
    max_df=0.8,
)
cv_matrix = cv.fit_transform(norm_corpus)

# (number of documents, vocabulary size)
cv_matrix.shape

(8807, 2685)

In [None]:
# Number of groups to partition the descriptions into.
NUM_CLUSTERS = 4

# Fixed random_state keeps the clustering reproducible across runs.
km = KMeans(
    n_clusters=NUM_CLUSTERS,
    n_init=50,
    max_iter=10000,
    random_state=42,
)
km.fit(cv_matrix)
km

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=10000,
       n_clusters=4, n_init=50, n_jobs=None, precompute_distances='auto',
       random_state=42, tol=0.0001, verbose=0)

In [None]:
# Tally how many documents were assigned to each cluster label.
Counter(km.labels_)

Counter({0: 6765, 1: 468, 2: 933, 3: 641})

In [None]:
# Attach each row's cluster label to the frame as a new column.
df['kmeans_cluster'] = km.labels_

In [None]:

# For every cluster keep the 20 titles with the latest release year:
# sort by cluster then year (both descending) and take the head of each group.
# The trailing deep copy detaches the result from `df`.
netflix_clusters = (
    df.loc[:, ['title', 'kmeans_cluster', 'release_year']]
    .sort_values(by=['kmeans_cluster', 'release_year'], ascending=False)
    .groupby('kmeans_cluster')
    .head(20)
    .copy(deep=True)
)

In [None]:
# Vocabulary terms in column order of the count matrix.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on old installations.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Number of top-weighted terms to report per cluster.
topn_features = 15
# For each centroid, feature indices sorted from highest to lowest weight.
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]

# key features for each cluster
# movies belonging to each cluster
for cluster_num in range(NUM_CLUSTERS):
    top_indices = ordered_centroids[cluster_num, :topn_features]
    key_features = [feature_names[index] for index in top_indices]
    in_cluster = netflix_clusters['kmeans_cluster'] == cluster_num
    netflix = netflix_clusters.loc[in_cluster, 'title'].values.tolist()
    # Clusters are reported 1-based for readability.
    print('CLUSTER #'+str(cluster_num+1))
    print('Key Words:', key_features)
    print('Popular Movies and shows on Netflix:', netflix)
    print('-'*80)

CLUSTER #1
Key Words: ['life', 'family', 'world', 'friends', 'series', 'documentary', 'must', 'school', 'find', 'three', 'woman', 'takes', 'home', 'lives', 'group']
Popular Movies and shows on Netflix: ['Blood & Water', 'Ganglands', 'Kota Factory', 'My Little Pony: A New Generation', 'The Great British Baking Show', 'The Starling', 'Vendetta: Truth, Lies and The Mafia', 'Crime Stories: India Detectives', 'Dear White People', 'Jaguar', 'Monsters Inside: The 24 Faces of Billy Milligan', 'Go! Go! Cory Carson: Chrissy Takes the Wheel', 'Chicago Party Aunt', 'Squid Game', 'The Father Who Moves Mountains', 'The Stronghold', 'Chhota Bheem', 'He-Man and the Masters of the Universe', 'My Heroes Were Cowboys', 'Castle and Castle']
--------------------------------------------------------------------------------
CLUSTER #2
Key Words: ['love', 'life', 'falls', 'young', 'woman', 'friends', 'falls love', 'true', 'fall', 'family', 'new', 'finds', 'fall love', 'romance', 'love life']
Popular Movies and