# Category Suggestion using Co-occurrence Analysis

Dataset Link: https://www.kaggle.com/datasets/rahulvyasm/netflix-movies-and-tv-shows

In [1]:
!pip install kagglehub
!pip install pandas

Collecting kagglehub
  Using cached kagglehub-0.3.10-py3-none-any.whl.metadata (31 kB)
Collecting pyyaml (from kagglehub)
  Using cached PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting requests (from kagglehub)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm (from kagglehub)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting charset-normalizer<4,>=2 (from requests->kagglehub)
  Using cached charset_normalizer-3.4.1-cp313-cp313-macosx_10_13_universal2.whl.metadata (35 kB)
Collecting idna<4,>=2.5 (from requests->kagglehub)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests->kagglehub)
  Using cached urllib3-2.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests->kagglehub)
  Using cached certifi-2025.1.31-py3-none-any.whl.metadata (2.5 kB)
Using cached kagglehub-0.3.10-py3-none-any.whl (63 kB)
Using cached PyYAML-6.0.2-cp31

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("rahulvyasm/netflix-movies-and-tv-shows")

print("Path to dataset files:", path)

!ls -la $path


  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/ad/.cache/kagglehub/datasets/rahulvyasm/netflix-movies-and-tv-shows/versions/1
total 6904
drwxr-xr-x  3 ad  staff       96 Mar 25 00:25 [1m[36m.[m[m
drwxr-xr-x  3 ad  staff       96 Mar 25 00:25 [1m[36m..[m[m
-rw-r--r--  1 ad  staff  3532881 Mar 25 00:25 netflix_titles.csv


In [3]:
import pandas as pd

In [4]:
csv_path = path + "/netflix_titles.csv"
print("csv path = " + csv_path)
df = pd.read_csv(csv_path, encoding='latin1')
df = df.loc[:, 'show_id':'description']

csv path = /Users/ad/.cache/kagglehub/datasets/rahulvyasm/netflix-movies-and-tv-shows/versions/1/netflix_titles.csv


In [5]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [6]:
print(df.shape)

(8809, 12)


In [7]:
genres_split = df['listed_in'].str.split(',', expand=True)
genres_2d = genres_split.values
genres_2d

array([['Documentaries', None, None],
       ['International TV Shows', ' TV Dramas', ' TV Mysteries'],
       ['Crime TV Shows', ' International TV Shows',
        ' TV Action & Adventure'],
       ...,
       ['Dramas', ' International Movies', ' Music & Musicals'],
       ['Sci-fi', ' Horror', ' Action'],
       ['Drama', ' Romance', ' Thriller']], shape=(8809, 3), dtype=object)

In [8]:
# clean up
genres_split = genres_split.dropna()
genres_split = pd.DataFrame(genres_split)
genres_split = genres_split.map(lambda x: x.strip() if isinstance(x, str) else x)
genres_list = genres_split.to_numpy().flatten()
all_genres = list(set(genres_list))
all_genres_df = pd.DataFrame(all_genres)
print('# of genres = ' + str(len(all_genres)))
print(genres_split)
print(all_genres)

# of genres = 45
                           0                       1                      2
1     International TV Shows               TV Dramas           TV Mysteries
2             Crime TV Shows  International TV Shows  TV Action & Adventure
4     International TV Shows       Romantic TV Shows            TV Comedies
5                  TV Dramas               TV Horror           TV Mysteries
7                     Dramas      Independent Movies   International Movies
...                      ...                     ...                    ...
8802             Cult Movies                  Dramas              Thrillers
8803                Kids' TV         Korean TV Shows            TV Comedies
8806                  Dramas    International Movies       Music & Musicals
8807                  Sci-fi                  Horror                 Action
8808                   Drama                 Romance               Thriller

[3731 rows x 3 columns]
["Kids' TV", 'Docuseries', 'Sports Movies', 'S

In [9]:
freq_table = all_genres_df.merge(all_genres_df, how='cross')
freq_table.columns = ['genre1', 'genre2']
freq_table['count'] = 0
freq_table

Unnamed: 0,genre1,genre2,count
0,Kids' TV,Kids' TV,0
1,Kids' TV,Docuseries,0
2,Kids' TV,Sports Movies,0
3,Kids' TV,Sci-fi,0
4,Kids' TV,TV Comedies,0
...,...,...,...
2020,TV Thrillers,Action & Adventure,0
2021,TV Thrillers,Classic & Cult TV,0
2022,TV Thrillers,International Movies,0
2023,TV Thrillers,Anime Features,0


In [10]:
# count pairs (slow)
for i, row in genres_split.iterrows():
    cats = row.values
    for j in range(0, len(cats)):
        for k in range(j, len(cats)):
            genre1 = cats[j]
            genre2 = cats[k]
            freq_table.loc[(freq_table['genre1'] == genre1) & (freq_table['genre2'] == genre2), 'count'] += 1
#
# for genre1 in genres_split.values:
#     for genre2 in genres_split.values:
#         print('{},{}'.format(genre1, genre2))
#         # freq_table.loc[(freq_table['genre1'] == genre1) & (freq_table['genre2'] == genre2), 'count'] += 1


freq_table

Unnamed: 0,genre1,genre2,count
0,Kids' TV,Kids' TV,53
1,Kids' TV,Docuseries,0
2,Kids' TV,Sports Movies,0
3,Kids' TV,Sci-fi,0
4,Kids' TV,TV Comedies,28
...,...,...,...
2020,TV Thrillers,Action & Adventure,0
2021,TV Thrillers,Classic & Cult TV,0
2022,TV Thrillers,International Movies,0
2023,TV Thrillers,Anime Features,0


In [11]:
filtered = freq_table[freq_table['genre1'] != freq_table['genre2']]
filtered = filtered[freq_table['count'] > 0]
filtered.sort_values(by='count', ascending=False)

  filtered = filtered[freq_table['count'] > 0]


Unnamed: 0,genre1,genre2,count
1122,Dramas,International Movies,1121
402,Comedies,International Movies,628
1093,Dramas,Independent Movies,488
384,Comedies,Dramas,473
933,International TV Shows,TV Dramas,393
...,...,...,...
1192,Cult Movies,LGBTQ Movies,1
1181,Cult Movies,Documentaries,1
1172,Cult Movies,Sports Movies,1
1154,Anime Series,Spanish-Language TV Shows,1


In [12]:
top_n = 10
given_cat = 'Dramas'
suggestions = filtered[freq_table['genre1'] == given_cat].sort_values(by='count', ascending=False)[:top_n]
suggestions

  suggestions = filtered[freq_table['genre1'] == given_cat].sort_values(by='count', ascending=False)[:top_n]


Unnamed: 0,genre1,genre2,count
1122,Dramas,International Movies,1121
1093,Dramas,Independent Movies,488
1096,Dramas,Romantic Movies,256
1116,Dramas,Thrillers,174
1087,Dramas,Music & Musicals,80
1082,Dramas,Sports Movies,52
1102,Dramas,LGBTQ Movies,36
1119,Dramas,Sci-Fi & Fantasy,36
1089,Dramas,Faith & Spirituality,26
1117,Dramas,Horror Movies,3
