**Import Library and Load The Data**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('C:\\project trending\\kdrama.csv')
df.head()

Unnamed: 0,Name,Aired Date,Year of release,Original Network,Aired On,Number of Episodes,Duration,Content Rating,Rating,Synopsis,Genre,Tags,Director,Screenwriter,Cast,Production companies,Rank
0,Move to Heaven,"May 14, 2021",2021,Netflix,Friday,10,52 min.,18+ Restricted (violence & profanity),9.2,Geu Roo is a young autistic man. He works for ...,"Life, Drama, Family","Autism, Uncle-Nephew Relationship, Death, Sava...",Kim Sung Ho,Yoon Ji Ryun,"Lee Je Hoon, Tang Jun Sang, Hong Seung Hee, Ju...","Page One Film, Number Three Pictures",#1
1,Flower of Evil,"Jul 29, 2020 - Sep 23, 2020",2020,tvN,"Wednesday, Thursday",16,1 hr. 10 min.,15+ - Teens 15 or older,9.1,Although Baek Hee Sung is hiding a dark secret...,"Thriller, Romance, Crime, Melodrama","Married Couple, Deception, Suspense, Family Se...","Kim Chul Gyu, Yoon Jong Ho",Yoo Jung Hee,"Lee Joon Gi, Moon Chae Won, Jang Hee Jin, Seo ...",Monster Union,#2
2,Hospital Playlist,"Mar 12, 2020 - May 28, 2020",2020,"Netflix, tvN",Thursday,12,1 hr. 30 min.,15+ - Teens 15 or older,9.1,The stories of people going through their days...,"Friendship, Romance, Life, Medical","Strong Friendship, Multiple Mains, Best Friend...",Shin Won Ho,Lee Woo Jung,"Jo Jung Suk, Yoo Yeon Seok, Jung Kyung Ho, Kim...","Egg Is Coming, CJ ENM",#3
3,Hospital Playlist 2,"Jun 17, 2021 - Sep 16, 2021",2021,"Netflix, tvN",Thursday,12,1 hr. 40 min.,15+ - Teens 15 or older,9.1,Everyday is extraordinary for five doctors and...,"Friendship, Romance, Life, Medical","Workplace, Strong Friendship, Best Friends, Mu...",Shin Won Ho,Lee Woo Jung,"Jo Jung Suk, Yoo Yeon Seok, Jung Kyung Ho, Kim...","Egg Is Coming, CJ ENM",#4
4,My Mister,"Mar 21, 2018 - May 17, 2018",2018,tvN,"Wednesday, Thursday",16,1 hr. 17 min.,15+ - Teens 15 or older,9.1,Park Dong Hoon is a middle-aged engineer who i...,"Psychological, Life, Drama, Family","Age Gap, Nice Male Lead, Strong Female Lead, H...","Kim Won Suk, Kim Sang Woo",Park Hae Young,"Lee Sun Kyun, IU, Park Ho San, Song Sae Byuk, ...",Chorokbaem Media,#5


**Data Preparation**

In [10]:
# Cleaning data
df['Rank'] = df['Rank'].astype(str).str.replace('#', '').astype(int)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Name                  250 non-null    object 
 1   Aired Date            250 non-null    object 
 2   Year of release       250 non-null    int64  
 3   Original Network      250 non-null    object 
 4   Aired On              250 non-null    object 
 5   Number of Episodes    250 non-null    int64  
 6   Duration              250 non-null    object 
 7   Content Rating        245 non-null    object 
 8   Rating                250 non-null    float64
 9   Synopsis              250 non-null    object 
 10  Genre                 250 non-null    object 
 11  Tags                  250 non-null    object 
 12  Director              249 non-null    object 
 13  Screenwriter          249 non-null    object 
 14  Cast                  250 non-null    object 
 15  Production companies  2

In [12]:
df.shape

(250, 17)

In [13]:
df.isnull().sum()

Name                    0
Aired Date              0
Year of release         0
Original Network        0
Aired On                0
Number of Episodes      0
Duration                0
Content Rating          5
Rating                  0
Synopsis                0
Genre                   0
Tags                    0
Director                1
Screenwriter            1
Cast                    0
Production companies    2
Rank                    0
dtype: int64

In [14]:
df.dropna(inplace=True)

In [15]:
# selecting only the relevant columns
selected_feature = ['Name', 'Original Network', 'Synopsis', 'Genre', 'Tags', 'Director', 'Cast']
print(selected_feature)

['Name', 'Original Network', 'Synopsis', 'Genre', 'Tags', 'Director', 'Cast']


In [16]:
# replacing the null values with empty string
for feature in selected_feature:
    df[feature] = df[feature].fillna('')

In [17]:
# combine all seleceted features
combined_features = df['Name'] + ' ' + df['Original Network'] + ' ' + df['Synopsis'] + ' ' + df['Genre'] + ' ' + df['Tags'] + ' ' + df['Director'] + ' ' + df['Cast']

In [18]:
print(combined_features)

0      Move to Heaven Netflix Geu Roo is a young auti...
1      Flower of Evil tvN Although Baek Hee Sung is h...
2      Hospital Playlist Netflix,  tvN  The stories o...
3      Hospital Playlist 2 Netflix,  tvN  Everyday is...
4      My Mister tvN Park Dong Hoon is a middle-aged ...
                             ...                        
245    Live Up to Your Name tvN Heo Im, who is born i...
246    Queen for Seven Days KBS2 This drama is about ...
247    Memory tvN Park Tae Suk is a lawyer who finds ...
248    A Korean Odyssey Netflix, tvN In 2017, Son Oh ...
249    Voice 4: Judgment Hour tvN A serial killer wit...
Length: 242, dtype: object


In [19]:
# converting text data to feature vectors
vectorizer = TfidfVectorizer()

In [20]:
feature_vectors = vectorizer.fit_transform(combined_features)

In [21]:
print(feature_vectors)

  (0, 3589)	0.06477300380338889
  (0, 2596)	0.05922876083126429
  (0, 2204)	0.04147361281555975
  (0, 4464)	0.05922876083126429
  (0, 3862)	0.047161147834646484
  (0, 2222)	0.058811999637883895
  (0, 1881)	0.034319634744705035
  (0, 3559)	0.04564628973053441
  (0, 1947)	0.049261267281165456
  (0, 2221)	0.05771390272715222
  (0, 3946)	0.09391674171700563
  (0, 1950)	0.04425275536078433
  (0, 2181)	0.0712963738278821
  (0, 2331)	0.04585456400012656
  (0, 1933)	0.03787648240685613
  (0, 3866)	0.03590344298022872
  (0, 2258)	0.021601977183709493
  (0, 2811)	0.1009758427932423
  (0, 224)	0.018827318063229785
  (0, 734)	0.1009758427932423
  (0, 865)	0.08184912872038781
  (0, 1400)	0.06838798135401994
  (0, 2339)	0.08502332613478535
  (0, 3695)	0.04564628973053441
  (0, 3960)	0.0712963738278821
  :	:
  (241, 1900)	0.037373780115178036
  (241, 1630)	0.07252451448559483
  (241, 3545)	0.1375294811365035
  (241, 1071)	0.061539653248492206
  (241, 1843)	0.0413469460887678
  (241, 365)	0.0725467211

**Cosine Similarity**

In [22]:
# getting the similarity scores
similarity = cosine_similarity(feature_vectors)
print(similarity)

[[1.         0.08033645 0.05553314 ... 0.11657191 0.06071648 0.05802335]
 [0.08033645 1.         0.05332572 ... 0.10320595 0.12657398 0.12086021]
 [0.05553314 0.05332572 1.         ... 0.0814387  0.09708386 0.07007628]
 ...
 [0.11657191 0.10320595 0.0814387  ... 1.         0.09007262 0.06347431]
 [0.06071648 0.12657398 0.09708386 ... 0.09007262 1.         0.0870322 ]
 [0.05802335 0.12086021 0.07007628 ... 0.06347431 0.0870322  1.        ]]


In [23]:
print(similarity.shape)

(242, 242)


In [42]:
df = pd.DataFrame(similarity)

In [43]:
# write to csv
df.to_csv('C:\\project trending\\cosine_sim.csv', index=False)

**Getting KDRAMA Name**

In [31]:
kdrame_name = input("Enter the name of the movie you want to search: ")

In [32]:
# creating list all the movie names
list_all_kdrama = df['Name'].tolist()
print(list_all_kdrama)

['Move to Heaven', 'Flower of Evil', 'Hospital Playlist', 'Hospital Playlist 2', 'My Mister', 'Reply 1988', 'Weak Hero Class 1', 'Prison Playbook', 'Alchemy of Souls', 'Extraordinary Attorney Woo', 'Mr. Queen', 'Mother', "It's Okay to Not Be Okay", 'Crash Landing on You', 'Vincenzo', 'Navillera', 'Signal', 'Mr. Sunshine', 'Happiness', 'Kingdom: Season 2', 'SKY Castle', 'Tomorrow', 'Healer', 'Stranger', 'Twenty-Five Twenty-One', 'The Red Sleeve', 'Goblin', 'The Uncanny Counter', 'Mouse', 'Kingdom', 'Weightlifting Fairy Kim Bok Joo', 'D.P.', 'The Devil Judge', 'The Penthouse', 'Youth of May', 'Taxi Driver', 'Life on Mars', 'Beyond Evil', 'Racket Boys', 'Hometown Cha-Cha-Cha', 'Six Flying Dragons', 'Our Beloved Summer', 'The Guest', 'Dear My Friends', 'While You Were Sleeping', 'The Penthouse 2', 'Chicago Typewriter', '18 Again', 'Arthdal Chronicles Part 2', 'Arthdal Chronicles Part 3', 'Through the Darkness', 'Dr. Romantic 2', 'Defendant', 'Our Blues', 'Sweet Home', 'Kill Me, Heal Me', '

In [33]:
# finding close matches for kdrama
find_close_matches = difflib.get_close_matches(kdrame_name, list_all_kdrama)
print(find_close_matches)

['Hospital Playlist', 'Hospital Playlist 2']


In [34]:
close_matches = find_close_matches[0]
print(close_matches)

Hospital Playlist


In [35]:
# finding the index of the movie
index_of_kdrama = df[df['Name'] == close_matches]['Rank'].values[0]
print(index_of_kdrama)

3


In [36]:
# getting a list similar movies
similarity_score = list(enumerate(similarity[index_of_kdrama]))
print(similarity_score)

[(0, 0.04099361003328644), (1, 0.044645186646060556), (2, 0.5893710357399935), (3, 1.0000000000000002), (4, 0.08632057583973057), (5, 0.12577683529496356), (6, 0.08443929945835286), (7, 0.08552956119336802), (8, 0.09264530205550607), (9, 0.09219753894729955), (10, 0.060390215558030355), (11, 0.03972266089659452), (12, 0.06379416115242321), (13, 0.07597938770284578), (14, 0.06358927201695953), (15, 0.06207749025995444), (16, 0.044900115474256506), (17, 0.05951739580286145), (18, 0.054713541615726925), (19, 0.07299375805810147), (20, 0.17047024403891875), (21, 0.03332039855980321), (22, 0.07939189515865769), (23, 0.0656514273159145), (24, 0.07919690605353265), (25, 0.05306814827877062), (26, 0.06755210329585805), (27, 0.07038737706580639), (28, 0.02837744681958366), (29, 0.054804794736581834), (30, 0.05594572675834239), (31, 0.04886202271510937), (32, 0.07162122597157591), (33, 0.03568440879015122), (34, 0.079345750494482), (35, 0.04544363994131125), (36, 0.0900397213696742), (37, 0.0737

In [37]:
len(similarity_score)

242

In [38]:
# sorting movies based on similarity score
sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)
print(sorted_similar_movies)

[(3, 1.0000000000000002), (2, 0.5893710357399935), (51, 0.23812345181605749), (157, 0.18506303050562528), (20, 0.17047024403891875), (65, 0.15381708624103704), (110, 0.15113535308194012), (70, 0.14105815799452856), (64, 0.1382865681408987), (68, 0.13779065851174516), (75, 0.13710737755846134), (161, 0.13329862296180597), (234, 0.13221182865836678), (85, 0.13204883744917836), (122, 0.13021712823151202), (180, 0.12972130616205538), (202, 0.12959698457507082), (93, 0.12957458189314652), (43, 0.12609611968210682), (5, 0.12577683529496356), (114, 0.12384135307426664), (220, 0.12280523279054369), (126, 0.12220392866534958), (99, 0.12066634371456704), (153, 0.11765282358324118), (59, 0.11361242435986978), (226, 0.11350419631379967), (58, 0.11261631432651788), (130, 0.11143247110716163), (71, 0.11047738014838887), (172, 0.10963593914594122), (111, 0.10890087012055895), (215, 0.10886740201496017), (101, 0.10436548156344085), (225, 0.10283185026744882), (131, 0.09650970167503609), (235, 0.095242

In [39]:
# printing the name of similarity movies based on index
print("KDrama suggested for you : \n")

i = 1

for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = df[df['Rank'] == index]['Name'].values
    if len(title_from_index) > 0:
        title_from_index = title_from_index[0]
        if (i<=20):
            print(i, ".", title_from_index)
            i += 1
    else:
        continue

KDrama suggested for you : 

1 . Hospital Playlist
2 . Flower of Evil
3 . Through the Darkness
4 . Nine: Nine Times Time Travel
5 . Kingdom: Season 2
6 . Dr. Romantic
7 . One Ordinary Day
8 . Tunnel
9 . Hot Stove League
10 . Strangers from Hell
11 . Little Women
12 . Love All Play
13 . D-Day
14 . My Father is Strange
15 . When the Camellia Blooms
16 . Tale of the Nine-Tailed
17 . The Hymn of Death
18 . Mystic Pop-Up Bar
19 . The Guest
20 . My Mister
