In [5]:
import pandas as pd 
import time
import numpy as np 
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


# Read the csv file
df = pd.read_csv('../datasets/tv-shows-database.csv')

# Keep only the columns used to build the content features. The explicit
# .copy() makes this an independent frame, so the column assignments done in
# later cells do not operate on a selection of the raw frame (which can
# trigger pandas' SettingWithCopyWarning).
df = df[['Title', 'Actors', 'Writer', 'Genre', 'Plot']].copy()

df.head()

Unnamed: 0,Title,Actors,Writer,Genre,Plot
0,90210,"Shenae Grimes-Beech, Tristan Mack Wilds, AnnaL...","Darren Star, Jeff Judah, Gabe Sachs, Rob Thomas","Comedy, Drama, Romance","A Kansas family relocates to Beverly Hills, wh..."
1,.9-1-1,"Angela Bassett, Peter Krause, Oliver Stark, Ai...","Brad Falchuk, Tim Minear, Ryan Murphy","Action, Drama, Thriller",Explores the high-pressure experiences of the ...
2,13 Reasons Why,"Caleb Pilkenton, Michael Sadler, Dylan Minnett...",Brian Yorkey,"Drama, Mystery","Follows teenager Clay Jensen, in his quest to ..."
3,24 Hours,"Clive Brook, Kay Francis, Miriam Hopkins, Regi...","Louis Weitzenkorn, Louis Bromfield, William C....",Drama,A nightclub singer is carrying on an affair wi...
4,30 Rock,"Tina Fey, Tracy Morgan, Jack McBrayer, Scott A...",Tina Fey,Comedy,"Liz Lemon, head writer of the sketch comedy sh..."


In [6]:
# (rows, columns) of the frame after the column selection
df.shape

(1178, 5)

In [7]:
# NOTE: the columns are assigned directly instead of being modified through
# the rows yielded by df.iterrows(); writing to those row objects is not
# guaranteed to update the DataFrame.

# putting the genres in a list of lower-cased words (split on commas)
df['Genre'] = df['Genre'].map(lambda genres: genres.lower().split(','))

# merging together first and last name for each actor, so it's considered as
# one word and there is no mix up between people sharing a first name
df['Actors'] = df['Actors'].map(
    lambda actors: [name.lower().replace(' ', '') for name in actors.split(',')]
)

# same idea for the writers, kept as one comma-separated string:
# every space is removed and the text is lower-cased
df['Writer'] = df['Writer'].map(lambda writers: writers.replace(' ', '').lower())

In [8]:
# Preview the first rows after the Genre / Actors / Writer normalization
df.head()

Unnamed: 0,Title,Actors,Writer,Genre,Plot
0,90210,"[shenaegrimes-beech, tristanmackwilds, annalyn...","darrenstar,jeffjudah,gabesachs,robthomas","[comedy, drama, romance]","A Kansas family relocates to Beverly Hills, wh..."
1,.9-1-1,"[angelabassett, peterkrause, oliverstark, aish...","bradfalchuk,timminear,ryanmurphy","[action, drama, thriller]",Explores the high-pressure experiences of the ...
2,13 Reasons Why,"[calebpilkenton, michaelsadler, dylanminnette,...",brianyorkey,"[drama, mystery]","Follows teenager Clay Jensen, in his quest to ..."
3,24 Hours,"[clivebrook, kayfrancis, miriamhopkins, regist...","louisweitzenkorn,louisbromfield,williamc.lengl...",[drama],A nightclub singer is carrying on an affair wi...
4,30 Rock,"[tinafey, tracymorgan, jackmcbrayer, scottadsit]",tinafey,[comedy],"Liz Lemon, head writer of the sketch comedy sh..."


In [9]:
def _plot_keywords(plot):
    """Return the list of RAKE key words found in one plot text."""
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all punctuation characters
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # get_word_degrees() gives a dictionary of key words and their scores;
    # only the words themselves are kept
    return list(r.get_word_degrees().keys())


# Build the new column in one assignment. Writing into the rows yielded by
# df.iterrows() is not guaranteed to update the DataFrame, so the column is
# assigned directly instead.
df['Key_words'] = df['Plot'].map(_plot_keywords)

In [10]:
# Use the show title as the row index, then preview the result
df = df.set_index('Title')
df.head()

Unnamed: 0_level_0,Actors,Writer,Genre,Plot,Key_words
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
90210,"[shenaegrimes-beech, tristanmackwilds, annalyn...","darrenstar,jeffjudah,gabesachs,robthomas","[comedy, drama, romance]","A Kansas family relocates to Beverly Hills, wh...","[two, children, adapt, west, beverly, hills, h..."
.9-1-1,"[angelabassett, peterkrause, oliverstark, aish...","bradfalchuk,timminear,ryanmurphy","[action, drama, thriller]",Explores the high-pressure experiences of the ...,"[frightening, thrust, explores, shocking, firs..."
13 Reasons Why,"[calebpilkenton, michaelsadler, dylanminnette,...",brianyorkey,"[drama, mystery]","Follows teenager Clay Jensen, in his quest to ...","[crush, uncover, follows, teenager, clay, jens..."
24 Hours,"[clivebrook, kayfrancis, miriamhopkins, regist...","louisweitzenkorn,louisbromfield,williamc.lengl...",[drama],A nightclub singer is carrying on an affair wi...,"[found, murdered, crime, married, man, carryin..."
30 Rock,"[tinafey, tracymorgan, jackmcbrayer, scottadsit]",tinafey,[comedy],"Liz Lemon, head writer of the sketch comedy sh...","[liz, lemon, sketch, comedy, show, arrogant, n..."


In [11]:
# Columns that contribute tokens to the bag of words. 'Plot' is left out on
# purpose: it is a plain string, so ' '.join() on it would space-separate its
# individual characters instead of adding words; the plot is represented by
# the 'Key_words' column extracted from it.
feature_columns = ['Actors', 'Writer', 'Genre', 'Key_words']


def _as_text(value):
    """Return a string unchanged; space-join any other iterable of tokens."""
    return value if isinstance(value, str) else ' '.join(value)


# One space-separated string per show. The column is assigned directly because
# writing into the rows yielded by df.iterrows() is not guaranteed to update
# the DataFrame.
df['bag_of_words'] = [
    ' '.join(_as_text(value) for value in values)
    for values in df[feature_columns].itertuples(index = False, name = None)
]

# keep only the combined text column (the index is preserved)
df = df[['bag_of_words']]

In [12]:
# Preview the first rows; only the bag_of_words column is left at this point
df.head()

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
90210,shenaegrimes-beech tristanmackwilds annalynnem...
.9-1-1,angelabassett peterkrause oliverstark aishahin...
13 Reasons Why,calebpilkenton michaelsadler dylanminnette chr...
24 Hours,clivebrook kayfrancis miriamhopkins registoome...
30 Rock,tinafey tracymorgan jackmcbrayer scottadsit ti...


In [13]:
# Fit the vectorizer on the bag-of-words text and build the token-count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# Series of titles on a 0..n-1 index: it links a title to its row position,
# which is needed later to look rows up in the similarity matrix
indices = pd.Series(df.index)
indices.head()

0             90210
1            .9-1-1
2    13 Reasons Why
3          24 Hours
4           30 Rock
Name: Title, dtype: object

In [14]:
# Cosine similarity between every pair of rows of the count matrix; with a
# single argument the matrix is compared against itself
cosine_sim = cosine_similarity(count_matrix)
cosine_sim

array([[1.        , 0.12309149, 0.08399211, ..., 0.0949158 , 0.18871284,
        0.08206099],
       [0.12309149, 1.        , 0.04652421, ..., 0.07009996, 0.0836242 ,
        0.09090909],
       [0.08399211, 0.04652421, 1.        , ..., 0.0358748 , 0.04279605,
        0.09304842],
       ...,
       [0.0949158 , 0.07009996, 0.0358748 , ..., 1.        , 0.06448259,
        0.10514995],
       [0.18871284, 0.0836242 , 0.04279605, ..., 0.06448259, 1.        ,
        0.0418121 ],
       [0.08206099, 0.09090909, 0.09304842, ..., 0.10514995, 0.0418121 ,
        1.        ]])

In [15]:

def recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Print and return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in the module-level ``indices`` Series.
    cosine_sim : array-like, optional
        Similarity matrix; row ``i`` is expected to hold the scores of
        ``indices[i]`` against every other title. Defaults to the matrix
        that existed when this function was defined.
    top_n : int, optional
        Number of recommendations to produce (default 10).

    Returns
    -------
    list
        The recommended titles, most similar first.

    Raises
    ------
    IndexError
        If ``title`` is not present in ``indices``.
    """
    # getting the index of the movie that matches the title (first match wins)
    matches = indices[indices == title].index
    if len(matches) == 0:
        raise IndexError('title not found: {!r}'.format(title))
    idx = matches[0]

    # Similarity scores against every other title. The queried title is
    # removed explicitly rather than by skipping the first sorted entry:
    # with tied scores it is not guaranteed to be sorted first.
    score_series = pd.Series(cosine_sim[idx]).drop(idx)

    # A stable sort keeps tied scores in their original row order, so the
    # result is deterministic.
    ranked = score_series.sort_values(ascending = False, kind = 'mergesort')
    top_indexes = list(ranked.index[:top_n])

    # translating row positions back into titles
    recommended_movies = [indices.iloc[i] for i in top_indexes]

    print('Baseado no seu gosto por "{}" você deveria assistir: '.format(title))
    for rank, name in enumerate(recommended_movies, start = 1):
        print('{0}: {1}'.format(rank, name))

    return recommended_movies

In [16]:
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the system clock and can jump.
start_time = time.perf_counter()

print('----Filtragem de Conteúdo----\n')
recommendations('Criminal Minds')

print ('\nTotal Runtime: {:.2f} seconds'.format(time.perf_counter() - start_time))

----Filtragem de Conteúdo----

Baseado no seu gosto por "Criminal Minds" você deveria assistir: 
1: Without a Trace
2: Naked City
3: Law & Order: Special Victims Unit
4: Mindhunter
5: The Following
6: Scream: The TV Series
7: The Sinner
8: Hannibal
9: Numb3rs
10: The Shield

Total Runtime: 0.01 seconds
