In [54]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
#!pip install rake_nltk

In [56]:
# Load the OMDB-enriched IMDB Top 250 movie table from the working directory.
df = pd.read_csv('IMDB_Top250Engmovies2_OMDB_Detailed.csv')

# Show the available column labels together with how many there are.
df.columns, df.shape[1]

(Index(['Unnamed: 0', 'Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre',
        'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards',
        'Poster', 'Ratings.Source', 'Ratings.Value', 'Metascore', 'imdbRating',
        'imdbVotes', 'imdbID', 'Type', 'tomatoMeter', 'tomatoImage',
        'tomatoRating', 'tomatoReviews', 'tomatoFresh', 'tomatoRotten',
        'tomatoConsensus', 'tomatoUserMeter', 'tomatoUserRating',
        'tomatoUserReviews', 'tomatoURL', 'DVD', 'BoxOffice', 'Production',
        'Website', 'Response'],
       dtype='object'),
 38)

In [57]:
# Preview the first record of the raw frame.
df.head(1)

Unnamed: 0.1,Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,...,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,1,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",...,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True


In [58]:
# Show every field of the first record as a Series (one entry per column).
df.iloc[0]

Unnamed: 0                                                           1
Title                                         The Shawshank Redemption
Year                                                              1994
Rated                                                                R
Released                                                   14 Oct 1994
Runtime                                                        142 min
Genre                                                     Crime, Drama
Director                                                Frank Darabont
Writer               Stephen King (short story "Rita Hayworth and S...
Actors               Tim Robbins, Morgan Freeman, Bob Gunton, Willi...
Plot                 Two imprisoned men bond over a number of years...
Language                                                       English
Country                                                            USA
Awards               Nominated for 7 Oscars. Another 19 wins & 30 n...
Poster

In [59]:
# Keep only the descriptive fields used by the content-based recommender.
# `.copy()` detaches the result from the raw frame, so the column assignments
# in the following cells modify this frame directly instead of a slice of the
# original (which would raise SettingWithCopyWarning).
df = df[['Title','Genre','Director','Actors','Plot']].copy()
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [60]:

# keeping only the first three billed actors and merging first and last name of
# each one into a single lowercase token (e.g. "Tim Robbins" -> "timrobbins"),
# so it's considered as one word and there is no mix up between people sharing
# a first name
df['Actors'] = df['Actors'].map(
    lambda x: [name.lower().replace(' ', '') for name in x.split(',')[:3]]
)

# putting the genres in a list of lowercase words; whitespace around each entry
# is stripped so "Crime, Drama" gives ['crime', 'drama'] and not ['crime', ' drama']
df['Genre'] = df['Genre'].map(
    lambda x: [genre.strip() for genre in x.lower().split(',')]
)

# merging the director's names into one lowercase token, same idea as for actors
df['Director'] = df['Director'].map(lambda x: x.replace(' ', '').lower())

# NOTE: the values are computed column-wise with `.map` rather than by writing
# into the rows yielded by `df.iterrows()`: those rows may be copies, so
# assignments made to them are not guaranteed to reach the DataFrame.

In [61]:
# Inspect the first record again to check the result of the cleaning step.
df.iloc[0]

Title                                The Shawshank Redemption
Genre                                         [crime,  drama]
Director                                        frankdarabont
Actors                 [timrobbins, morganfreeman, bobgunton]
Plot        Two imprisoned men bond over a number of years...
Name: 0, dtype: object

In [62]:
def _extract_key_words(plot):
    """Return the list of RAKE key words found in one plot description."""
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all punctuation characters
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # the dictionary maps each key word to its degree; only the words are kept
    return list(r.get_word_degrees().keys())


# building the new column in one pass; the result is assigned column-wise
# because writing into the rows yielded by `df.iterrows()` is not guaranteed
# to modify the DataFrame
df['Key_words'] = df['Plot'].map(_extract_key_words)

# dropping the Plot column, it is fully represented by Key_words from here on
df = df.drop(columns = ['Plot'])

In [63]:
# Show the key words extracted for the first movie.
df[["Key_words"]].iloc[0]

Key_words    [years, number, common, decency, acts, two, im...
Name: 0, dtype: object

In [64]:
# Use the movie title as the row label; 'Title' stops being a regular column.
df = df.set_index('Title')
df.head()

Unnamed: 0_level_0,Genre,Director,Actors,Key_words
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]","[years, number, common, decency, acts, two, im..."
The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]","[aging, patriarch, organized, crime, dynasty, ..."
The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]","[vito, corleone, portrayed, family, crime, syn..."
The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]","[ability, physical, tests, gotham, mysterious,..."
12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]","[justice, prevent, forcing, miscarriage, colle..."


In [65]:
# every column present at this point feeds the bag of words; the list is taken
# before the target column exists so the target is never joined into itself
feature_columns = [col for col in df.columns if col != 'bag_of_words']


def _row_to_bag_of_words(row):
    """Join all feature values of one movie into a single space-separated string."""
    words = []
    for col in feature_columns:
        value = row[col]
        if isinstance(value, str):
            # a plain string (e.g. the merged director name) is a single token
            words.append(value)
        else:
            # list-valued columns contribute each element; stray whitespace
            # around an element is removed so tokens are separated by one space
            words.extend(word.strip() for word in value)
    return ' '.join(words)


# computed row-wise with `apply` and assigned as a whole column, because writing
# into the rows yielded by `df.iterrows()` is not guaranteed to modify the frame
df['bag_of_words'] = df.apply(_row_to_bag_of_words, axis = 1)

# only the combined text is needed from here on
df = df[['bag_of_words']].copy()

In [66]:
# Preview the combined text that now represents each movie.
df.head()


Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
The Shawshank Redemption,crime drama frankdarabont timrobbins morganfr...
The Godfather,crime drama francisfordcoppola marlonbrando a...
The Godfather: Part II,crime drama francisfordcoppola alpacino rober...
The Dark Knight,action crime drama christophernolan christia...
12 Angry Men,crime drama sidneylumet martinbalsam johnfied...


In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Small three-sentence corpus used only to illustrate how TF-IDF weighting works;
# it is independent of the movie data.
text = [
    "the boy is in love with that girl",
    "everyone should love and care animals",
    "no one should hurt stray animals",
]

# Learn the vocabulary and IDF weights from the corpus and vectorize it in one step.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(text)

In [68]:
# Print the built-in documentation of TfidfVectorizer (reference only).
help(TfidfVectorizer)

Help on class TfidfVectorizer in module sklearn.feature_extraction.text:

class TfidfVectorizer(CountVectorizer)
 |  TfidfVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.float64'>, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
 |  
 |  Convert a collection of raw documents to a matrix of TF-IDF features.
 |  
 |  Equivalent to :class:`CountVectorizer` followed by
 |  :class:`TfidfTransformer`.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ----------
 |  input : {'filename', 'file', 'content'}, default='content'
 |      If 'filename', the sequence passed as an argument to fit is
 |      expected to be a list of filenames that need reading to fetch
 |  

In [69]:
# Show the TF-IDF weights as a dense array: one row per sentence in `text`,
# one column per vocabulary term.
tfidf_matrix.toarray()

array([[0.        , 0.        , 0.36325471, 0.        , 0.        ,
        0.36325471, 0.        , 0.36325471, 0.36325471, 0.27626457,
        0.        , 0.        , 0.        , 0.        , 0.36325471,
        0.36325471, 0.36325471],
       [0.45954803, 0.34949812, 0.        , 0.45954803, 0.45954803,
        0.        , 0.        , 0.        , 0.        , 0.34949812,
        0.        , 0.        , 0.34949812, 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.3349067 , 0.        , 0.        , 0.        ,
        0.        , 0.44036207, 0.        , 0.        , 0.        ,
        0.44036207, 0.44036207, 0.3349067 , 0.44036207, 0.        ,
        0.        , 0.        ]])

In [70]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Use the new method when it is
# available and fall back to the old one on older installations, so the cell
# runs on either version. `.tolist()` keeps the displayed value a plain list
# of strings, as before.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out().tolist()
else:
    feature_names = tfidf.get_feature_names()

# vocabulary terms, vectorizer settings, and the learned IDF weight per term
feature_names, tfidf.get_params(), tfidf.idf_

(['and',
  'animals',
  'boy',
  'care',
  'everyone',
  'girl',
  'hurt',
  'in',
  'is',
  'love',
  'no',
  'one',
  'should',
  'stray',
  'that',
  'the',
  'with'],
 {'analyzer': 'word',
  'binary': False,
  'decode_error': 'strict',
  'dtype': numpy.float64,
  'encoding': 'utf-8',
  'input': 'content',
  'lowercase': True,
  'max_df': 1.0,
  'max_features': None,
  'min_df': 1,
  'ngram_range': (1, 1),
  'norm': 'l2',
  'preprocessor': None,
  'smooth_idf': True,
  'stop_words': None,
  'strip_accents': None,
  'sublinear_tf': False,
  'token_pattern': '(?u)\\b\\w\\w+\\b',
  'tokenizer': None,
  'use_idf': True,
  'vocabulary': None},
 array([1.69314718, 1.28768207, 1.69314718, 1.69314718, 1.69314718,
        1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.28768207,
        1.69314718, 1.69314718, 1.28768207, 1.69314718, 1.69314718,
        1.69314718, 1.69314718]))

In [71]:
# Print the built-in documentation of CountVectorizer (reference only).
help(CountVectorizer)


Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(_VectorizerMixin, sklearn.base.BaseEstimator)
 |  CountVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)
 |  
 |  Convert a collection of text documents to a matrix of token counts
 |  
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.csr_matrix.
 |  
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ------

In [72]:

# instantiating the vectorizer and generating the count matrix: one row per
# movie (in the row order of df), one column per token found in 'bag_of_words'
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# creating a Series for the movie titles so they are associated to an ordered numerical
# list I will use later to match the indexes
# indices = pd.Series(df.index)


In [76]:
# Series mapping a row position (0, 1, 2, ...) to the movie title at that
# position in df; used to translate between titles and rows of the matrices.
indices=pd.Series(df.index)
indices

0      The Shawshank Redemption
1                 The Godfather
2        The Godfather: Part II
3               The Dark Knight
4                  12 Angry Men
                 ...           
245            The Lost Weekend
246               Short Term 12
247             His Girl Friday
248          The Straight Story
249         Slumdog Millionaire
Name: Title, Length: 250, dtype: object

In [77]:
# Show the token counts as a dense array for a quick visual check.
count_matrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [78]:
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.15789474, 0.13764944, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.15789474, 1.        , 0.36706517, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.13764944, 0.36706517, 1.        , ..., 0.04588315, 0.04588315,
        0.04850713],
       ...,
       [0.05263158, 0.05263158, 0.04588315, ..., 1.        , 0.05263158,
        0.05564149],
       [0.05263158, 0.05263158, 0.04588315, ..., 0.05263158, 1.        ,
        0.05564149],
       [0.05564149, 0.05564149, 0.04850713, ..., 0.05564149, 0.05564149,
        1.        ]])

In [79]:
# function that takes in a movie title and returns the movies ranked by similarity to it
def recommendations(title, cosine_sim = cosine_sim, top_n = None):
    """Rank the movies by cosine similarity to the movie called `title`.

    Parameters
    ----------
    title : str
        Exact movie title, as stored in the module-level `indices` Series.
    cosine_sim : array-like, optional
        Square similarity matrix whose row/column order matches `indices`.
        Defaults to the notebook-level `cosine_sim` as it was when this
        cell was executed.
    top_n : int, optional
        If given, only the `top_n` highest-scoring rows are returned. The
        default ``None`` returns every movie, which is what callers that do
        their own `.head(...)` rely on.

    Returns
    -------
    pandas.DataFrame
        Column ``0`` holds the movie titles and column ``1`` the similarity
        scores, sorted from most to least similar. Rows are labelled
        0, 1, 2, ... in ranking order. The queried movie is part of the
        ranking (it is not filtered out).

    Raises
    ------
    IndexError
        If `title` does not occur in `indices`.
    """
    # getting the row position of the movie that matches the title
    matches = indices[indices == title].index
    if len(matches) == 0:
        # same exception type as the bare `.index[0]` lookup, but with a usable message
        raise IndexError(f"title {title!r} not found in `indices`")
    idx = matches[0]

    # creating a Series with the similarity scores in descending order;
    # its labels are the row positions of the compared movies
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    if top_n is not None:
        score_series = score_series.head(top_n)

    # building the result in one go: DataFrame.append was removed in pandas 2.0,
    # and growing a frame row by row is quadratic
    rows = [[indices.iloc[i], score] for i, score in score_series.items()]
    return pd.DataFrame(rows, columns = [0, 1])

In [80]:
# Build the similarity ranking of all movies relative to 'Toy Story'.
content=recommendations('Toy Story')

In [81]:
# Show the ten highest-ranked rows of the ranking.
content.head(10)

Unnamed: 0,0,1
0,Toy Story,1.0
0,Toy Story 3,0.228218
0,Song of the Sea,0.175412
0,Inside Out,0.166091
0,Finding Nemo,0.146385
0,Aladdin,0.143019
0,Zootopia,0.136931
0,Up,0.136931
0,"Monsters, Inc.",0.136931
0,The Grand Budapest Hotel,0.129099


In [78]:
# Write the whole ranking to 'contentrating.csv' in the working directory
# (the row labels are written as the first column, to_csv's default).
content.to_csv('contentrating.csv')