In [1]:
from gensim.models import word2vec
import logging
import pandas as pd
import nltk, string
from nltk import sent_tokenize
import requests
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import linear_kernel

omdb_api = "http://www.omdbapi.com/?apikey=9e1c784d&plot=full&t={}"
url_review_all = "https://www.imdb.com/title/{}/reviews?ref_=tt_urv"
id_reviews = "Data_in_csv/id_review2.csv"
movie_info = "Data_in_csv/movie_info.csv"
movies_score = "Data_in_csv/movie_score.csv"
movies_score2 = "Data_in_csv/movie_score2.csv"
key_words = {'plot': ['plot', 'scenario', 'script', 'scriptwriter', 'screenplay', 'Writing'],
             'actor': ['perform', 'act', 'acting', 'actor', 'performance', 'role'],
             'vision': ['shot', 'frames', 'picture', 'visual', 'vision', 'photography', 'cinematography', 'scenery'],
             'music': ['music', 'musical', 'song', 'sound', 'background', 'soundtrack'],
             'design': ['costume', 'makeup', 'make up', 'clothes']
             }
emotinal_key_words = {'plot':  ['plot', 'scenario', 'script', 'scriptwriter', 'screenplay', 'Writing'],
                      'actor': ['actor','actress'],
                      'vision': ['shot', 'frames', 'picture', 'visual', 'vision', 'photography', 'cinematography', 'scenery'],
                      'music': ['music', 'musical', 'song', 'sound', 'background', 'soundtrack'],
                      'design': ['costume', 'makeup', 'make up', 'clothes'],
                      'thrill': ['horr', 'thrill', 'scary', 'scared', 'flinch', 'shock'],
                      'touched': ['romantic', 'cry', 'warm', 'touched', 'moved','tears'],
                      'happy': ['fun', 'humor','hilarious']
                      }

def load_data():
    df_id_reviews = pd.read_csv(id_reviews, header=0)
    df_movie_info = pd.read_csv(movie_info, header=0)
    return df_id_reviews, df_movie_info

# """
# output: this is the df_id_reviews,df_movie_info output
#
# movie_id                                            reviews
# 0    tt0091064  David Cronenberg redefined what we think of as...
# ..         ...                                                ...
# 999  tt0111161  Why do I want to write the 234th comment on Th...
# ------------------------------------------------------
# Unnamed: 0                                              Title  Year  \
# 0             0                                            The Fly  1986   
# ..          ...                                                ...   ...   
# 999         999                           The Shawshank Redemption  1994   
# ------------------------------------------------------
# Rated     Released  Runtime  \
# 0            R  15 Aug 1986   96 min    
# ..         ...          ...      ...   
# 999          R  14 Oct 1994  142 min   
# ------------------------------------------------------
#                                          Genre  \
# 0                             Drama, Horror, Sci-Fi    
# ..                                              ...    
# 999                                           Drama  
# ------------------------------------------------------
#                                               Director  \
# 0                                     David Cronenberg    
# ..                                                 ...   
# 999                                     Frank Darabont   
# ------------------------------------------------------
#                                   Writer  \
# 0    George Langelaan (short story), Charles Edward...     
# ..                                                 ...     
# 999  Stephen King (short story "Rita Hayworth and S...  
# ------------------------------------------------------
#                                   Actors  \
# 0    Jeff Goldblum, Geena Davis, John Getz, Joy Bou...   
# ..                                                 ...    
# 999  Tim Robbins, Morgan Freeman, Bob Gunton, Willi...
# ------------------------------------------------------
#                                     Plot  \
# 0    Seth Brundle (Jeff Goldblum), a brilliant but ...   
# ..                                                 ...    
# 999  Chronicles the experiences of a formerly succe... 
# ------------------------------------------------------
#                                        Language                     Country  \
# 0                                       English             USA, UK, Canada   
# ..                                          ...                         ...   
# 999                                     English                         USA 
# ------------------------------------------------------
#                                       Awards  \
# 0        Won 1 Oscar. Another 5 wins & 11 nominations.   
# ..                                                 ...    
# 999  Nominated for 7 Oscars. Another 19 wins & 32 n...   
# ------------------------------------------------------
# Ratings  Metascore  imdbRating  \
# 0    [{'Source': 'Internet Movie Database', 'Value'...       79.0         7.5   
# ..                                                 ...        ...         ...    
# 999  [{'Source': 'Internet Movie Database', 'Value'...       80.0         9.3   
# ------------------------------------------------------
# imdbVotes     imdbID   Type  
# 0      138,188  tt0091064  movie  
# ..         ...        ...    ...   
# 999  2,005,476  tt0111161  movie
# ------------------------------------------------------



In [2]:
# train a word vector model
def find_similar_words(data):
    data.head()
    sentences=[[token.strip(string.punctuation).strip() for token in nltk.word_tokenize(doc.lower())
                if token not in string.punctuation and len(token.strip(string.punctuation).strip()) >= 2]
               for doc in data['reviews']]
    print(sentences[0:2])
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wv_model = word2vec.Word2Vec(sentences, min_count=5, size=200, window=10, workers=4)
    return wv_model

In [3]:
"""
input:  dataframe df_id_reviews
output: output df_id_reviews to .csv file
"""
def score_all_movie_by_keys(df_id_reivews):
    id_score_rows = []
    for _, row in df_id_reivews.iterrows():
        review = row['reviews']
        movie_score = score_movie_by_keys(review)
        movie_score['movie_id'] = row['movie_id']
        id_score_rows.append(movie_score)
    df_movie_score = pd.DataFrame(id_score_rows)
    df_movie_score = df_movie_score[['movie_id', 'actor', 'design', 'happy', 'music', 'plot', 'thrill', 'touched', 'vision']]
    df_movie_score.to_csv(movies_score)

In [4]:
"""
input: review for one movie
output: dict  score sum of this movie
"""
def score_movie_by_keys(review):
    sentences = sent_tokenize(review)
    sid = SentimentIntensityAnalyzer()
    keyword_score = {'plot': 0,'actor': 0,'vision': 0,'music': 0,'design': 0,'thrill': 0,'touched': 0,'happy': 0}
    for key in keyword_score.keys():
        for sentence in sentences:
            for word in emotinal_key_words[key]:
                if word in sentence:
                    ss = sid.polarity_scores(sentence)
                    keyword_score[key] += ss['compound']
            keyword_score[key] = float("{0:.2f}".format(keyword_score[key]))
    return keyword_score

# """
# output: this is keyword_score output
#
# {'plot': -0.07, 'actor': 8.35, 'vision': 4.53, 'music': -1.24, 'design': 0.0, 'thrill': -1.39, 'touched': 0.0, 'happy': 0.77}
# ...
# {'plot': -0.87, 'actor': 2.94, 'vision': -0.07, 'music': 0.0, 'design': 0.0, 'thrill': -0.42, 'touched': -0.55, 'happy': 0.0}
# """

In [5]:
"""
input : list const movies' name
output:  df average score of all dimensions
"""
def score_new_movies(movie_titles):
    list_dict_dimension_score = []
    for movie_title in movie_titles:
        # 1 get the dict_id_reviews
        dict_id_reviews = find_seed_movie_reivew(movie_title)
        # 2 get the dict_dimension_score
        if dict_id_reviews is not None:
            dict_dimension_score = score_movie_by_keys(dict_id_reviews['reviews'])
            list_dict_dimension_score.append(dict_dimension_score)
    if len(list_dict_dimension_score) is 0:
        return None
    df_dimension_score = pd.DataFrame(list_dict_dimension_score)
    list_average_score = df_dimension_score.mean(axis=0)
    df_average_score = pd.DataFrame(list_average_score).transpose()
    print(df_average_score)
    return df_average_score

# """
# output: this is the df_average_score output
#
#     actor  design  happy  music   plot  thrill  touched  vision
# 0  5.485   0.086  1.443   0.19  0.186  -0.043   -0.257   1.038
# """

In [6]:
def find_seed_movie_reivew(movie_title):
    omdb_url = omdb_api.format(movie_title)
    r = requests.get(omdb_url)

    print("Finding movie...")
    if r.status_code is not 200:
        print("Can not find this movie, please check the movie name")
        return
    jdata = r.json()
    if jdata['Response'] == 'False':
        return
    movie_id = jdata['imdbID']
    movie_title = jdata['Title']
    print("Name :" + movie_title , "ID :" + movie_id)
    id_reviews_dict = {'movie_id': movie_id, 'movie_title':movie_title, 'reviews': ""}
    url_review_one = url_review_all.format(movie_id)

    print("Finding movie reviews...")
    page = requests.get(url_review_one)
    soup = BeautifulSoup(page.content, 'html.parser')
    review_list = soup.select('div.text.show-more__control')
    for review in review_list:
        text = review.contents[0]
        if isinstance(text, str):
            id_reviews_dict['reviews'] = id_reviews_dict['reviews'] + text
    return id_reviews_dict

# """
# output: this is the id_reviews_dict output

# {'movie_id': 'tt0887912', 'movie_title': 'The Hurt Locker', 'reviews': 'I am truly sadden that this film got bashed so much. I hear reviews saying this film "
# sucks" or it has too many inaccuracies. Movies like Saving Private Ryan and Schindler\'s List have also "some" inaccuracies in them. (They are Master
# pieces) It is pretty sad this film has been getting this bashed. It doesn\'t deserve to be.Quite easily the best movie of 2009 and the best war movie sin
# ...
# ters and film-making is so realistic, you could have sworn that you were there. This movie has given me a newfound respect for Kathryn Bigelow as a 
# director. *SPOILER* The scene of Beckham as a body bomb is one of the most powerful scenes I\'ve witnessed in a film. James\'s emotions during th
# at scene aren\'t over-acted, but entirely natural.'}
# """

In [None]:
if __name__ == '__main__':
    #1 load .csv file to dataframe
    df_id_reviews, df_movie_info = load_data()
    df_movie_score = pd.read_csv(movies_score, header=0)
    df_movie_score = df_movie_score[['actor','design','happy','music','plot','thrill','touched','vision']]
    repeat = '1'
    while repeat == '1':
        print("\nplease input movies names, each name splited by ',':")
        # 2 let user input movies' names
        string_seed_words = input()
        list_seed_words = string_seed_words.strip().split(",")
        # 3 get dataframe of average score of  input movies
        df_average_score = score_new_movies(list_seed_words)
        #4 calculate cosine similarity and distance 
        cosine_similarity = linear_kernel(df_average_score, df_movie_score).flatten()
        #5 sort distance
        indexs = sorted(range(len(cosine_similarity)), key=lambda i: cosine_similarity[i])[-10:]
        
        #6 print all closest movies
        indexs = list(reversed(indexs))
        print(df_movie_info.iloc[indexs, 1])
        print("input 1 to continue or 0 to quit")
        repeat = input()


please input movies names, each name splited by ',':
Forrest Gump, Schindler's List
Finding movie...
Name :Forrest Gump ID :tt0109830
Finding movie reviews...
Finding movie...
Name :Schindler's List ID :tt0108052
Finding movie reviews...
   actor  design  happy  music  plot  thrill  touched  vision
0   0.74     0.0   0.98  1.235   0.8   -0.98    1.065    1.99
306                 Cabaret
883     Singin' in the Rain
11          West Side Story
273    The Greatest Showman
669             Sing Street
608                 Aladdin
574      The Sound of Music
453            My Fair Lady
242                 Control
582     Fiddler on the Roof
Name: Title, dtype: object
input 1 to continue or 0 to quit
