# Recommendation Part 2:
## Using movie name to get recommendation result

* To run this notebook successfully, install the following packages first:
* !pip install gensim
* !pip install nltk
* !pip install wordcloud

In [None]:
# the package used for this file
import pandas as pd
import numpy as np
from gensim import corpora, models, similarities
from gensim.parsing.preprocessing import STOPWORDS
import nltk
from nltk import sent_tokenize

### Preparation job 1: Text Mining


In [None]:
# load the movie table from disk and key it by the values of its 'index' column
a = pd.read_csv('project_data', index_col='Unnamed: 0').set_index('index')

In [None]:
# convert dataframe to nested dictionary

# Columns whose cells hold a stringified list such as "['a', 'b']".
_LIST_COLUMNS = frozenset(
    ['director', 'genres', 'keywords', 'release_country', 'stars', 'writers']
)

# Order in which the keys are written into every per-movie record.
_RECORD_COLUMNS = (
    'director', 'genres', 'keywords', 'rating_value', 'release_country',
    'release_date', 'reviews', 'stars', 'storyline', 'time', 'writers',
)


def _split_list_literal(raw):
    """Turn a stringified list like "['a', 'b']" into ['a', 'b']."""
    # Drop the surrounding brackets, split on ', ' and peel the quote characters.
    return [item.strip("'") for item in raw[1:-1].split(', ')]


def _split_reviews(raw):
    """Turn the stringified review list into a list of review strings."""
    # Double quotes are normalised to single quotes first so that the
    # "', '" separator matches regardless of how a review was quoted.
    return raw.replace('"', "'").strip("['").strip("']").split("', '")


def dataframe_to_dict(df):
    """Convert the movie dataframe into a nested ``{label: {field: value}}`` dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns director, genres, keywords, rating_value,
        release_country, release_date, reviews, stars, storyline, time and
        writers.

    Returns
    -------
    dict
        One record per index label of ``df``.  In each record:

        * list-like columns (director, genres, keywords, release_country,
          stars, writers) become a list of strings, or the string 'NaN' when
          the cell is null;
        * rating_value becomes a float, or 0 when the cell is null;
        * reviews becomes a list of strings and reviews_number its length; a
          null cell or an empty serialized list ('[]') gives 'NaN' and 0;
        * release_date, storyline and time are copied unchanged, or replaced
          by the string 'NaN' when null.
    """
    movies = {}
    for label in df.index:
        record = {}
        for column in _RECORD_COLUMNS:
            raw = df.at[label, column]
            present = pd.notnull(raw)
            if column in _LIST_COLUMNS:
                record[column] = _split_list_literal(raw) if present else 'NaN'
            elif column == 'rating_value':
                record[column] = float(raw) if present else 0
            elif column == 'reviews':
                # The cell is a string read from CSV, so an empty review list
                # shows up as the text '[]' and has to be detected as text.
                if present and raw.strip() != '[]':
                    record['reviews'] = _split_reviews(raw)
                    record['reviews_number'] = len(record['reviews'])
                else:
                    record['reviews'] = 'NaN'
                    record['reviews_number'] = 0
            else:
                record[column] = raw if present else 'NaN'
        movies[label] = record
    return movies

In [None]:
# store keywords, genres, storyline, and reviews of each movie into a list as our reference doc
def _field_text(value):
    """Return one movie field as plain text.

    The string 'NaN' is the missing-value marker written by
    dataframe_to_dict and contributes no text.  Any other string is returned
    unchanged, and a list of strings is joined with single spaces.
    """
    if isinstance(value, str):
        return '' if value == 'NaN' else value
    return ' '.join(value)


new_dict = dataframe_to_dict(a)
doc_list = []
for record in new_dict.values():
    # Order of the pieces: [keywords] storyline genres reviews.
    pieces = [_field_text(record[field]) for field in ('storyline', 'genres', 'reviews')]
    if record['keywords'] != 'NaN':
        pieces.insert(0, _field_text(record['keywords']))
    # Missing fields are dropped before joining: joining the 'NaN' marker
    # itself would spread it out as 'N a N' and leave stray tokens behind.
    # Double quotes are stripped from the finished document.
    doc_list.append(' '.join(pieces).replace('"', ''))

### Preparation job 2: Similarity
#### Given a corpus of movie documents, when a new movie arrives, find the documents that are similar to it

In [None]:
# build the LSI model from the reference documents

# Step 1: tidy every document sentence by sentence (trim each sentence,
# remove embedded newlines) and glue the sentences back together.
for position, document in enumerate(doc_list):
    tidy_sentences = [
        sentence.strip().replace('\n', '')
        for sentence in sent_tokenize(document)
    ]
    doc_list[position] = '. '.join(tidy_sentences)

# Step 2: lower-case, whitespace-tokenise and keep only alphanumeric
# tokens that are not stop words.
texts = []
for story in doc_list:
    tokens = [
        word
        for word in story.lower().split()
        if word.isalnum() and word not in STOPWORDS
    ]
    texts.append(tokens)

# Step 3: vocabulary -> bag-of-words corpus -> 400-topic LSI model.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=400)

In [None]:
# find similarities of movies based on movie content given the user input - a movie name 

# Holds the similarity index together with the model and corpus it was built
# from, so the index is rebuilt only when either of them is replaced.
_similarity_cache = {}


def get_similar_movies(movie_name):
    """Rank every reference document by LSI similarity to the given movie.

    Parameters
    ----------
    movie_name : label
        A label of ``a.index``; its position selects the movie's document in
        ``doc_list``.  The lookup uses ``a.index.get_loc``, so a name that is
        not in the index fails there.

    Returns
    -------
    list of (int, float-like)
        ``(position, similarity)`` pairs, one per document of ``corpus``,
        sorted by descending similarity.
    """
    input_doc = doc_list[a.index.get_loc(movie_name)]
    vec_bow = dictionary.doc2bow(input_doc.lower().split())
    vec_lsi = lsi[vec_bow]

    # The index depends only on the model and the corpus, not on the query,
    # so build it once and reuse it across calls.
    if (_similarity_cache.get('model') is not lsi
            or _similarity_cache.get('corpus') is not corpus):
        _similarity_cache['index'] = similarities.MatrixSimilarity(lsi[corpus])
        _similarity_cache['model'] = lsi
        _similarity_cache['corpus'] = corpus

    sims = _similarity_cache['index'][vec_lsi]
    return sorted(enumerate(sims), key=lambda item: -item[1])

In [None]:
# get dataframe with only numerical values 
df_grade = pd.read_csv('grade',index_col = 'Unnamed: 0')

# get dataframe with both categorical and numerical values and normalize it
a2 = pd.read_csv('project_data',index_col = 'Unnamed: 0')
a2.set_index('index',inplace = True)

# NOTE(review): assigning a Series aligns on index labels, so this assumes the
# index of the 'grade' file matches the values of a2's 'index' column - confirm.
# The division by 10 presumably maps 0-10 grades onto 0-1 - confirm.
a2['director1'] = df_grade['director']/10
a2['release_country1'] = df_grade['release_country']/10
a2['stars1'] =  df_grade['stars']/10
a2['writers1'] = df_grade['writers']/10

# Placeholder for the content-similarity score, one zero per movie.  It is
# sized from the frame itself so it works for any number of rows.
a2['key_story_genre_review'] = np.zeros(len(a2))

# Movies without a rating count as 0.
a2['rating1'] = a2['rating_value'].fillna(0)/10

## Recommendation core job:
### Using movie name to get recommendation result
* movie score is between 0 - 1
* scoring rule: heavy weight on content similarity (85%) + minor weight on rating value (5%) + other attributes (10% in total: director, stars, writers, and release country at 2.5% each)

In [None]:
# score movies between 0 - 1 based on our scoring rule
def get_scores(movie_name):
    """Score every movie against ``movie_name`` and return the ten best.

    The total score is
    ``0.85 * content + 0.05 * rating1
    + 0.025 * (director1 + stars1 + writers1 + release_country1)``,
    where ``content`` is the similarity from get_similar_movies rescaled
    with ``s / 2 + 0.5``.

    Side effects: overwrites the 'key_story_genre_review' and 'total_score'
    columns of the module-level dataframe ``a2``.

    Parameters
    ----------
    movie_name : label
        Passed unchanged to get_similar_movies.

    Returns
    -------
    pandas.DataFrame
        The ten rows of ``a2`` with the highest total score, restricted to
        the descriptive columns plus 'total_score'.
    """
    sims = get_similar_movies(movie_name)

    # Fill a plain array by position and assign the column in one go.
    # Writing through a2[col].iloc[k] is chained assignment, which is not
    # guaranteed to modify a2 itself.
    content = a2['key_story_genre_review'].to_numpy(dtype=float, copy=True)
    for position, similarity in sims:
        content[position] = similarity / 2 + 0.5
    a2['key_story_genre_review'] = content

    a2['total_score'] = (
        (a2.director1 + a2.stars1 + a2.writers1) * 0.025
        + a2.release_country1 * 0.025
        + a2.key_story_genre_review * 0.85
        + a2.rating1 * 0.05
    )
    shown_columns = [
        'director', 'genres', 'keywords', 'rating_value', 'release_country',
        'release_date', 'reviews', 'stars', 'storyline', 'time', 'writers',
        'total_score',
    ]
    return a2.sort_values(by='total_score', ascending=False)[shown_columns].iloc[0:10]

In [None]:
# ask the user for the movie name the recommendations are based on
prompt = 'Please enter the relevant movie name you want to search: '
your_movie = input(prompt)

In [None]:
# show the ten highest-scoring movies for the name entered above
get_scores(your_movie)

## Simple analysis: Word Clouds
* Using the function in this part, you can enter any movie name and get a general idea of its content from a word cloud.

In [None]:
# draw out the wordcloud of a given movie 
def get_wordcloud(movie_name):
    """Display a word cloud built from the reference document of a movie.

    Parameters
    ----------
    movie_name : label
        A label of ``a.index``; its position selects the movie's document in
        ``doc_list``.  The lookup uses ``a.index.get_loc``, so a name that is
        not in the index fails there.

    Returns
    -------
    None
        The cloud is shown with matplotlib.
    """
    # Imported here because the notebook's import cell does not bring in
    # either name.
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    text = doc_list[a.index.get_loc(movie_name)]

    # Remove unwanted words (plain substring removal, so 'movies' loses its
    # 'movie' part as well).
    for unwanted in ('movie', 'film'):
        text = text.replace(unwanted, ' ')

    # Remove words with length less than 4.  Filtering the tokens in a single
    # pass also catches short words at the very start or end of the text.
    text = ' '.join(token for token in text.split() if len(token) > 3)

    wordcloud = WordCloud(
        stopwords=STOPWORDS,
        background_color='white',
        width=4000,
        height=4000,
    ).generate(text)

    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()

In [None]:
# example: draw the word cloud for the movie named 'Tentacles'
get_wordcloud('Tentacles')