In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import re
import string
import random
from PIL import Image
import requests
from io import BytesIO
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from gensim.models import KeyedVectors

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aryuska/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('description.csv')
df.head()



Unnamed: 0,imdbId,title,description
0,114709,Toy Story,A cowboy doll is profoundly threatened and jea...
1,113497,Jumanji,When two kids find and play a magical board ga...
2,113228,Grumpier Old Men,John and Max resolve to save their beloved bai...
3,114885,Waiting to Exhale,"Based on Terry McMillan's novel, this film fol..."
4,113041,Father of the Bride Part II,George Banks must deal not only with his daugh...


In [3]:
def _removeNonAscii(s):
    return "".join(i for i in s if ord(i)<128)

def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

In [4]:
df['cleaned'] = df['description'].apply(_removeNonAscii)

df['cleaned'] = df.cleaned.apply(func = make_lower_case)
df['cleaned'] = df.cleaned.apply(func = remove_stop_words)
df['cleaned'] = df.cleaned.apply(func=remove_punctuation)
df['cleaned'] = df.cleaned.apply(func=remove_html)

In [5]:
#splitting the description into words

corpus = []
for words in df['cleaned']:
    corpus.append(words.split())
print(corpus[0])

['cowboy', 'doll', 'profoundly', 'threatened', 'jealous', 'new', 'spaceman', 'action', 'figure', 'supplants', 'top', 'toy', 'boy', 's', 'bedroom']


In [6]:
# Downloading the Google pretrained Word2Vec Model


EMBEDDING_FILE = '/Users/aryuska/Documents/Kuliah/smt7/propo/recommendation-system/GoogleNews-vectors-negative300.bin.gz'
google_word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
# Training our corpus with Google Pretrained Model

google_model = Word2Vec(vector_size = 200, window=5, min_count = 2, workers = 1)

google_model.build_vocab(corpus)
print(google_model.corpus_count)



62471


In [8]:
#Building TFIDF model and calculate TFIDF score

tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df = 5, stop_words='english')
tfidf.fit(df['cleaned'])

# Getting the words from the TF-IDF model

tfidf_list = dict(zip(tfidf.get_feature_names_out(), list(tfidf.idf_)))
tfidf_feature = tfidf.get_feature_names_out()

In [23]:
# Building TF-IDF Word2Vec 

# Storing the TFIDF Word2Vec embeddings
tfidf_vectors = []; 
line = 0;
# for each book description
for desc in corpus: 
  # Word vectors are of zero length (Used 300 dimensions)
    sent_vec = np.zeros(300) 
    # num of words with a valid vector in the book description
    weight_sum =0; 
    # for each word in the book description
    for word in desc: 
        if word in google_model.wv and word in tfidf_feature:
            vec = google_model.wv[word]
            tf_idf = tfidf_list[word] * (desc.count(word) / len(desc))
            print(tf_idf, vec)
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_vectors.append(sent_vec)
    line += 1

0.489466224636673 [ 4.54825861e-03  3.52426409e-03 -2.30795564e-03 -1.78320764e-03
  3.44420900e-03 -1.05015875e-03 -3.81117524e-03 -2.14770366e-03
  3.35261831e-03  4.38879710e-03 -3.19087924e-03  1.26438681e-03
  2.44347216e-03 -4.31742007e-03 -8.12728424e-04  3.95536702e-03
 -8.66670627e-04  3.85270896e-03 -1.78695982e-03 -3.30757978e-03
  4.65028890e-04  4.38707473e-04 -3.51089961e-03  1.99244023e-04
  4.48510190e-03 -1.95345885e-04  1.69617240e-03  3.86921596e-03
  3.45246424e-03 -1.60619081e-03  1.65327906e-03  2.72157975e-03
 -1.80770096e-03  1.52847532e-03 -3.67033132e-03  3.43273766e-03
  1.32455712e-03  2.33313325e-03 -3.10974428e-03  2.33161100e-03
  2.72353645e-03  6.99630997e-04  2.75292405e-04 -2.40742811e-03
  3.39290267e-03  4.68550203e-03  3.51551641e-03 -3.14511289e-03
 -4.86780424e-03 -3.67824663e-03 -3.09363124e-03 -4.97264694e-03
 -5.73499827e-04  5.17402892e-04 -1.72914029e-03 -4.05600527e-03
 -3.23346374e-03 -2.40880321e-03 -1.21137267e-03  1.86565821e-03
 -1.191

ValueError: operands could not be broadcast together with shapes (300,) (200,) (300,) 

In [9]:
google_model.train(corpus, total_examples=google_model.corpus_count, epochs = 5)

(6695467, 7005030)

In [10]:
# Generate the average word2vec for the each book description

def vectors(x):
    
    # Creating a list for storing the vectors (description into vectors)
    global word_embeddings
    word_embeddings = []

    # Reading the each movie description 
    for line in df['cleaned']:
        avgword2vec = None
        count = 0
        for word in line.split():
            if word in google_model.wv:
                count += 1
                if avgword2vec is None:
                    avgword2vec = google_model.wv[word]
                else:
                    avgword2vec = avgword2vec + google_model.wv[word]
                
        if avgword2vec is not None:
            avgword2vec = avgword2vec / count
        
            word_embeddings.append(avgword2vec)

In [11]:
finalRecommend = []

In [12]:
# Recommending the Top 5 similarmovie

def recommendations(title):
    #Check title movie
    foundedMovie = df[df['title'] == title]
    if foundedMovie.empty == True:
        print("MOVIE NOT FOUND")
        return 
    
    # Calling the function vectors

    vectors(df)
    
    # finding cosine similarity for the vectors
    array_embeddings = word_embeddings
    
    cosine_similarities = cosine_similarity(array_embeddings, array_embeddings)

    movie = df[['title']]
    #Reverse mapping of the index
    indices = pd.Series(df.index, index = df['title']).drop_duplicates()
         
    idx = indices[title]
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:100]
    book_indices = [i[0] for i in sim_scores]
    recommend = movie.iloc[book_indices]
    return recommend

In [29]:
foundedMovie = df[df['title'] == 'Her']
foundedMovie


Unnamed: 0,imdbid,title,description,cleaned
20728,1798709,Her,"In a near future, a lonely writer develops an ...",near future lonely writer develops unlikely re...


In [30]:
recommendResult = recommendations("Superman")

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [26]:
recommendResult[0:10]

Unnamed: 0,title
40068,The Right to Romance
19019,"After Fall, Winter"
1782,Go Now
58531,Auf das Leben!
31359,The Disturbance at Dinner
30227,American Translation
23272,Stockholm Stories
56981,The Holiday Calendar
14357,Meng ying tong nian
35301,Ischeznuvshaya imperiya
