In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy
from numba import njit
from numba import jit
from numba import vectorize
from timeit import default_timer as timer
from datetime import timedelta

from concurrent.futures import ThreadPoolExecutor
import threading

movies = pd.read_csv('movie_archive\movies_metadata.csv')
ratings_small = pd.read_csv('movie_archive\\ratings_small.csv')
links_small = pd.read_csv('movie_archive\links_small.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [2]:
# compute all movie vector form specific user into one vector for each user
all_user_profiles = {}
ram = 12    # in GB
count_to_big = 0
count_false_int = 0

# get all movie vector rated from user
def get_user_rated_movies(ids):
    
        movie_vector_list = []
        for i in ids:
            tmdbId = links_small['tmdbId'].loc[links_small['movieId'] == i]
            try:
                index = movieid_list.index(int(tmdbId))
                movie_vector = tfidf_matrix[index]
                movie_vector_list.append(movie_vector)
            except ValueError:
                #print ("ValueError tmdbId: " + str(tmdbId))     # deleted/NaN movies
                count_false_int = count_false_int + 1


        movie_vectors = scipy.sparse.vstack(movie_vector_list)

        return movie_vectors


def create_user_vector_jit(user_rated_movies_vector_array):
        
        user_vector = user_rated_movies_vector_array.sum(axis=0)

        return user_vector


def create_user_vectors(uid):
        
        user_profile = ratings_small.loc[ratings_small['userId'] == uid]
            
        user_rated_movies_vector_list = get_user_rated_movies(user_profile['movieId'].to_list())
        n = user_rated_movies_vector_list.shape[0]

        # variable threshold (change ram)
        thresh = (n * user_rated_movies_vector_list.shape[1] / 8) * 64
        thresh_ram = ram * 1000000000   

        if thresh > thresh_ram:
            count_to_big = count_to_big + 1

        else:
            
            user_vectors_array = scipy.sparse.csr_matrix.toarray(user_rated_movies_vector_list)
            user_vectors_array.reshape(user_rated_movies_vector_list.shape)

            user_vector = create_user_vector_jit(user_vectors_array)

            user_norm = sklearn.preprocessing.normalize(scipy.sparse.csr_matrix(user_vector))
            all_user_profiles[uid] = user_norm


# Sorting Threading
recommendation_users = {}

def sort_user_vectors(uid):
    if uid in all_user_profiles.keys():
        user_vec = all_user_profiles[uid]
        cosine_similarity_user_movies = cosine_similarity(user_vec, tfidf_matrix)
        top_similar = cosine_similarity_user_movies[0].argsort()[:100:-1]
        recommendation_users[uid] = [(cosine_similarity_user_movies[0][i], movies['id'][i]) for i in top_similar]

In [3]:

feature_size_samples = [1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000, 1500000, 2000000, 2500000]

for feature_size in feature_size_samples:
    tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), stop_words='english', max_features= feature_size)
    tfidf_matrix = tfidf.fit_transform(movies['title'].apply(str) + " " + movies['overview'].apply(str))
    tfidf_matrix

    all_user_profiles = {}
    ram = 12    # in GB
    count_to_big = 0
    count_false_int = 0
    recommendation_users = {}

    userid_list = ratings_small['userId'].unique()
    movieid_list = movies['id'].to_list()

    # changes movieId type to int and deletes all false entrys
    movieid_list = [x for x in movieid_list if x.isdigit()]
    movieid_list = list(map(int, movieid_list))

    start = timer()

    with ThreadPoolExecutor(max_workers=8) as executer:
        executer.map(create_user_vectors, userid_list)

    end = timer()
    print("Feature Size: " + str(feature_size))
    print ("Time Calculate Vectors HH:MM:SS: ",timedelta(seconds=end-start))

    
    start = timer()

    with ThreadPoolExecutor(max_workers=8) as executer:
        executer.map(sort_user_vectors, userid_list)

    end = timer()
    print ("Time Sorting HH:MM:SS: ",timedelta(seconds=end-start))


    start = timer()

    percentage_already_seen = []
    for uid in userid_list:
        if uid in all_user_profiles.keys():
            user_profile = ratings_small.loc[ratings_small['userId'] == uid]
            user_recommendation = recommendation_users[uid]

            user_profile_mid_list = user_profile['movieId'].to_list()
            user_tmdb_list = []
            for m in user_profile_mid_list:
                tmdbId = links_small['tmdbId'].loc[links_small['movieId'] == m]
                try:
                    user_tmdb_list.append(int(tmdbId))
                except ValueError:
                    #print("ValueError: " + str(tmdbId))
                    count_false_int = count_false_int + 1
            
            m_count = 0
            for movie_cosine, mid in user_recommendation[:len(user_profile)]:
                try:
                    if int(mid) in user_tmdb_list:
                        m_count += 1
                except ValueError:
                    print("ValueError: " + str(mid) + " not in tmdbid list")    #Fehler weil, schon gesucht aber nicht gefunden // skipped because to big sample size

            #if m_count == len(user_profile):
            #    print(str(uid) + " : the first movies are already viewed")
            #else:
            #    print(str(m_count) + " of " + str(len(user_profile)) + " in first recommended already seen")

            p = m_count/len(user_profile)
            percentage_already_seen.append(p)

    average_percentage = sum(percentage_already_seen) / len(percentage_already_seen)
    print("Average Percentage of already seen movies: " + str(average_percentage))
    
    end = timer()
    print ("Time Percentage HH:MM:SS: ",timedelta(seconds=end-start))
    
    print("\n")


Feature Size: 1000
Time Calculate Vectors HH:MM:SS:  0:00:38.597873
Time Sorting HH:MM:SS:  0:02:22.085813
Average Percentage of already seen movies: 0.19999427516388513
Time Percentage HH:MM:SS:  0:00:16.965027


Feature Size: 2000
Time Calculate Vectors HH:MM:SS:  0:00:37.475862
Time Sorting HH:MM:SS:  0:02:22.784084
Average Percentage of already seen movies: 0.3485532128279689
Time Percentage HH:MM:SS:  0:00:13.994212


Feature Size: 5000
Time Calculate Vectors HH:MM:SS:  0:00:38.189444
Time Sorting HH:MM:SS:  0:02:32.663258
Average Percentage of already seen movies: 0.5182575471209759
Time Percentage HH:MM:SS:  0:00:13.914416


Feature Size: 10000
Time Calculate Vectors HH:MM:SS:  0:00:38.464785
Time Sorting HH:MM:SS:  0:02:25.380818
Average Percentage of already seen movies: 0.6170610853740343
Time Percentage HH:MM:SS:  0:00:14.535380


Feature Size: 20000
Time Calculate Vectors HH:MM:SS:  0:00:40.483292
Time Sorting HH:MM:SS:  0:02:29.353080
Average Percentage of already seen mov

In [None]:
# Recall Test 1
# Big Size not exact because of variable based on RAM sample size

Feature Size: 1000
Time Calculate Vectors HH:MM:SS:  0:00:38.597873
Time Sorting HH:MM:SS:  0:02:22.085813
Average Percentage of already seen movies: 0.19999427516388513
Time Percentage HH:MM:SS:  0:00:16.965027


Feature Size: 2000
Time Calculate Vectors HH:MM:SS:  0:00:37.475862
Time Sorting HH:MM:SS:  0:02:22.784084
Average Percentage of already seen movies: 0.3485532128279689
Time Percentage HH:MM:SS:  0:00:13.994212


Feature Size: 5000
Time Calculate Vectors HH:MM:SS:  0:00:38.189444
Time Sorting HH:MM:SS:  0:02:32.663258
Average Percentage of already seen movies: 0.5182575471209759
Time Percentage HH:MM:SS:  0:00:13.914416


Feature Size: 10000
Time Calculate Vectors HH:MM:SS:  0:00:38.464785
Time Sorting HH:MM:SS:  0:02:25.380818
Average Percentage of already seen movies: 0.6170610853740343
Time Percentage HH:MM:SS:  0:00:14.535380


Feature Size: 20000
Time Calculate Vectors HH:MM:SS:  0:00:40.483292
Time Sorting HH:MM:SS:  0:02:29.353080
Average Percentage of already seen movies: 0.6963516261209928
Time Percentage HH:MM:SS:  0:00:14.920387


Feature Size: 50000
Time Calculate Vectors HH:MM:SS:  0:00:42.748750
Time Sorting HH:MM:SS:  0:02:32.419798
Average Percentage of already seen movies: 0.7849682895278405
Time Percentage HH:MM:SS:  0:00:14.930865


Feature Size: 100000
Time Calculate Vectors HH:MM:SS:  0:00:45.553169
Time Sorting HH:MM:SS:  0:02:28.291945
Average Percentage of already seen movies: 0.8345539306660864
Time Percentage HH:MM:SS:  0:00:14.387165


Feature Size: 200000
Time Calculate Vectors HH:MM:SS:  0:00:54.221364
Time Sorting HH:MM:SS:  0:02:29.235787
Average Percentage of already seen movies: 0.8775904429548552
Time Percentage HH:MM:SS:  0:00:15.100360


Feature Size: 500000
Time Calculate Vectors HH:MM:SS:  0:01:18.192902
Time Sorting HH:MM:SS:  0:02:29.772707
ValueError: 1997-08-20 not in tmdbid list
ValueError: 2012-09-29 not in tmdbid list
Average Percentage of already seen movies: 0.912398952144008
Time Percentage HH:MM:SS:  0:00:14.062708


Feature Size: 1000000
Time Calculate Vectors HH:MM:SS:  0:02:14.997520
Time Sorting HH:MM:SS:  0:02:40.612262
ValueError: 1997-08-20 not in tmdbid list
ValueError: 2012-09-29 not in tmdbid list
ValueError: 1997-08-20 not in tmdbid list
ValueError: 2014-01-01 not in tmdbid list
Average Percentage of already seen movies: 0.9485361587014906
Time Percentage HH:MM:SS:  0:00:14.624399


Feature Size: 1500000
Time Calculate Vectors HH:MM:SS:  0:03:37.627740
Time Sorting HH:MM:SS:  0:02:39.536482
ValueError: 2014-01-01 not in tmdbid list
ValueError: 1997-08-20 not in tmdbid list
ValueError: 2012-09-29 not in tmdbid list
ValueError: 2014-01-01 not in tmdbid list
Average Percentage of already seen movies: 0.9655132954043382
Time Percentage HH:MM:SS:  0:00:14.390111


Feature Size: 2000000
Time Calculate Vectors HH:MM:SS:  0:04:40.599197
Time Sorting HH:MM:SS:  0:02:41.775206
ValueError: 2012-09-29 not in tmdbid list
ValueError: 1997-08-20 not in tmdbid list
ValueError: 2012-09-29 not in tmdbid list
ValueError: 2014-01-01 not in tmdbid list
ValueError: 1997-08-20 not in tmdbid list
ValueError: 2012-09-29 not in tmdbid list
ValueError: 1997-08-20 not in tmdbid list
ValueError: 2014-01-01 not in tmdbid list
Average Percentage of already seen movies: 0.9763566494759693
Time Percentage HH:MM:SS:  0:00:13.195075


Feature Size: 2500000
Time Calculate Vectors HH:MM:SS:  0:05:30.353203
Time Sorting HH:MM:SS:  0:02:44.398707
ValueError: 2012-09-29 not in tmdbid list
ValueError: 2014-01-01 not in tmdbid list
ValueError: 1997-08-20 not in tmdbid list
ValueError: 1997-08-20 not in tmdbid list
ValueError: 2012-09-29 not in tmdbid list
ValueError: 2012-09-29 not in tmdbid list
ValueError: 2014-01-01 not in tmdbid list
ValueError: 2012-09-29 not in tmdbid list
Average Percentage of already seen movies: 0.9823895517779411
Time Percentage HH:MM:SS:  0:00:12.229260