In [1]:
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd

from joblib import delayed, Parallel
from surprise import Dataset, KNNBasic, SVD
from surprise.model_selection import train_test_split
from surprise.model_selection.validation import fit_and_score, print_summary

movies100k = pd.read_csv('./ml100k.u.item', names=['movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url'], delimiter='|', engine='python',encoding = "latin-1", usecols=range(5))
movies1m = pd.read_csv('./ml1m.movies.dat', names=['movie_id', 'movie_title', 'ratings'], delimiter='::', engine='python',encoding = "latin-1")

data100k = Dataset.load_builtin('ml-100k')
data1m = Dataset.load_builtin('ml-1m')

data_training, data_testing = train_test_split(data100k, random_state=22020, train_size=0.80)
data_big_training, data_big_testing = train_test_split(data1m, random_state=22020, train_size=0.80)

In [2]:
# Util functions.

from collections import defaultdict


def getTopNRecommendations(predictions, n=5):
    # code from https://surprise.readthedocs.io/en/stable/FAQ.html#how-to-get-the-top-n-recommendations-for-each-user
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

def getTopRecommendationsByUserId(predictions, userId, is1m=False, n=5):
    top_n = getTopNRecommendations(predictions, n)
    userRating = top_n.get(userId)
    
    it = 1
    for iid, rating in userRating:
        if is1m:
            movieTitle = movies100k.loc[movies100k['movie_id'] == int(iid)]['movie_title']
        else:
            movieTitle = movies1m.loc[movies1m['movie_id'] == int(iid)]['movie_title']
        print()
        print(str(it) + ". " + movieTitle.values + ", Rating: " + str(round(rating, 2)))
        it+=1

def runAlgo(algorithm, data, measures):
    data_training, data_testing = train_test_split(data, random_state=22020, train_size=0.80)
    return fit_and_score(algorithm, data_training, data_testing, measures, True)

def customCrossValidate(algorithm, data):
    # manches wurde hier aus der Surprise-Library-Methode "cross_validate" verwendet. diese funktion wurde angepasst, da nicht mit Folds gearbeitet werden sollte.
    measures = [m.lower() for m in ['MSE']]

    delayed_list = (
        delayed(runAlgo)(algorithm, data, measures)
        for i in range(5)
    )

    out = Parallel(n_jobs=-1,pre_dispatch='2*n_jobs')(delayed_list)
    (test_measures_dicts, train_measures_dicts, fit_times, test_times) = zip(*out)

    test_measures = defaultdict(dict)
    train_measures = defaultdict(dict)
    
    for m in measures:
        test_measures[m] = np.asarray([d[m] for d in test_measures_dicts])
        train_measures[m] = np.asarray([d[m] for d in train_measures_dicts])

    print_summary(algorithm, measures, test_measures, train_measures, fit_times, test_times, 5)


# Movielens 100k
## User Based CF

In [3]:
# Predict Rating for UserID 20, Movie Id
userId = 22
movieId = 20

userBasedAlgorithm = KNNBasic(sim_options={'name':'pearson', 'user_based':True})

def userBasedFiltering(dataTraining, dataTesting, is1m=False):
    algorithm = userBasedAlgorithm
    predictions = algorithm.fit(dataTraining).test(dataTesting)
    
    if dataTraining.knows_user(userId) & dataTraining.knows_item(movieId):
        algorithm.predict(str(userId), str(movieId), verbose=True)
    else:
        if dataTraining.knows_user(userId) == False:
            unknownId = "userId"
        else:
            unknownId = "movieId"
        print(unknownId + " ist unbekannt. Andere ID wählen.")

    top_n = getTopNRecommendations(predictions, n=5)

    userRecommendations = getTopRecommendationsByUserId(predictions, str(userId), is1m)

userBasedFiltering(data_training, data_testing)

customCrossValidate(userBasedAlgorithm, data100k)


Computing the pearson similarity matrix...
Done computing similarity matrix.
user: 22         item: 20         r_ui = None   est = 3.56   {'actual_k': 40, 'was_impossible': False}

['1. Silence of the Palace, The (Saimt el Qusur) (1994), Rating: 4.55']

['2. Superweib, Das (1996), Rating: 4.16']

['3. Nick of Time (1995), Rating: 4.11']

['4. Destiny Turns on the Radio (1995), Rating: 4.07']

["5. White Man's Burden (1995), Rating: 4.06"]
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Evaluating MSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MSE (testset)     1.0258  1.0258  1.0258  1.

## Item-Based CF

In [4]:
itemBasedAlgorithm = KNNBasic(sim_options={'name':"cosine", 'user_based':False})
def itemBasedFiltering(dataTraining, dataTesting, is1m=False):

    algorithm = itemBasedAlgorithm
    predictions = algorithm.fit(dataTraining).test(dataTesting)
    
    algorithm.predict(str(userId), str(movieId), verbose=True)

    getTopRecommendationsByUserId(predictions, str(userId), is1m)

itemBasedFiltering(data_training, data_testing)
customCrossValidate(itemBasedAlgorithm, data100k)

Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 22         item: 20         r_ui = None   est = 3.80   {'actual_k': 40, 'was_impossible': False}

['1. Under Siege 2: Dark Territory (1995), Rating: 4.2']

['2. Silence of the Palace, The (Saimt el Qusur) (1994), Rating: 4.2']

['3. Nick of Time (1995), Rating: 4.15']

['4. Destiny Turns on the Radio (1995), Rating: 4.13']

['5. Batman Forever (1995), Rating: 4.12']
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Evaluating MSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MSE (testset)     1.0626  1.0626  1.0626 

## SVD Based

In [5]:
svdBasedAlgorithm = SVD()
def svdBasedFiltering(dataTraining, dataTesting, is1m=False):
    algo = svdBasedAlgorithm
    predictions = algo.fit(dataTraining).test(dataTesting)
    
    algo.predict(str(userId), str(movieId), verbose=True)
        
    print("The top recommendations are: ")
    getTopRecommendationsByUserId(predictions, str(userId), is1m)

svdBasedFiltering(data_training, data_testing)
customCrossValidate(svdBasedAlgorithm, data100k)

user: 22         item: 20         r_ui = None   est = 3.41   {'was_impossible': False}
The top recommendations are: 

['1. Nick of Time (1995), Rating: 4.7']

['2. Silence of the Palace, The (Saimt el Qusur) (1994), Rating: 4.23']

['3. Superweib, Das (1996), Rating: 4.18']

['4. Wild Bill (1995), Rating: 4.09']

["5. White Man's Burden (1995), Rating: 4.07"]
Evaluating MSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MSE (testset)     0.8784  0.8813  0.8799  0.8838  0.8767  0.8800  0.0024  
MSE (trainset)    0.4696  0.4717  0.4682  0.4718  0.4705  0.4703  0.0013  
Fit time          0.47    0.46    0.46    0.46    0.46    0.46    0.01    
Test time         0.05    0.05    0.05    0.05    0.05    0.05    0.00    


# Movielens 1M
## User-Based CF

In [6]:
userBasedFiltering(data_big_training, data_big_testing, True)
customCrossValidate(userBasedAlgorithm, data1m)

Computing the pearson similarity matrix...
Done computing similarity matrix.
user: 22         item: 20         r_ui = None   est = 2.09   {'actual_k': 40, 'was_impossible': False}

['1. Half Baked (1998), Rating: 4.64']

[]

['3. Kim (1950), Rating: 4.3']

['4. Wild Bunch, The (1969), Rating: 4.28']

['5. Celestial Clockwork (1994), Rating: 4.17']
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Evaluating MSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MSE (testset)     0.9223  0.9223  0.9223  0.9223  0.9223  0.9223  0.0000  
MSE (trainset)    0.5154  0.5154  0.5154  0.5154  0.5154  0.51

## Item-Based CF

In [7]:
itemBasedFiltering(data_big_training, data_big_testing, True)
customCrossValidate(itemBasedAlgorithm, data1m)

Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 22         item: 20         r_ui = None   est = 2.75   {'actual_k': 40, 'was_impossible': False}

[]

['2. Spellbound (1945), Rating: 3.78']

['3. Bride of Frankenstein (1935), Rating: 3.7']

['4. Swept from the Sea (1997), Rating: 3.7']

[]
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Evaluating MSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MSE (testset)     0.9957  0.9957  0.9957  0.9957  0.9957  0.9957  0.0000  
MSE (trainset)    0.8108  0.8108  0.8108  0.8108  0.8108  0.8108  0.0000  
Fit time          

## SVD Based Filtering

In [8]:
svdBasedFiltering(data_big_training, data_big_testing, True)
customCrossValidate(svdBasedAlgorithm, data1m)

user: 22         item: 20         r_ui = None   est = 1.74   {'was_impossible': False}
The top recommendations are: 

[]

['2. Kim (1950), Rating: 4.18']

['3. Bride of Frankenstein (1935), Rating: 3.93']

['4. Celestial Clockwork (1994), Rating: 3.89']

[]
Evaluating MSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MSE (testset)     0.7632  0.7618  0.7619  0.7619  0.7608  0.7619  0.0007  
MSE (trainset)    0.4496  0.4482  0.4460  0.4505  0.4488  0.4486  0.0015  
Fit time          4.12    4.52    4.32    4.25    4.23    4.29    0.13    
Test time         0.82    0.74    0.83    0.82    0.79    0.80    0.03    


# Ergebnisse

> Anzumerken ist: bei den "Folds" in der Ausgabe handelt es sich nicht um Folds, sondern lediglich um die Iterationen. die "Folds"-Ausgabe ergibt sich aus der ``print_summary``-Methode, die ich aus der Library verwendet habe.

Als Algorithmen habe ich:
* einen Userbased k-Next Neighbors Algorithmus mit Pearson Correlation,
* einen Itembased k-Next Neighbors Algorithmus mit Cosine Correlation,
* sowie den SVD-Algorithmus.

In Hinblick auf die durchschnittliche Wirksamkeit (Effectiveness) in Bezug auf den **Mean Squared Error** ergibt sich folgendes (gereiht von bester nach schlechtester) - gemessen am großen Datensatz (1m):

1. **Item Based:**   0.9957 / 0.8108
2. **User Based:**   0.9223 / 0.5154
3. **SVD:**          0.7622 / 0.4493

In Hinblick auf die durchschnittliche Effizienz ergibt sich die folgende Reihung (ebenso am größeren Datensatz gemessen, um Rauschen zu vermeiden):

1. SVD:             Fit:   4.29s     -     Test:  0.80s
2. Item Based:      Fit:   7.16s     -     Test: 20.33s
3. User Based:      Fit: 123.36s     -     Test: 53.60s

Somit ergibt sich, dass SVD im Vergleich zu den anderen beiden Algorithmen ungenau ist und weniger effektiv, allerdingst perfort er sehr gut, auch bei großen Datenmengen.

Der Userbased Algorithmus dauert am längsten, erzielt aber auch bessere Ergebnisse.

Der Item Based Algorithmus zeigt sehr geringe Abweichungen bei den erwarteten Ergebnissen von den echten Ergebnissen, und braucht im Fitting nur etwas länger als SVD, allerdings sehr viel länger beim Testen der Ergebnisse.

---

Die besten Ergebnisse erzielt wohl eine Mischung aus User based und item based Algorithmus. Hier kann man wahrscheinlich die Efficiency sowie Effectiveness optimieren. SVD wird wohl eine gute Methode sein, um halbwegs gute Vorhersagen zu machen, allerdings kann man sich nicht zu sehr auf die Daten verlassen.

> Ich habe zusätzlich eine Funktion aus den Examples der Surprise-Library eingebaut, der zusätzlich die Recommendations ausgibt. Bei den Algorithmen werden unterschiedliche Recommendations gefunden, was allerdings interesstant ist, ist dass beim kleinen (100k) Datensatz in jedem Algorithmus "Silence of the Palace" gefunden wird. Das könnte allerdings damit zu tun haben, dass der Film sehr populär ist, und diese Popularität im Algorithmus nicht berücksichtigt wird.