In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder

In [None]:
# Root folder of the dataset, defined once so the notebook can be pointed at a
# different location by editing a single line.
DATA_DIR = Path(r"D:\dataset")
ENCODING_DIR = DATA_DIR / "encoding"

# Hold-out configuration for the train/test split of the user vectors.
TEST_SIZE = 0.2
SPLIT_SEED = 8

# Per-user rows; looked up by "username" when building the neighbour frame.
users = pd.read_csv(ENCODING_DIR / "user_frame.csv")
# One vector per user; split below into a training part (fitted by the
# nearest-neighbour model) and a test part (used by the evaluation).
user_vectors = pd.read_csv(ENCODING_DIR / "user_vectors.csv")

uv_train, uv_test = train_test_split(
    user_vectors, test_size=TEST_SIZE, random_state=SPLIT_SEED
)

# Ratings used to pick each neighbour's recommendation ("username", "type", "title").
ratings = pd.read_csv(ENCODING_DIR / "ratings_frame.csv")
# Anime catalogue; used to map a recommended "title" back to its "anime_id".
anime = pd.read_csv(DATA_DIR / "anime_cleaned.csv")

Label-encode the `username` column so that usernames can be recovered after the K-Nearest Neighbours computation

In [None]:
# Two independent encoders: one for the training usernames, one for the test
# usernames. The integer codes are therefore NOT comparable between the splits.
encoder, t_encoder = LabelEncoder(), LabelEncoder()

# fit_transform learns the classes and encodes in a single pass.
username_enc = encoder.fit_transform(uv_train["username"])
test_enc = t_encoder.fit_transform(uv_test["username"])

# `x` / `t` are the fitted encoders themselves (LabelEncoder.fit returns self);
# later cells use them to map codes back to usernames, e.g.
# x.inverse_transform([code]).
x, t = encoder, t_encoder

# Copies of the splits in which "username" holds the integer code instead of
# the original string; the column keeps its position in the frame.
final_vectors = uv_train.assign(username=username_enc)
test_vectors = uv_test.assign(username=test_enc)

# Plain numpy views of the encoded frames, one row per user.
final_arrays = final_vectors.to_numpy()
test_arrays = test_vectors.to_numpy()

# Show the (un-encoded) training split as the cell output.
uv_train

Compute K-Nearest Neighbours

In [None]:
def neighbours(user, k=5):
    """Return the rows of ``final_arrays`` that are most similar to ``user``.

    Parameters
    ----------
    user : array-like
        A single user vector laid out like a row of ``final_arrays``: the
        label-encoded username at position 0, followed by the feature values.
    k : int, default 5
        Number of neighbours to request. Capped at the number of training
        rows so a small training set does not raise.

    Returns
    -------
    numpy.ndarray
        Nested array (one inner array for the single query) holding row
        indexes into ``final_arrays``, as returned by
        ``NearestNeighbors.kneighbors(..., return_distance=False)``.
    """
    user = np.asarray(user)

    # Column 0 is only an identifier (an arbitrary integer produced by the
    # label encoder). Feeding it to the cosine metric would let that large,
    # meaningless number dominate the similarity, so it is excluded from both
    # the fitted data and the query. Row order is untouched, so the returned
    # indexes still address rows of final_arrays.
    features = final_arrays[:, 1:]
    query = user[1:]

    nc = NearestNeighbors(n_neighbors=min(k, len(features)), metric="cosine")  # alternative metric: euclidean
    nc.fit(features)  # fit the model with the user vector data (training)
    return nc.kneighbors([query], return_distance=False)

def sim_frame(neighbours):
    """Build a DataFrame with the ``users`` rows of the given nearest neighbours.

    Parameters
    ----------
    neighbours : array-like
        Nested array of row indexes into ``final_arrays`` (the value returned
        by the ``neighbours`` function); only the first inner array is read.

    Returns
    -------
    pandas.DataFrame
        The rows of ``users`` whose "username" matches each neighbour, in
        neighbour order. Empty (but with the columns of ``users``) when there
        are no neighbours or no matching rows.

    Notes
    -----
    As a side effect this widens the global pandas display limits
    (``display.max_rows`` and ``display.max_columns``) so the returned frame
    is shown in full by the notebook.
    """
    # Column 0 of final_arrays holds the label-encoded username; cast to int so
    # the codes are valid indexes for the encoder even if the array is float.
    encoded_ids = [int(final_arrays[i][0]) for i in neighbours[0]]  # loop through nested array
    sim_names = x.inverse_transform(encoded_ids) if encoded_ids else []  # usernames of similar users

    # Collect the matching rows first and concatenate once, instead of growing
    # a frame inside the loop (which re-copies it on every iteration).
    matches = [users.loc[users["username"] == name] for name in sim_names]
    if matches:
        neighbour_frame = pd.concat(matches)
    else:
        neighbour_frame = pd.DataFrame(columns=users.columns)

    pd.options.display.max_rows = 999
    pd.set_option('display.max_columns', 500)
    return neighbour_frame

# Example query: one user vector laid out like a row of final_arrays
# (encoded username first, feature values after it).
example_user = np.array([6490, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0])

# Row indexes of the most similar training users, then their `users` rows.
neighbour_ids = neighbours(example_user)
neighbour_frame = sim_frame(neighbour_ids)

# Display the neighbours as the cell output.
neighbour_frame

Create anime recommendations

In [None]:

def recommendations(neighbours, media_type):
    """Pick one title per neighbour to recommend.

    For every username in ``neighbours`` the first row of that user's entries
    in ``ratings`` with the requested type is taken as the recommendation.
    NOTE(review): this presumably relies on ``ratings`` being pre-sorted so
    that each user's best / most popular title comes first -- confirm against
    the code that writes ratings_frame.csv.

    Parameters
    ----------
    neighbours : pandas.DataFrame
        Frame with a "username" column (the output of ``sim_frame``).
    media_type : str
        "TV" selects TV shows; any other value selects movies.

    Returns
    -------
    list
        Titles of the recommended anime, one per neighbour that has at least
        one entry of the requested type. May contain duplicates when several
        neighbours share the same first title.
    """
    wanted_type = "TV" if media_type == "TV" else "Movie"

    suggestions = []  # titles of recommended anime (change to ids when creating the website)
    for username in neighbours["username"]:
        user_ratings = ratings.loc[ratings["username"] == username]
        show_ratings = user_ratings[user_ratings["type"] == wanted_type]

        # Skip only users with no entry of the requested type. A user with a
        # single entry still has a valid first row to recommend.
        if len(show_ratings) > 0:
            suggestions.append(show_ratings.iloc[0]["title"])

    if len(suggestions) == 0:
        label = "TV shows" if wanted_type == "TV" else "movies"
        print(f"No {label} to recommend :/")

    return suggestions

# TV recommendations derived from the example user's neighbours (cell output).
recommendations(neighbours=neighbour_frame, media_type="TV")

Using the K-Nearest Neighbours model fitted on the training data, evaluate the anime recommended to each user in the test set


In [None]:
# Full per-user anime lists; the evaluation reads the "username", "anime_id",
# "my_watched_episodes" and "my_score" columns from this frame.
animelists_csv = r"D:\dataset\animelists_cleaned.csv"
real_ratings = pd.read_csv(animelists_csv)

In [None]:
def evaluation():
    """Score the TV recommendations produced for every user in the test split.

    For each row of ``test_arrays`` the nearest training neighbours are found,
    one TV title is recommended per neighbour, and the recommendations are
    compared with the user's own entries in ``real_ratings`` (restricted to
    shows with more than 3 watched episodes):

    * 1 point if the user has watched the recommended show,
    * 1 further point if the user scored it 7 or higher ("liked" it).

    Recommended titles that cannot be found in ``anime`` are ignored.

    Returns
    -------
    tuple
        ``(avg_score, avg_shows_watched, avg_likes)`` over all test users,
        where ``avg_score`` is a percentage of the 10 points available.
        The same three values are also printed.
    """
    # max number of points is 2 x number of neighbours (k = 5): 10
    max_points = 10

    # title -> anime_id lookup, built once instead of scanning `anime` for
    # every recommendation. The first row wins when a title is duplicated.
    unique_titles = anime.drop_duplicates(subset="title")
    title_to_id = dict(zip(unique_titles["title"], unique_titles["anime_id"]))

    # Use the original ratings table with all shows a user has rated instead of
    # simply the top 10; the "watched" filter does not depend on the user, so
    # it is applied once up front.
    watched_all = real_ratings.loc[real_ratings["my_watched_episodes"] > 3]

    overall_scores = []
    num_likes = []
    num_shows_watched = []
    for user in test_arrays:
        user = np.array(user)
        # Position 0 holds the test-encoder code of the username.
        username = t.inverse_transform([int(user[0])])[0]

        nframe = sim_frame(neighbours(user))
        user_recommendations = recommendations(nframe, "TV")

        # anime_id -> my_score for the shows this user has watched.
        rt = watched_all.loc[watched_all["username"] == username]
        rt = rt.drop_duplicates(subset="anime_id")
        score_by_id = dict(zip(rt["anime_id"], rt["my_score"]))

        score = 0
        likes = 0
        shows_watched = 0
        # Check whether recommended shows are present in the user's own ratings
        for show in user_recommendations:
            a_id = title_to_id.get(show)
            if a_id is None or a_id not in score_by_id:
                continue
            shows_watched += 1
            score += 1  # user has watched the recommended show before
            if score_by_id[a_id] >= 7:  # user "likes" the show if they rated it >= 7
                score += 1  # extra point if the user liked the show
                likes += 1

        score = (score / max_points) * 100
        num_likes.append(likes)
        overall_scores.append(score)
        num_shows_watched.append(shows_watched)

        # TODO: remove percentage
        # TODO: standard deviation of results
        # TODO: distribution of scores - how many scores are 1,2,...,10; plot on graph
        # TODO: potentially take top 2 instead of top 1

    avg_score = np.average(overall_scores)
    avg_likes = np.average(num_likes)
    avg_shows_watched = np.average(num_shows_watched)
    print(f"Average overall recommendation score for Test users: {avg_score}%")
    print(f"Average number of recommended shows that users have watched before: {avg_shows_watched}")
    print(f"Average number of recommended anime that users liked: {avg_likes}")
    return avg_score, avg_shows_watched, avg_likes

# Run the evaluation over the whole test split; the returned tuple
# (avg_score, avg_shows_watched, avg_likes) is shown as the cell output.
evaluation()
# TODO: include an analysis of the distance metric between neighbours, e.g. cosine vs. euclidean

Function that converts user information from website into vector