In [11]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd
import import_ipynb #requires pip install
from joblib import dump, load
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

Import new dataset

In [2]:
# Folder holding the pre-processed CSV exports. Edit this single constant to
# point the notebook at another copy of the dataset.
DATA_DIR = Path(r"D:\dataset\encoding")

users = pd.read_csv(DATA_DIR / "user_frame.csv")
anime = pd.read_csv(DATA_DIR / "anime_frame.csv")
ratings = pd.read_csv(DATA_DIR / "ratings_frame.csv")
def string_to_list(genres):
    """Turn the string form of a genre list (as stored in the CSV) into a list.

    Parameters
    ----------
    genres : str or list
        Text such as ``"['Action', 'Comedy']"``. A value that is already a
        list/tuple/set is returned unchanged, so re-applying this function to
        an already converted column is harmless.

    Returns
    -------
    The Python object described by the literal (normally a list of str).

    Raises
    ------
    ValueError, SyntaxError
        If the text is not a plain Python literal.
    """
    if isinstance(genres, (list, tuple, set)):
        return genres
    # literal_eval only accepts literals (lists, strings, numbers, ...), so
    # unlike eval() it cannot execute code embedded in a CSV cell.
    return ast.literal_eval(genres)

# Convert the stringified genre column of each frame back into Python lists.
for frame, column in ((users, "fav_genres"), (anime, "genre"), (ratings, "genre")):
    frame[column] = frame[column].apply(string_to_list)

In [3]:
#Build genre list
def split(genres):
    """Return the comma-separated pieces of ``genres`` after converting it to text.

    The pieces are not stripped, so any whitespace around the commas is kept.
    """
    as_text = str(genres)
    pieces = as_text.split(",")
    return pieces

def build_genre_list(dataframe):
    """Collect the distinct genre names found in a column of genre lists.

    Parameters
    ----------
    dataframe : iterable
        Despite the name, the argument is iterated directly, so it should be
        a column/Series (or any iterable) whose items are collections of
        genre names, e.g. ``users["fav_genres"]``.

    Returns
    -------
    list of str
        Unique genre names with surrounding whitespace removed, sorted
        alphabetically so the result is the same on every run.
    """
    genre_set = set()
    for genres in dataframe:
        # Missing entries (None / float NaN) carry no genres; skip them
        # instead of failing inside set.update().
        if genres is None or isinstance(genres, float):
            continue
        # update() adds the individual names rather than the list itself.
        genre_set.update(genres)

    # Strip before de-duplicating so " Comedy" and "Comedy" collapse into a
    # single entry, then drop the literal "nan" string.
    cleaned = {name.strip() for name in genre_set}
    cleaned.discard("nan")
    return sorted(cleaned)

Vectorize user records

In [9]:


# Per-user profile attributes (everything except the favourite genres).
user_cols = users.loc[
    :,
    ["username", "experience", "gender", "generation", "fav_anime_period"],
]

# Label encoder shared by vectorize_single(), which re-fits it column by column.
encoder = LabelEncoder()
#Function to create a separate 0/1 feature column for each genre
def genre_columns():
    """Build one indicator column per genre for every user.

    Reads ``users["username"]`` and ``users["fav_genres"]`` and returns a new
    frame; the shared ``user_cols`` frame is left untouched, so the function
    can be called repeatedly with the same result.

    Returns
    -------
    pandas.DataFrame
        ``username`` followed by one integer column per genre returned by
        ``build_genre_list``: 1 if the genre is among the user's favourites,
        0 otherwise. Rows keep the index and order of ``users``.
    """
    user_genres = users.loc[:, ["username", "fav_genres"]]
    genre_list = build_genre_list(user_genres["fav_genres"])

    # Whitespace-normalised favourites per user, so the names line up with the
    # stripped names in genre_list. Any number of favourites is accepted, and
    # a missing (non-list) entry simply yields no favourites.
    liked_per_user = [
        {genre.strip() for genre in genres}
        if isinstance(genres, (list, tuple, set))
        else set()
        for genres in user_genres["fav_genres"]
    ]

    genre_cols = user_genres[["username"]].copy()
    for genre in genre_list:
        genre_cols[genre] = [int(genre in liked) for liked in liked_per_user]
    return genre_cols

#Vectorise features of a single user
def vectorize_single(user):
    """Label-encode every column of one user's row.

    For each column of ``user_cols`` the shared ``encoder`` is fitted on the
    whole column, saved to ``<column>.pkl`` in the working directory, and then
    used to transform the selected user's value(s).

    Parameters
    ----------
    user
        Value compared against ``user_cols["username"]`` to pick the row(s).

    Returns
    -------
    pandas.DataFrame
        Copy of the matching row(s) with every column replaced by its
        encoded value.
    """
    is_target = user_cols["username"] == user
    user_vector = user_cols.loc[is_target].copy()

    for column_name in user_cols.columns:
        # Fit on the complete column of the original frame, not just this row.
        fitted = encoder.fit(user_cols[column_name])
        # Save the encoder to file so it can be reloaded later.
        dump(fitted, f"{column_name}.pkl")
        # Encode this column of the target user.
        user_vector[column_name] = fitted.transform(user_vector[column_name])

    return user_vector

#Vectorise features of all users in the dataframe
def vectorize_all():
    """Label-encode every column of ``user_cols`` for all users at once.

    Each column is fitted exactly once, its encoder is saved to
    ``<column>.pkl`` in the working directory, and the whole column is
    transformed in a single call, instead of re-fitting and re-saving every
    encoder once per user.

    Returns
    -------
    pandas.DataFrame
        Copy of ``user_cols`` in which every column except ``username`` holds
        the encoded values. ``username`` stays in its original string form so
        it can serve as the identifier.
    """
    all_users = user_cols.copy()

    for col in user_cols.columns:
        column_encoder = LabelEncoder().fit(user_cols[col])
        # An encoder is saved for every column, username included.
        dump(column_encoder, f"{col}.pkl")
        if col != "username":
            all_users[col] = column_encoder.transform(user_cols[col])

    return all_users

#Construct final vector of encoded features
def construct_vectors():
    """Join the label-encoded profile columns with the genre indicator columns.

    Calls ``vectorize_all()`` first and ``genre_columns()`` second, then merges
    the two frames on ``username``.

    Returns
    -------
    pandas.DataFrame
        One merged frame containing both sets of features.
    """
    encoded_profile = vectorize_all()
    genre_flags = genre_columns()
    return encoded_profile.merge(genre_flags, on="username")

#result = construct_vectors()
#result.to_csv(r"D:\dataset\encoding\user_vectors.csv", index=False)
#genre_columns()
#vectorize_all()
#user_cols
# TODO: translate the genres that users are asked about into the anime genre names


In [None]:
pd.set_option('display.max_columns', 500)
# `result` is only assigned by the construct_vectors() call that is commented
# out in the cell above, so build it here to keep this cell runnable on a
# fresh kernel.
result = construct_vectors()
result.head()


In [24]:
# Reload the encoder stored in fav_anime_period.pkl and look up the integer
# code it assigns to one label.
test = load("fav_anime_period.pkl")
(x,) = test.transform(["New-Gen"])
x

ValueError: y contains previously unseen labels: 'New-Genn'

In [10]:
# Spot-check: label-encode the row of one user ("badking95") and display it.
# As a side effect vectorize_single() re-fits every column encoder and
# re-saves it as <column>.pkl.
vectorize_single("badking95")


Unnamed: 0,username,experience,gender,generation,fav_anime_period
9301,6443,3,1,1,1


In [None]:

# Columns of the anime frame selected as per-anime features.
anime_vector = anime.loc[
    :,
    ["airing", "studio", "genre", "time_period", "fame"],
]
anime_vector
# Commented-out sketch (genre columns for anime), kept as-is for reference:
#anime
#anime_genres = users.loc[:, "genre"]
#genre_list = build_genre_list(anime_genres)
#for i in genre_list:
            #user_cols[i] = 0

In [None]:
rf = pd.read_csv(r"D:\dataset\encoding\collab_frame.csv")

# Keep only the ratings whose author also appears in the users dataframe.
known_user = rf["username"].isin(users["username"])
full_ratings = rf[known_user]

print(len(full_ratings))

Build user-item matrix with ratings (collaborative filtering)

In [None]:

# Genres are not used here because this matrix should focus on ratings.

# Scaffold with one column per user. Assigning the anime_id Series to the
# empty frame gives it one row per entry of `anime`; anime_id is then moved
# to the front.
anime_matrix = pd.DataFrame(columns=users["username"].values)
anime_matrix["anime_id"] = anime["anime_id"]
anime_matrix.insert(0, "anime_id", anime_matrix.pop("anime_id"))

# Same shape as the scaffold, every cell set to 0 by default.
rating_matrix = pd.DataFrame(0, columns=anime_matrix.columns, index=anime_matrix.index)
rating_matrix["anime_id"] = anime["anime_id"]

# Only keep anime that also appear in the full_ratings table. The .copy()
# makes the result an independent frame, so the assignments below do not
# write into a slice of the unfiltered matrix.
rating_matrix = rating_matrix[rating_matrix["anime_id"].isin(full_ratings["anime_id"])].copy()

# Group the ratings once instead of rebuilding the groupby for every show.
ratings_by_show = full_ratings.groupby("anime_id")
for show_id in rating_matrix["anime_id"].unique():
    anime_ratings = ratings_by_show.get_group(show_id)
    usernames = list(anime_ratings["username"])
    scores = list(anime_ratings["my_score"])
    # Raw scores are stored. To test normalised ratings instead, assign
    # np.subtract(scores, list(anime_ratings["mean_score"])) here.
    rating_matrix.loc[rating_matrix["anime_id"] == show_id, usernames] = scores

# Extra features to be used in combination with the ratings for each show.
extra_features = anime.loc[:, ["anime_id", "type", "studio", "time_period", "fame"]]
rating_matrix = rating_matrix.merge(extra_features, on="anime_id")

# Label encode the extra features (every column after anime_id).
collab_encoder = LabelEncoder()
for col in extra_features.columns[1:]:
    rating_matrix[col] = collab_encoder.fit_transform(rating_matrix[col])

rating_matrix.to_csv(r"D:\dataset\encoding\collab_scores.csv", index = False)
rating_matrix
#extra_features
