In [32]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

import import_ipynb #requires pip install

Import the new datasets (users, anime and ratings)

In [33]:
# Directory that holds the three CSV inputs; edit this single line to relocate the data.
DATA_DIR = Path(r"D:\dataset\encoding")

# Load the user, anime and ratings tables from DATA_DIR.
users = pd.read_csv(DATA_DIR / "user_frame.csv")
anime = pd.read_csv(DATA_DIR / "anime_frame.csv")
ratings = pd.read_csv(DATA_DIR / "ratings_frame.csv")

def string_to_list(genres):
    """Turn the string form of a "genres" cell back into a Python list.

    The text is parsed with ``ast.literal_eval`` rather than ``eval``, so the
    CSV content can only ever produce Python literals (lists, strings,
    numbers, ...) and is never executed as code.

    Args:
        genres: Text of a Python list literal, e.g. ``"['Comedy', 'Drama']"``.

    Returns:
        The parsed literal (a list for list-shaped input).

    Raises:
        ValueError, SyntaxError: If ``genres`` is not a valid Python literal.
    """
    return ast.literal_eval(genres)

# Parse the stringified genre lists of each frame back into real lists.
for _frame, _column in (
    (users, "fav_genres"),
    (anime, "genre"),
    (ratings, "genre"),
):
    _frame[_column] = _frame[_column].apply(string_to_list)

In [34]:
#Build genre list
def split(genres):
    """Return the comma-separated pieces of ``genres``.

    The value is converted with ``str()`` first, so non-string input is
    split on its string representation. Pieces are not stripped of
    surrounding whitespace.
    """
    text = str(genres)
    pieces = text.split(",")
    return pieces

def build_genre_list(dataframe):
    """Collect the distinct genre names found in an iterable of genre lists.

    Spaces are removed from every name (``"Slice of Life"`` becomes
    ``"SliceofLife"``) and the literal placeholder ``"nan"`` is dropped.
    Entries that are not strings (for example a float NaN) are skipped
    instead of raising.

    Args:
        dataframe: Iterable whose items are themselves iterables of genre
            names, e.g. the ``fav_genres`` column after it has been parsed
            into lists.

    Returns:
        A list of unique, space-free genre names in sorted order, so the
        result is the same on every run.
    """
    genre_names = {
        genre.replace(" ", "")
        for genres in dataframe
        for genre in genres
        # The "nan" test is done on the raw entry, before spaces are removed.
        if isinstance(genre, str) and genre != "nan"
    }
    return sorted(genre_names)

In [35]:


# Columns of `users` that take part in the label encoding.
user_cols = users.loc[
    :,
    ["username", "experience", "gender", "generation", "fav_anime_period"],
]

# Shared encoder instance; it is re-fitted for every column that gets encoded.
encoder = LabelEncoder()

def genre_columns():
    """Work-in-progress hook for adding per-genre columns to ``user_cols``.

    At the moment it only collects the distinct genre names found in
    ``users["fav_genres"]`` and returns ``user_cols`` unchanged; the step
    that would add one column per genre is still disabled below.

    Returns:
        The module-level ``user_cols`` DataFrame (the same object, not a copy).
    """
    user_genres = users.loc[:, "fav_genres"]
    genre_list = build_genre_list(user_genres)
    # TODO: enable once the genre encoding is settled (one column per genre):
    #for i in genre_list:
        #user_cols[i] = 0
    return user_cols

#save encoded results to new file to make it faster
def vectorize_single(user):
    """Label-encode the row(s) of ``user_cols`` that belong to one username.

    Every column of ``user_cols`` is encoded, including ``username`` itself.
    The module-level ``encoder`` is re-fitted on each column in turn.

    Args:
        user: Username to look up in ``user_cols["username"]``.

    Returns:
        A copy of the matching row(s) with every column replaced by its
        integer label code.
    """
    is_target = user_cols["username"] == user
    encoded_row = user_cols.loc[is_target].copy()

    for column in user_cols.columns:
        # Fit on the complete column (not only this user's row) so that every
        # value present in user_cols has a code.
        fitted = encoder.fit(user_cols[column])
        encoded_row[column] = fitted.transform(encoded_row[column])

    return encoded_row

def vectorize_multiple():
    """Label-encode every user in ``user_cols``.

    Each feature column is fitted and transformed once over the whole table.
    The previous approach encoded one user at a time, which re-fitted the
    encoder on every full column for every user and wrote the rows back one
    by one through a boolean mask.

    The ``username`` column is left as the original string so it can serve
    as the row identifier.

    Returns:
        A copy of ``user_cols`` in which every column except ``username``
        holds integer label codes.
    """
    all_users = user_cols.copy()

    for col in user_cols.columns:
        if col == "username":
            # Keep the username in string form; it identifies the row.
            continue
        # Same fit-on-full-column scheme as vectorize_single, done once per column.
        all_users[col] = encoder.fit_transform(user_cols[col])

    # TODO: K-Nearest Neighbours on the encoded frame, e.g. KNeighborsClassifier(n_neighbors=5)
    return all_users
#similar_users("karthiga")
#genre_columns()
# Display the label-encoded table for all users.
vectorize_multiple()
#user_cols
# Plan for encoding users:
# 1. Encode fav_genres as a bag of words: use a term-frequency vector for each user.
# 2. Encode the other features.
# 3. Join the encoded genres with the rest of the vectorised features.
# TODO: the genres a user is asked about still need to be translated to the anime genre names.


Unnamed: 0,username,experience,gender,generation,fav_anime_period
0,karthiga,0,0,2,0
1,Damonashu,0,1,2,0
2,bskai,3,1,2,0
3,Slimak,3,1,2,0
4,MistButterfly,3,0,2,1
...,...,...,...,...,...
9297,purgatorio,3,1,2,1
9298,Kagekori-chan,3,0,1,1
9299,Sicka,3,1,2,0
9300,winry-chan,2,0,2,0


In [36]:
# Encode a single user's row; note that the username column is label-encoded as well.
vectorize_single("badking95")
#print(user_genres)

Unnamed: 0,username,experience,gender,generation,fav_anime_period
9301,6443,3,1,1,1


In [37]:

# Anime attributes selected into `anime_vector`.
anime_vector = anime.loc[
    :,
    ["airing", "studio", "genre", "time_period", "fame"],
]
anime_vector
#anime

Unnamed: 0,airing,studio,genre,time_period,fame
0,False,David Production,"[Comedy, Supernatural, Romance, Shounen]",New-Gen,famous
1,False,Gonzo,"[Comedy, Parody, Romance, School, Shounen]",Classic,semi-famous
2,False,Satelight,"[Comedy, Magic, School, Shoujo]",Classic,well-known
3,False,Hal Film Maker,"[Comedy, Drama, Magic, Romance, Fantasy]",Classic,semi-famous
4,False,J.C.Staff,"[Comedy, Drama, Romance, Shounen]",New-Gen,semi-famous
...,...,...,...,...,...
3502,False,Millepensee,"[Music, Drama]",New-Gen,niche
3503,False,Production I.G,"[Comedy, Sports, School, Shounen]",New-Gen,famous
3504,False,"Satelight, Production Reed","[Action, Mecha, Romance, Super Power, Supe...",Classic,well-known
3505,False,Toei Animation,"[Adventure, Fantasy]",Classic,unknown


In [38]:

# Display the ratings frame.
ratings

Unnamed: 0,username,anime_id,title,genre,type,my_score,my_watched_episodes,aired_from_year,time_period
0,--FallenAngel--,164,Mononoke Hime,"[Action, Adventure, Fantasy]",Movie,10,1,1997,Classic
1,--FallenAngel--,5341,Ookami to Koushinryou II,"[Adventure, Historical, Romance, Fantasy]",TV,10,12,2009,Classic
2,--FallenAngel--,877,Nana,"[Music, Slice of Life, Comedy, Drama, Roma...",TV,10,47,2006,Classic
3,--FallenAngel--,3731,Itazura na Kiss,"[Comedy, Romance, Shoujo]",TV,10,25,2008,Classic
4,--FallenAngel--,1914,Saiunkoku Monogatari 2nd Season,"[Adventure, Comedy, Drama, Fantasy, Histor...",TV,10,39,2007,Classic
...,...,...,...,...,...,...,...,...,...
92927,zzs,4898,Kuroshitsuji,"[Action, Comedy, Demons, Fantasy, Historic...",TV,8,24,2008,Classic
92928,zzs,60,Chrno Crusade,"[Action, Historical, Demons, Supernatural, ...",TV,8,24,2003,Classic
92929,zzs,167,Scrapped Princess,"[Adventure, Comedy, Drama, Fantasy, Mecha,...",TV,8,24,2003,Classic
92930,zzs,1974,Glass no Kamen (2005),"[Drama, Shoujo]",TV,8,51,2005,Classic
