In [32]:
import numpy as np
import pandas as pd
#import scipy
from datetime import date, datetime

In [33]:

users = pd.read_csv(r"D:\dataset\users_cleaned.csv")
anime = pd.read_csv(r"D:\dataset\anime_cleaned.csv")
ratings = pd.read_csv(r"D:\dataset\animelists_cleaned.csv", nrows = 4000000)
af = pd.DataFrame(anime)
rf = pd.DataFrame(ratings)
uf = pd.DataFrame(users)

Preparing dataframes so that data is consistent

In [34]:
af = af.loc[af["type"].isin(["TV","Movie"]) & (~af["rating"].isin(["Rx - Hentai", "R+ - Mild Nudity"]))] #only include TV shows and Movies and remove nsfw shows
uf = users.loc[:, ["user_id", "username", "user_completed", "stats_mean_score", "user_days_spent_watching", "gender", "birth_date", "stats_episodes"]]
uf["birth_date"] = pd.to_datetime(uf["birth_date"])
print(len(af))
#Only include users who have completed at least 5 anime
uf = uf[uf["user_completed"] >= 5]

#print(len(rf.groupby("username")))
rf = rf[rf["username"].isin(uf["username"])] #ensure that only ratings from users that are in the users dataframe are included
print(len(uf))
uf = uf[uf["username"].isin(rf["username"])] #ensure that only users that have ratings in the ratings dataframe are included
print(len(uf))

3613
104869
9302


Creating Anime Features

In [35]:

def split(genres):
    return str(genres).split(",")

#Building a dictionary of all genres in a dataframe
def build_genre_list(dataframe):
    split_genres = dataframe["genre"].apply(split)
    #Updating the set to get the individual genre names in the set instead of the original list of genres
    genre_set = set()
    for i in split_genres: 
        genre_set.update(i)
    result = list(genre_set)

    #Removing blank spaces
    final_result = []
    for i in result:
        if i!="nan":
            new = i.replace(" ", "")
            final_result.append(new)

    #Removing duplicates now that spaces have been removed
    genre_list = list(set(final_result))
    print(genre_list)

#user_genres = build_genre_list(af)

#CREATING time_period FEATURE (Anime)

#new gen = >2010 
#if anime is currently airing new_gen 
# if an anime is complete, if it aired before 2010 its old gen
#Filtering only relevant columns
av = af.loc[:, ["anime_id", "title", "type", "airing", "scored_by", "members", "studio", "genre", "aired_from_year"]]
av["genre"] = av["genre"].apply(split) #splitting genre column so it can be used as a bag of words
av = av.loc[av["scored_by"]>=100]
av["aired_from_year"] = av["aired_from_year"].astype("int")

cond = [
    (av["airing"] == True),
    (av["aired_from_year"] >= 2010) & (av["airing"]==False),
    (av["aired_from_year"] < 2010) & (av["airing"]==False)
]
vals = [
    "New-Gen",
    "New-Gen",
    "Classic"
]

av["time_period"] = np.select(cond, vals, default=av["aired_from_year"])
print(len(av))
  
#CREATING "FAME" FEATURE
stats = av.members.quantile([0.25, 0.5, 0.75, 0.9])
print(stats)

members = [
    (av["members"] >= 0) & (av["members"]<=6800),
    (av["members"] > 6800) & (av["members"]<=27000),
    (av["members"] > 27000) & (av["members"]<=89000),
    (av["members"] > 89000) & (av["members"]<=211000),
    (av["members"] > 210000)
]
member_num = [
    "unknown",
    "niche",
    "well-known",
    "semi-famous",
    "famous"
]
av["fame"] = np.select(members, member_num, default=av["members"])
av.to_csv(r"D:\dataset\encoding\anime_frame.csv", index=False)
av

#remember to split the data
#filter type to TV only

3240
0.25      6275.25
0.50     25396.00
0.75     84522.75
0.90    203026.40
Name: members, dtype: float64


Unnamed: 0,anime_id,title,type,airing,scored_by,members,studio,genre,aired_from_year,time_period,fame
0,11013,Inu x Boku SS,TV,False,139250,283882,David Production,"[Comedy, Supernatural, Romance, Shounen]",2012,New-Gen,famous
1,2104,Seto no Hanayome,TV,False,91206,204003,Gonzo,"[Comedy, Parody, Romance, School, Shounen]",2007,Classic,semi-famous
2,5262,Shugo Chara!! Doki,TV,False,37129,70127,Satelight,"[Comedy, Magic, School, Shoujo]",2008,Classic,well-known
3,721,Princess Tutu,TV,False,36501,93312,Hal Film Maker,"[Comedy, Drama, Magic, Romance, Fantasy]",2002,Classic,semi-famous
4,12365,Bakuman. 3rd Season,TV,False,107767,182765,J.C.Staff,"[Comedy, Drama, Romance, Shounen]",2012,New-Gen,semi-famous
...,...,...,...,...,...,...,...,...,...,...,...
6612,34522,"Wake Up, Girls! Shin Shou",TV,False,1848,7772,Millepensee,"[Music, Drama]",2017,New-Gen,niche
6613,24415,Kuroko no Basket 3rd Season,TV,False,174065,287422,Production I.G,"[Comedy, Sports, School, Shounen]",2015,New-Gen,famous
6614,478,Sousei no Aquarion,TV,False,23090,55092,"Satelight, Production Reed","[Action, Mecha, Romance, Super Power, Supe...",2005,Classic,well-known
6618,4948,Shounen Sarutobi Sasuke,Movie,False,314,815,Toei Animation,"[Adventure, Fantasy]",1959,Classic,unknown


Creating Additional User Features

In [36]:
#CREATE "GENERATION" FEATURE
#Convert birth years to generations
uf["birth_date"] = pd.to_datetime(uf["birth_date"])
def get_birth_year(date):
    birth_year = date.strftime("%Y")
    return int(birth_year)

uf["birth_date"] = uf["birth_date"].apply(get_birth_year)

#Assign generations based on birth year
generations = [
    (uf["birth_date"] >= 2013) & (uf["birth_date"] <= 2025),
    (uf["birth_date"] >= 1995) & (uf["birth_date"] <= 2012),
    (uf["birth_date"] >= 1980) & (uf["birth_date"] <= 1994),
    (uf["birth_date"] >= 1965) & (uf["birth_date"] <= 1979)
]

gen_values = [
    "Gen Alpha",
    "Gen Z",
    "Millenials",
    "Gen X"
]
#uf = uf.rename(columns={"birth_date":"generation"})
uf["generation"] = np.select(generations, gen_values, default=uf["birth_date"])

#CREATE "EXPERIENCE" FEATURE
#Cast days_watched to integers so that users are not missed in conditions below
uf["user_days_spent_watching"] = uf["user_days_spent_watching"].astype("int")
#Assign experience levels based on days watched
user_exp = [
    (uf["user_days_spent_watching"] >= 0) & (uf["user_days_spent_watching"] <= 5),
    (uf["user_days_spent_watching"] >= 6) & (uf["user_days_spent_watching"] <= 50),
    (uf["user_days_spent_watching"] >= 51) & (uf["user_days_spent_watching"] <= 99),
    (uf["user_days_spent_watching"] >= 100)
]

exp_values = [
    "Newbie",
    "Regular",
    "Active",
    "Veteran"
]

uf["user_days_spent_watching"] = np.select(user_exp, exp_values, default=uf["user_days_spent_watching"])
uf = uf.rename(columns={"user_days_spent_watching":"experience"})

#CREATE FAVOURITE GENRE AND FAVOURITE_TIME_PERIOD FEATURES

#Merge the ratings and anime dataframes
merge_rf = pd.merge(rf, av, on="anime_id")

#Remove all anime ratings where the user has watched less than 10 episodes unless it is a movie
movies = merge_rf[merge_rf["type"]=="Movie"]
valid_tv_shows = merge_rf[(merge_rf["type"]=="TV") & (merge_rf["my_watched_episodes"]>=10)]
merge_rf = pd.concat([movies, valid_tv_shows])
collab_frame = merge_rf.copy()

merge_rf = merge_rf.loc[:, ["username", "anime_id","title","genre","type", "my_score","my_watched_episodes", "aired_from_year", "time_period"]]
#Sort by highest rated anime by each user
merge_rf.set_index("username")
merge_rf = merge_rf.groupby("username").apply(lambda i:i.sort_values(by="my_score", ascending=False))
merge_rf.reset_index(drop=True, inplace=True)


#Only include top 10 highest rated anime per user
merge_rf = merge_rf.groupby("username").head(10)

fav_dict = {"username": [], "fav_genres":[]}
fav_time_dict = {"username":[], "fav_anime_period":[]}
for u, df in merge_rf.groupby("username"):
    user_ratings = merge_rf.loc[merge_rf["username"]==u] #top 10 ratings for each user
    user_genres = []
    user_favs = []
    
    time_periods = [] #time_period values for each user
    for i in user_ratings["time_period"]:
        time_periods.append(i)

    #Concatenate genres for each anime into one list
    for i in user_ratings["genre"]:
        i = " ".join(i) #convert list to string and back to list to remove spaces
        x = i.split()
        user_genres = user_genres + x

    final_result = []
    #for i,v in enumerate(user_genres):
        #if v!="nan":
            #new = v.replace(" ", "")#Remove spaces
            #user_genres[i]=new
            
    #Find top 3 favourite most frequent genres in the top 10
    for i in range(0, 3):
        most_popular = max(set(user_genres), key=user_genres.count)
        user_favs.append(most_popular)
        #Remove genre from the list once added to favourites
        for i in range(0, user_genres.count(most_popular)):
            user_genres.remove(most_popular)
    #Add fav_genres to dictionary 
    fav_dict["username"].append(u)
    fav_dict["fav_genres"].append(user_favs)

    #Find most frequent anime time_period in the top 10
    fav_time = max(set(time_periods), key=time_periods.count)
    #Add to the dictionary
    fav_time_dict["username"].append(u)
    fav_time_dict["fav_anime_period"].append(fav_time)
    
  
if ['Action','Comedy', 'Shounen'] in fav_dict["fav_genres"]:
    print(f"Found at position: {fav_dict['fav_genres'].index(['Action','Comedy', 'Shounen'])}, count: {fav_dict['fav_genres'].count(['Action','Comedy', 'Shounen'])}")
else:
    print("Broken")

if ['Action', 'Drama', 'Sci-Fi'] in fav_dict["fav_genres"]:
    print(f"Found at position: {fav_dict['fav_genres'].index(['Action', 'Drama', 'Sci-Fi'])}, count: {fav_dict['fav_genres'].count(['Action', 'Drama', 'Sci-Fi'])}")
else:
    print("Broken")


print(f"num favourites: {len(fav_dict['username'])}", len(merge_rf.groupby("username")))
print(len(uf))
#print(fav_list)

fav_dict = pd.DataFrame(fav_dict)
uf = pd.merge(uf, fav_dict, on="username")

#print(fav_dict.shape, uf.shape)
fav_time_dict = pd.DataFrame(fav_time_dict)
uf = pd.merge(uf, fav_time_dict, on="username")
#fav_time_dict
uf.to_csv(r"D:\dataset\encoding\user_frame.csv",index = False)
merge_rf.to_csv(r"D:\dataset\encoding\ratings_frame.csv", index = False)

#uf
#print(fav_list)
#user_ratings
merge_rf

#user feature: preferred anime length (some anime have incorrectly got 0 episodes)
#  first -> add anime_length feature 
#  change num_episodes to anime length feature
#  then  -> check if user prefers long or short anime

#most popular genres 
#limitation is that existing users' favourite genres are generated by implication compared to 
# new user who is asked directly

#4. see if any shows appear in multiple people's top 10 lists

Index(['username', 'anime_id', 'my_watched_episodes', 'my_start_date',
       'my_finish_date', 'my_score', 'my_status', 'my_rewatching',
       'my_rewatching_ep', 'my_last_updated', 'my_tags', 'title', 'type',
       'airing', 'scored_by', 'members', 'studio', 'genre', 'aired_from_year',
       'time_period', 'fame'],
      dtype='object')
Found at position: 51, count: 225
Found at position: 155, count: 48
num favourites: 9302 9302
9302


Unnamed: 0,username,anime_id,title,genre,type,my_score,my_watched_episodes,aired_from_year,time_period
0,--FallenAngel--,164,Mononoke Hime,"[Action, Adventure, Fantasy]",Movie,10,1,1997,Classic
1,--FallenAngel--,5507,Senjou no Valkyria,"[Action, Military, Romance]",TV,10,26,2009,Classic
2,--FallenAngel--,17265,Log Horizon,"[Action, Game, Adventure, Magic, Fantasy]",TV,10,25,2013,New-Gen
3,--FallenAngel--,6547,Angel Beats!,"[Action, Comedy, Drama, School, Supernatural]",TV,10,13,2010,New-Gen
4,--FallenAngel--,10800,Chihayafuru,"[Drama, Game, Josei, School, Slice of Life...",TV,10,25,2011,New-Gen
...,...,...,...,...,...,...,...,...,...
1966010,zzs,60,Chrno Crusade,"[Action, Historical, Demons, Supernatural, ...",TV,8,24,2003,Classic
1966011,zzs,1974,Glass no Kamen (2005),"[Drama, Shoujo]",TV,8,51,2005,Classic
1966012,zzs,269,Bleach,"[Action, Adventure, Comedy, Super Power, S...",TV,8,207,2004,Classic
1966013,zzs,4898,Kuroshitsuji,"[Action, Comedy, Demons, Fantasy, Historic...",TV,8,24,2008,Classic


Collaborative filtering (Ratings)

In [61]:
#Dataframe of ratings that includes all ratings from users instead of just top 10
collab_frame = collab_frame.loc[collab_frame["my_score"]>0, ["username", "anime_id","title","genre","type", "my_score", "my_watched_episodes", "aired_from_year", "time_period"]]  
#stats_mean_score
collab_frame.set_index("username")
collab_frame = collab_frame.groupby("username").apply(lambda i:i.sort_values(by="my_score", ascending=False))

collab_frame.reset_index(drop=True, inplace=True)

for u in collab_frame.groupby("username"):
    mean = float(users.loc[users["username"]==u[0]]["stats_mean_score"])
    #Adding mean score column to ratings dataframe so scores can be normalised
    collab_frame.loc[collab_frame["username"]==u[0], "mean_score"] = mean
    #print(u[1])
collab_frame.to_csv(r"D:\dataset\encoding\collab_frame.csv", index = False)
collab_frame

Unnamed: 0,username,anime_id,title,genre,type,my_score,my_watched_episodes,aired_from_year,time_period,mean_score
0,--FallenAngel--,164,Mononoke Hime,"[Action, Adventure, Fantasy]",Movie,10,1,1997,Classic,8.41
1,--FallenAngel--,5507,Senjou no Valkyria,"[Action, Military, Romance]",TV,10,26,2009,Classic,8.41
2,--FallenAngel--,585,Mimi wo Sumaseba,"[Slice of Life, Drama, Romance, Shoujo]",Movie,10,1,1995,Classic,8.41
3,--FallenAngel--,513,Tenkuu no Shiro Laputa,"[Adventure, Fantasy, Romance, Sci-Fi]",Movie,10,1,1986,Classic,8.41
4,--FallenAngel--,10800,Chihayafuru,"[Drama, Game, Josei, School, Slice of Life...",TV,10,25,2011,New-Gen,8.41
...,...,...,...,...,...,...,...,...,...,...
1626915,zzs,1498,Black Blood Brothers,"[Action, Comedy, Fantasy, Shounen, Superna...",TV,7,12,2006,Classic,7.57
1626916,zzs,1313,Digimon Adventure 02,"[Action, Adventure, Comedy, Drama, Fantasy...",TV,6,50,2000,Classic,7.57
1626917,zzs,1562,Yamato Nadeshiko Shichihenge♥,"[Comedy, Shoujo]",TV,6,25,2006,Classic,7.57
1626918,zzs,1735,Naruto: Shippuuden,"[Action, Adventure, Comedy, Super Power, M...",TV,6,68,2007,Classic,7.57
