In [83]:
import numpy as np
import pandas as pd
#import scipy
from datetime import date, datetime

In [84]:

users = pd.read_csv(r"D:\dataset\users_cleaned.csv")
anime = pd.read_csv(r"D:\dataset\anime_cleaned.csv")
ratings = pd.read_csv(r"D:\dataset\animelists_cleaned.csv", nrows = 4000000)
af = pd.DataFrame(anime)
rf = pd.DataFrame(ratings)
uf = pd.DataFrame(users)

Preparing dataframes so that data is consistent

In [85]:
af = af.loc[af["type"].isin(["TV","Movie"]) & (af["rating"]!="Rx - Hentai")] #only include TV shows and Movies and remove nsfw shows
uf = users.loc[:, ["user_id", "username", "user_completed", "user_days_spent_watching", "gender", "birth_date", "stats_episodes"]]
uf["birth_date"] = pd.to_datetime(uf["birth_date"])
print(len(af))
#Only include users who have completed at least 5 anime
uf = uf[uf["user_completed"] >= 5]

#print(len(rf.groupby("username")))
rf = rf[rf["username"].isin(uf["username"])] #ensure that only ratings from users that are in the users dataframe are included
#print(len(rf.groupby("username")))
uf = uf[uf["username"].isin(rf["username"])] #ensure that only users that have ratings in the ratings dataframe are included
#print(len(uf))

3887


Creating Additional User Features

In [86]:
#CREATE "GENERATION" FEATURE
#Convert birth years to generations
uf["birth_date"] = pd.to_datetime(uf["birth_date"])
def get_birth_year(date):
    birth_year = date.strftime("%Y")
    return int(birth_year)

uf["birth_date"] = uf["birth_date"].apply(get_birth_year)

#Assign generations based on birth year
generations = [
    (uf["birth_date"] >= 2013) & (uf["birth_date"] <= 2025),
    (uf["birth_date"] >= 1995) & (uf["birth_date"] <= 2012),
    (uf["birth_date"] >= 1980) & (uf["birth_date"] <= 1994),
    (uf["birth_date"] >= 1965) & (uf["birth_date"] <= 1979)
]

gen_values = [
    "Gen Alpha",
    "Gen Z",
    "Millenials",
    "Gen X"
]
uf = uf.rename(columns={"birth_date":"generation"})
uf["generation"] = np.select(generations, gen_values, default=uf["generation"])

#CREATE "EXPERIENCE" FEATURE
#Cast days_watched to integers so that users are not missed in conditions below
uf["user_days_spent_watching"] = uf["user_days_spent_watching"].astype("int")
#Assign experience levels based on days watched
user_exp = [
    (uf["user_days_spent_watching"] >= 0) & (uf["user_days_spent_watching"] <= 5),
    (uf["user_days_spent_watching"] >= 6) & (uf["user_days_spent_watching"] <= 50),
    (uf["user_days_spent_watching"] >= 51) & (uf["user_days_spent_watching"] <= 99),
    (uf["user_days_spent_watching"] >= 100)
]

exp_values = [
    "Newbie",
    "Regular",
    "Active",
    "Veteran"
]

uf["user_days_spent_watching"] = np.select(user_exp, exp_values, default=uf["user_days_spent_watching"])
uf = uf.rename(columns={"user_days_spent_watching":"experience"})

#CREATE FAVOURITE GENRE FEATURE

#Merge the ratings and anime dataframes
merge_rf = pd.merge(rf, af, on="anime_id")
merge_rf = merge_rf.loc[:, ["username", "anime_id","title","genre","type", "my_score","my_watched_episodes", "episodes"]]

#Remove all anime ratings where the user has watched less than 10 episodes unless it is a movie
movies = merge_rf[merge_rf["type"]=="Movie"]
valid_tv_shows = merge_rf[(merge_rf["type"]=="TV") & (merge_rf["my_watched_episodes"]>=10)]
merge_rf = pd.concat([movies, valid_tv_shows])

#Sort by highest rated anime by each user
merge_rf.set_index("username")
merge_rf = merge_rf.groupby("username").apply(lambda i:i.sort_values(by="my_score", ascending=False))
merge_rf.reset_index(drop=True, inplace=True)

#Only include top 10 highest rated anime per user
merge_rf = merge_rf.groupby("username").head(10)

#def single_user():     
# 
#     karthiga = ['Comedy', 'Shounen', 'Action']
#     Damonashu = ['Drama', 'Action', 'Sci-Fi']
#     bskai = ['Comedy', 'Romance', 'School']                                                        
#    #Example user/ must make work for all users
#    user = "bskai"
#    user_ratings = merge_rf.loc[merge_rf["username"]==user]
#    user_genres = []
    #Creating a list all genres in the top 10 for each user (including duplicates)
#    fav_list = [] #list of most popular genres for each user
#    for i in user_ratings["genre"]:
#            x = i.split()
#            user_genres = user_genres + x

#    final_result = []
#    for i,v in enumerate(user_genres):
#        if v!="nan":
#            new = v.replace(" ", "")
#            new = v.replace(",", "")
#            user_genres[i]=new
            

    #Find top 3 favourite most frequent genres in the top 10
#    for i in range(0, 3):
#        user_favs = []
#        most_popular = max(set(user_genres), key=user_genres.count)
#        user_favs.append(most_popular)
        #Remove genre from the list once added to favourites
#        for i in range(0, user_genres.count(most_popular)):
#            user_genres.remove(most_popular)

#        fav_list.append(user_favs)

fav_dict = {"username": [], "fav_genres":[]}
for u, df in merge_rf.groupby("username"):
    user_ratings = merge_rf.loc[merge_rf["username"]==u]
    user_genres = []
    user_favs = []
    #user_favs.append(u)
    #Concatenate genres for each anime into one list
    for i in user_ratings["genre"]:
        x = i.split()
        user_genres = user_genres + x

    final_result = []
    for i,v in enumerate(user_genres):
        if v!="nan":
            new = v.replace(" ", "")
            new = v.replace(",", "")
            user_genres[i]=new
            

    #Find top 3 favourite most frequent genres in the top 10
    for i in range(0, 3):
        most_popular = max(set(user_genres), key=user_genres.count)
        user_favs.append(most_popular)
        #Remove genre from the list once added to favourites
        for i in range(0, user_genres.count(most_popular)):
            user_genres.remove(most_popular)
    
    fav_dict["username"].append(u)
    fav_dict["fav_genres"].append(user_favs)

    
    #print(user_ratings)
if ['Action','Comedy', 'Shounen'] in fav_dict["fav_genres"]:
    print(f"Found at position: {fav_dict['fav_genres'].index(['Action','Comedy', 'Shounen'])}, count: {fav_dict['fav_genres'].count(['Action','Comedy', 'Shounen'])}")
else:
    print("Broken")

if ['Action', 'Drama', 'Sci-Fi'] in fav_dict["fav_genres"]:
    print(f"Found at position: {fav_dict['fav_genres'].index(['Action', 'Drama', 'Sci-Fi'])}, count: {fav_dict['fav_genres'].count(['Action', 'Drama', 'Sci-Fi'])}")
else:
    print("Broken")


print(f"num favourites: {len(fav_dict['username'])}", len(merge_rf.groupby("username")))
print(len(uf))
#print(fav_list)

fav_dict = pd.DataFrame(fav_dict)
uf = pd.merge(uf, fav_dict, on="username")

#print(fav_dict.shape, uf.shape)
#fav_dict
uf
#print(fav_list)
#user_ratings

#merge_rf

#user feature: preferred anime length (some anime have incorrectly got 0 episodes)
#  first -> add anime_length feature 
#  change num_episodes to anime length feature
#  then  -> check if user prefers long or short anime

#add new-gen/classic feature to anime
#check if users prefer new-gen/classic
#make sure that frequency difference between new and old is above a certain amount, otherwise put "both" as value

#user_id  |  fav_genre | preferred anime length | new-gen/classic
#account for number of episodes watched

#get most popular genres 
#create the feature
#limitation is that existing users' favourite genres are generated by implication compared to 
# new user who is asked directly

#anime feature: popularity feature

#4. see if any shows appear in multiple people's top 10 lists
#Train set and test set
#Then use lightfm in a new version
# encoding users: 
# 1. encode fav_genres as bag of words
# 2. encode other features
# 3. join encoded genres with the rest of the vectorised features
#need to translate genres that user is asked to anime genres


Found at position: 36, count: 119
Found at position: 87, count: 58
num favourites: 9302 9302
9302


Unnamed: 0,user_id,username,user_completed,experience,gender,generation,stats_episodes,fav_genres
0,2255153,karthiga,49,Active,Female,Millenials,3391,"[Comedy, Shounen, Action]"
1,37326,Damonashu,195,Active,Male,Millenials,4903,"[Drama, Action, Sci-Fi]"
2,228342,bskai,414,Veteran,Male,Millenials,9701,"[Comedy, Romance, School]"
3,61677,Slimak,224,Veteran,Male,Millenials,7447,"[Action, Power, Super]"
4,2485327,MistButterfly,3923,Veteran,Female,Millenials,39703,"[Drama, Shounen, Comedy]"
...,...,...,...,...,...,...,...,...
9297,363965,purgatorio,727,Veteran,Male,Millenials,10701,"[Comedy, Ecchi, Action]"
9298,5180847,Kagekori-chan,497,Veteran,Female,Gen Z,11192,"[Action, Supernatural, Comedy]"
9299,332815,Sicka,1912,Veteran,Male,Millenials,13168,"[Action, Adventure, Comedy]"
9300,25554,winry-chan,7,Regular,Female,Millenials,662,"[Comedy, Shounen, Adventure]"


In [87]:
#CREATING time_period feature

#new gen = >2010 
#if anime is currently airing new_gen 
# if an anime is complete, if it aired before 2010 its old gen


#Building a dictionary of all genres in a dataframe
def build_genre_list(dataframe):

    def split(genres):
        return str(genres).split(",")

    split_genres = dataframe["genre"].apply(split)
    #Updating the set to get the individual genre names in the set instead of the original list of genres
    genre_set = set()
    for i in split_genres: 
        genre_set.update(i)
    result = list(genre_set)

    #Removing blank spaces
    final_result = []
    for i in result:
        if i!="nan":
            new = i.replace(" ", "")
            final_result.append(new)

    #Removing duplicates now that spaces have been removed
    genre_list = list(set(final_result))
    print(genre_list)

#user_genres = build_genre_list(af)

#CREATING time_period FEATURE (Anime)

#Filtering only relevant columns
av = af.loc[:, ["anime_id", "title", "type", "episodes", "airing", "scored_by", "popularity", "studio", "genre", "aired_from_year"]]
av = av.loc[av["scored_by"]>=100]
av["aired_from_year"] = av["aired_from_year"].astype("int")

cond = [
    (av["airing"] == True),
    (av["aired_from_year"] >= 2010) & (av["airing"]==False),
    (av["aired_from_year"] < 2010) & (av["airing"]==False)
]
vals = [
    "New-Gen",
    "New-Gen",
    "Classic"
]
#new column
av["time_period"] = np.select(cond, vals, default=av["aired_from_year"])
print(len(av))
av

#label encoding for normal fields
#one-hot encoding (binary) for genres

#Go through anime_cleaned file
#Merge this file with the users file
#Find all the users that rated each anime
#Find what those users rated those anime and which groups those users belong to
# So you know users from group X liked anime Y "this" much
#Genre: Make each genre its own binary column

#consider number of ratings anime has
#Exclude "hentai genre"
#remember to split the data
#all "currently running" anime show up as having 0 episodes
#filter type to TV only
#give user ability to choose long or short anime

3507


Unnamed: 0,anime_id,title,type,episodes,airing,scored_by,popularity,studio,genre,aired_from_year,time_period
0,11013,Inu x Boku SS,TV,12,False,139250,231,David Production,"Comedy, Supernatural, Romance, Shounen",2012,New-Gen
1,2104,Seto no Hanayome,TV,26,False,91206,366,Gonzo,"Comedy, Parody, Romance, School, Shounen",2007,Classic
2,5262,Shugo Chara!! Doki,TV,51,False,37129,1173,Satelight,"Comedy, Magic, School, Shoujo",2008,Classic
3,721,Princess Tutu,TV,38,False,36501,916,Hal Film Maker,"Comedy, Drama, Magic, Romance, Fantasy",2002,Classic
4,12365,Bakuman. 3rd Season,TV,25,False,107767,426,J.C.Staff,"Comedy, Drama, Romance, Shounen",2012,New-Gen
...,...,...,...,...,...,...,...,...,...,...,...
6612,34522,"Wake Up, Girls! Shin Shou",TV,12,False,1848,4088,Millepensee,"Music, Drama",2017,New-Gen
6613,24415,Kuroko no Basket 3rd Season,TV,25,False,174065,226,Production I.G,"Comedy, Sports, School, Shounen",2015,New-Gen
6614,478,Sousei no Aquarion,TV,26,False,23090,1440,"Satelight, Production Reed","Action, Mecha, Romance, Super Power, Supernatu...",2005,Classic
6618,4948,Shounen Sarutobi Sasuke,Movie,1,False,314,8659,Toei Animation,"Adventure, Fantasy",1959,Classic
