In [28]:
import pandas as pd 
import numpy as np 
import string
from nltk import word_tokenize, pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet

In [29]:
df = pd.read_csv("../data/dirty_data.csv").dropna().rename(columns = {"sypnopsis" : "synopsis"})
df = df.rename(columns = {col_name : col_name.lower() for col_name in df.columns})


df["score"] = [score if score != "Unknown" else 0.00 for score in df["score"]]
df["score"] = df["score"].astype(np.float64)

df.head(n=10)



Unnamed: 0,mal_id,name,score,genres,synopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...
5,15,Eyeshield 21,7.95,"Action, Sports, Comedy, Shounen",Sena is like any other shy kid starting high s...
6,16,Hachimitsu to Clover,8.06,"Comedy, Drama, Josei, Romance, Slice of Life","Yuuta Takemoto, a sophomore at an arts college..."
7,17,Hungry Heart: Wild Striker,7.59,"Slice of Life, Comedy, Sports, Shounen",Kyosuke Kano has lived under the shadow of his...
8,18,Initial D Fourth Stage,8.15,"Action, Cars, Sports, Drama, Seinen",Takumi Fujiwara finally joins Ryousuke and Kei...
9,19,Monster,8.76,"Drama, Horror, Mystery, Police, Psychological,...","Dr. Kenzou Tenma, an elite neurosurgeon recent..."


In [30]:
stopping_words = stopwords.words("english")

lemmatizer = WordNetLemmatizer()

POS_TAG_MAP = {
    "N" : wordnet.NOUN,
    "V" : wordnet.VERB,
    "R" : wordnet.ADV,
    "J" : wordnet.ADJ
}

In [34]:
def normalize_synopsis(synopsis : str):
    synopsis = synopsis.lower()
    raw_tokens = word_tokenize(synopsis)
    clean_tokens = []

    for token in raw_tokens:
        
        # strip the token, in case it has leading or trailing spaces
        token = token.strip()

        # replace all english punctutation with an empty string, to improve the amount of tokens being real words
        for punct in string.punctuation:
            token = token.replace(punct, "")

        if token and token not in stopping_words:
            clean_tokens.append(token)

    final_tokens = set()
    for word, tag in pos_tag(clean_tokens):
        tag_type = POS_TAG_MAP.get(tag[0], "n")
        lemmatized_token = lemmatizer.lemmatize(word, tag_type)
        final_tokens.add(lemmatized_token)

    return " ".join(final_tokens)
    

df["normalized_synopsis"] = df.synopsis.apply(normalize_synopsis)

df

Unnamed: 0,mal_id,name,score,genres,synopsis,normalized_synopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever...",lighthearted mellow improvise western dark spi...
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ...",routine unlucky closely spike woolong crew due...
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...",waste meryl rumor villain eye oppose follow th...
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,place user power arrive recently first name on...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,end five hunt defeat zenon power carry squad b...
...,...,...,...,...,...,...
16209,48481,Daomu Biji Zhi Qinling Shen Shu,0.00,"Adventure, Mystery, Supernatural",No synopsis information has been added to this...,help information title add synopsis database i...
16210,48483,Mieruko-chan,0.00,"Comedy, Horror, Supernatural",ko is a typical high school student whose life...,paranormal girl carry student deal endure stor...
16211,48488,Higurashi no Naku Koro ni Sotsu,0.00,"Mystery, Dementia, Horror, Psychological, Supe...",Sequel to Higurashi no Naku Koro ni Gou .,naku ni gou higurashi koro sequel
16212,48491,Yama no Susume: Next Summit,0.00,"Adventure, Slice of Life, Comedy",New Yama no Susume anime.,susume new yama anime


In [36]:
sample_descs = np.array(df.normalized_synopsis)[:5]

for sample_desc in sample_descs: print(f"{sample_desc} \n")

lighthearted mellow improvise western dark spike genetically crew outlaw name homage pragmatic whiz leave comedy smooth hunter loss memory space individual highly attempt galaxy corgi unravel meet colonize refer wong—the thrill peace high solar faye surface fatale part team aid live inter enigmatic new moon ragtag wellbalanced police density music 2071 black mysterious adventure kid edward jet make collect thrown bounty classic system two intelligent strange course welsh reward carefree embark travels—ein bebop computer humanity boisterous pair addition planet several action partner valentine earth keep engineer aboard cowboy chase uninhabitable year spiegel femme spaceship balance trickster little past member behind 

routine unlucky closely spike woolong crew due follow investigation spring alba lead hunter whopping across individual mass stake bounty—such interrupt head money price scheme faye team casualty realize mar pharmaceutical perpetrator blast ragtag disease involve split pl

In [50]:

word_synonym_corpus = {}

for syn in df.normalized_synopsis:
    for token in syn.split(" "):
        if token not in word_synonym_corpus:
            syns = wordnet.synsets(token)
            word_syns = set()

            for syn in syns:
                for l in syn.lemmas():
                    word_syns.add(l.name())

            word_synonym_corpus[token] = list(word_syns)



In [51]:
import json

with open("../data/synonym_corpus.json", "w") as file:
    file.write(json.dumps(word_synonym_corpus))

In [60]:
def find_most_similar_anime_by_keywords(user_description, synonyms_corpus, target_df, num_recommendations):  

    cleaned_user_description = normalize_synopsis(user_description)
    cleaned_user_tokens = set(cleaned_user_description.split(" "))

    def similarity_score(synopsis : str):
        synopsis_tokens = set(synopsis.split(" "))
        shared = cleaned_user_tokens.intersection(synopsis_tokens)
        user_not_shared = cleaned_user_tokens.difference(shared)
        synopsis_not_shared = synopsis_tokens.difference(shared)

        for w1 in user_not_shared:
            for w2 in synopsis_not_shared:
                w2_syns = synonyms_corpus.get(w2, [])
                if w1 in w2_syns:
                    shared.add(w1)
                    break
    
        return len(shared) / len(cleaned_user_tokens)



    target_df["similarity_score"] = target_df.normalized_synopsis.apply(similarity_score)

    return target_df.sort_values(by="similarity_score").tail(n=num_recommendations)


def jsonify_predictions(pred_df : pd.DataFrame):
    cols_to_keep = [col_name for col_name in pred_df.columns if col_name != "normalized_synopsis"]
    pred_df_json = pred_df[cols_to_keep].to_dict()   
    
    final_pred_json = []    
    
    mal_ids = pred_df_json["mal_id"].values()
    names = pred_df_json["name"].values()
    scores = pred_df_json["score"].values()
    synopses = pred_df_json["synopsis"].values()
    similarity_scores = pred_df_json["similarity_score"].values()


    for mal_id, name, score, synopsis, similarity_score in zip(mal_ids, names, scores, synopses, similarity_scores):
        pred_json = {
            "mal_id" : mal_id,
            "name" : name,
            "score" : score,
            "synopsis" : synopsis,
            "similarity_score" : similarity_score
        }

        final_pred_json.append(pred_json)

    return final_pred_json


def get_predictions(score=None, genres=None, synopsis=None, num_recommendations=3): 

    if score is None and genres is None and synopsis is None:
        return False

    pred_df = df.copy()

    SCORE_DIFF_THRESHOLD = 0.85

    if score:
        pred_df = pred_df[ abs(pred_df["score"] - score) <= SCORE_DIFF_THRESHOLD ]

    if genres:
        genre_bits_or = pred_df["genres"].str.contains(genres[0])

        for genre in genres[1:]:
            genre_bits_or |= pred_df["genres"].str.contains(genre)

        pred_df = pred_df[ genre_bits_or ]

    if synopsis:
        pred_df = find_most_similar_anime_by_keywords(synopsis, word_synonym_corpus, pred_df, num_recommendations)  

    return jsonify_predictions(pred_df)



[{'mal_id': 30205,
  'name': 'Aoharu x Kikanjuu',
  'score': 7.18,
  'synopsis': 'Hotaru Tachibana has a strong sense of justice and just cannot help confronting those who choose to perform malicious acts. Furthermore, Hotaru is actually a girl who likes to disguise herself as a boy. After hearing rumors that her best friend was tricked by the popular host of a local club, Hotaru seeks to punish the evildoer. Upon arriving at the club, however, she is challenged to a so-called "survival game" by the host Masamune Matsuoka, where the first person hit by the bullet of a toy gun will lose. After a destructive fight which results in Hotaru\'s loss, Masamune forces the young "boy" to join his survival game team named Toy Gun Gun, in order to repay the cost of the damages that "he" has caused inside the club. Although she is initially unhappy with this turn of events, Hotaru quickly begins to enjoy what survival games have to offer and is determined to pay off her debt, much to the dismay of

In [63]:
all_genres = set()

for g_set in df["genres"]:
    g_set_genres = g_set.split(",")
    for genre in g_set_genres:
        all_genres.add(genre)

with open("../data/genres.json", "w") as file:
    file.write(json.dumps(list(all_genres)))


df.to_csv("../data/final_data.csv", index=False)