In [1]:
import  pandas as pd
from sentence_transformers import SentenceTransformer
import  faiss
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('dataset/processed.csv', sep='/')
model = SentenceTransformer('model/')
index = faiss.read_index('animes.index')

In [13]:
# define function to return top_k anime dictionary based query

def fetch_anime(dataframe_idx):
    info = data.iloc[dataframe_idx]
    meta = dict()
    meta['title'] = info['title']
    return meta
    
def search(query, top_k, index, model):
    query_vector = model.encode([query])
    top_k = index.search(query_vector, top_k)
    result_id = top_k[1].tolist()[0]
    result_id = list(np.unique(result_id))
    results =  [fetch_anime(idx) for idx in result_id]
    return results

In [14]:
query="Anime about food and cooking"
results=search(query, top_k=10, index=index, model=model)
print("\n")
for result in results:
    print('\t',result)



	 {'title': 'Yakitate!! Japan'}
	 {'title': 'Bakuretsu Tenshi'}
	 {'title': 'Ben-To'}
	 {'title': 'Koufuku Graffiti'}
	 {'title': 'Shokugeki no Souma'}
	 {'title': 'Pan de Peace!'}
	 {'title': 'Amaama to Inazuma'}
	 {'title': 'Isekai Shokudou'}
	 {'title': 'Ramen Daisuki Koizumi-san'}
	 {'title': 'Emiya-san Chi no Kyou no Gohan'}


In [5]:
review_data = pd.read_csv('dataset/raw/reviews.csv')
anime_data = pd.read_csv('dataset/raw/animes.csv')

In [6]:
selected_reviews = review_data[['anime_uid', 'text']].groupby(by=['anime_uid']).count().sort_values('text', ascending=False)
selected_reviews = selected_reviews[selected_reviews.text > 15].reset_index()

# Adding all reviews/sum up all reviews from the same anime

reviews_joined = review_data[['anime_uid', 'text']][review_data['anime_uid'].isin(selected_reviews['anime_uid'])]

anime_data = anime_data.drop_duplicates(subset=['uid'])
reviews_joined.rename(columns={'anime_uid' : 'uid', 'text' : 'review'}, inplace=True)
animes_reviews = pd.merge(anime_data, reviews_joined, how='right', on=['uid']).drop_duplicates()

# we will only take columns that we think have the potential to be used as descriptions

animes_reviews = animes_reviews.drop(['members', 'popularity', 'ranked', 'img_url', 'link'],axis=1)
animes_reviews = animes_reviews.dropna()
animes_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108246 entries, 0 to 158139
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   uid       108246 non-null  int64  
 1   title     108246 non-null  object 
 2   synopsis  108246 non-null  object 
 3   genre     108246 non-null  object 
 4   aired     108246 non-null  object 
 5   episodes  108246 non-null  float64
 6   score     108246 non-null  float64
 7   review    108246 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 7.4+ MB


In [7]:
import random

def preProcessReview(row):
    res = str(row['review'])
    
    while '\n' in res or '\r' in res or "\'" in res or '  ' in res:
        res = res.replace('\n','')
        res = res.replace('\r','')
        res = res.replace("\'",'')
        res = res.replace('  ',' ')
    
    return res

def preProcessScore(row):
    res = str(row['score'])
    
    return 'Score ' + res

def preProcessEpisodes(row):
    res = str(row['episodes'])
    res = res.replace(".0",'')
    
    return 'with ' + res + ' Episodes'

def preProcessGenre(row):
    res = row['genre']
    res = res.replace("'",'')
    res = res.replace('[','')
    res = res.replace(']','')
    
    return 'Genre ' + res

def preProcessSynopsis(row):
    res = str(row['synopsis'])
    while '\n' in res or '\r' in res or "\'" in res or '  ' in res:
        res = res.replace('\n','')
        res = res.replace('\r','')
        res = res.replace("\'",'')
        res = res.replace('  ',' ')
        
    return 'Synopsis, ' + res

animes_reviews['synopsis'] = animes_reviews.apply(preProcessSynopsis, axis=1)
animes_reviews['genre'] = animes_reviews.apply(preProcessGenre, axis=1)
animes_reviews['episodes'] = animes_reviews.apply(preProcessEpisodes, axis=1)
animes_reviews['score'] = animes_reviews.apply(preProcessScore, axis=1)
animes_reviews['review'] = animes_reviews.apply(preProcessReview, axis=1)

In [8]:
def transformText(row):
    name = str(row['title']) + ' '
    episodes = str(row['episodes']) + ' episode ' 
    tags = str(row['genre']) + ' '
    year = str(row['aired']) + ' '
    desc = str(row['synopsis']) + ' '
    score = str(row['score']) + ' '
    review = str(row['review']) + ' '
    
    num_features = [2,3]
    features = [episodes, tags, year, desc, score, review]
    
    selected = random.sample(features, random.sample(num_features, 1)[0])
    
    res = str()
    for text in selected:
        res = res + text
    
    return res

test = pd.DataFrame(columns=['title','text'])
copi = animes_reviews.copy()
test['text'] = copi.apply(transformText, axis=1)
test['title'] = animes_reviews['title']

In [9]:
acc = 0
sample = 1000
import time
t = time.time()
for ind in test.sample(sample).index:
    results=search(test['text'].loc[ind], top_k=3, index=index, model=model)
    
    for result in results:
        if test['title'].loc[ind] == result['title']:
            acc+=1
            break

print('>>>> Results in Total Time: {}'.format(time.time()-t))

>>>> Results in Total Time: 536.6562509536743


In [10]:
print(acc/sample * 100)

57.4
