# Getting the data and serializing in pickle archive

In [1]:
# All imports
import numpy as np
import nltk
import pickle
import re
import string
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [33]:
arquivo = open('dados-animes.csv', encoding='utf-8')
linhas = csv.DictReader(arquivo)
animes = []
synopsis = []
for linha in linhas:
    dados = {'name': linha['nome'], 'ranked': linha['ranked'], 'synopsis': linha['synopsis']}
    animes.append(dados)
    synopsis.append(linha['synopsis'])

In [3]:
# Grava os arquivos serializados
# pickle.dump(animes, open('data-animes.pkl', 'wb'))

# Lê os arquivos e de-serializa
# data = pickle.load(open('data-animes.pkl', 'rb'))

In [4]:
# Puting animes in numpy array
_animes = []
for anime in animes:
    _animes.append(np.array(anime))

In [34]:
pickle.dump(animes, open('data-animes.pkl', 'wb'))

In [35]:
data = pickle.load(open('data-animes.pkl', 'rb'))

# Pre-processing the raw text

In [48]:
len(synopsis)

1213

In [43]:
sent_tokens = nltk.sent_tokenize(synopsis[0])  # converts to list of sentences
word_tokens = nltk.word_tokenize(synopsis[0])  # converts to list of words

In [45]:
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

In [46]:
# Preprocessing function - Transform to lowercase and remove punctuations.
# Re is a regular expression


def preprocessing(line):
    line = line.lower()
    line = re.sub(r"[{}]".format(string.punctuation), " ", line)
    return line

In [47]:
# Vectorizing the text movies
tfidf_vectorizer = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(synopsis)

  'stop_words.' % sorted(inconsistent))


In [49]:
# Creating a instance of kmeans and fit data
kmeans = KMeans(n_clusters=50).fit(tfidf)

In [54]:
len(kmeans.labels_)

1213

In [56]:
# Dicting clusters
for k, v in enumerate(kmeans.labels_):
    data[k]['cluster'] = v

In [91]:
# Predicting arguments - Before predict we need to vectorizer the text
query = input('Sinopse do anime para pesquisar: ')

index = kmeans.predict(tfidf_vectorizer.transform([query]))
print(f'Cluster: {index}')

Sinopse do anime para pesquisar: Tokyo has become a cruel and merciless city—a place where vicious creatures called “ghouls” exist alongside humans. The citizens of this once great metropolis live in constant fear of these bloodthirsty savages and their thirst for human flesh. However, the greatest threat these ghouls pose is their dangerous ability to masquerade as humans and blend in with society.
Cluster: [23]


In [92]:
# Looking for cluster
for k, v in enumerate(data):
    if v['cluster'] == index[0]:
        print(f"Anime: {v['name']} - Cluster: {v['cluster']}")

Anime: Fullmetal Alchemist: Brotherhood - Cluster: 23
Anime: Mob Psycho 100 II - Cluster: 23
Anime: Mononoke Hime - Cluster: 23
Anime: Code Geass: Hangyaku no Lelouch - Cluster: 23
Anime: Boku dake ga Inai Machi - Cluster: 23
Anime: Mob Psycho 100 - Cluster: 23
Anime: Gintama: Yorinuki Gintama-san on Theater 2D - Cluster: 23
Anime: Tenkuu no Shiro Laputa - Cluster: 23
Anime: Banana Fish - Cluster: 23
Anime: Hinamatsuri - Cluster: 23
Anime: Trigun - Cluster: 23
Anime: xxxHOLiC Rou - Cluster: 23
Anime: Detroit Metal City - Cluster: 23
Anime: Magi: The Labyrinth of Magic - Cluster: 23
Anime: Zankyou no Terror - Cluster: 23
Anime: Akira - Cluster: 23
Anime: Tokyo Magnitude 8.0 - Cluster: 23
Anime: Durarara!!x2 Shou - Cluster: 23
Anime: Phantom: Requiem for the Phantom - Cluster: 23
Anime: Kekkai Sensen & Beyond - Cluster: 23
Anime: Jigoku Shoujo Futakomori - Cluster: 23
Anime: Tokyo Ghoul - Cluster: 23
Anime: Giant Robo the Animation: Chikyuu ga Seishi Suru Hi - Cluster: 23
Anime: Slayers 