## Importar librerías

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer #para tokenizar
from nltk.stem.snowball import SnowballStemmer #para stemmatizar
import joblib

[nltk_data] Downloading package stopwords to C:\Users\CONECTIA
[nltk_data]     BA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\CONECTIA
[nltk_data]     BA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Extraer datos

In [2]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


## Explorar datos

In [3]:
df.shape

(6810, 12)

In [4]:
df.isna().sum()

isbn13               0
isbn10               0
title                0
subtitle          4429
authors             72
categories          99
thumbnail          329
description        262
published_year       6
average_rating      43
num_pages           43
ratings_count       43
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6810 non-null   int64  
 1   isbn10          6810 non-null   object 
 2   title           6810 non-null   object 
 3   subtitle        2381 non-null   object 
 4   authors         6738 non-null   object 
 5   categories      6711 non-null   object 
 6   thumbnail       6481 non-null   object 
 7   description     6548 non-null   object 
 8   published_year  6804 non-null   float64
 9   average_rating  6767 non-null   float64
 10  num_pages       6767 non-null   float64
 11  ratings_count   6767 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 638.6+ KB


## Procesar datos

In [6]:
#me quedo unicamente con las columnas que sonsidero importantes para recomendar libros
df_features = df[['title','authors','categories', 'description']]
df_features.head(5)

Unnamed: 0,title,authors,categories,description
0,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...
1,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...
2,The One Tree,Stephen R. Donaldson,American fiction,Volume Two of Stephen Donaldson's acclaimed se...
3,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b..."
4,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...


In [7]:
df_features.isna().sum()

title            0
authors         72
categories      99
description    262
dtype: int64

In [8]:
#considerando que no hay menos del 10% de datos nulos, se procede a borrarlos
df_clean = df_features.dropna()
df_clean.isna().sum()

title          0
authors        0
categories     0
description    0
dtype: int64

In [9]:
#datos totales
df_clean.shape

(6446, 4)

In [10]:
#eliminar registros repetidos
df_no_dup = df_clean.drop_duplicates()
df_no_dup.shape

(6437, 4)

In [11]:
#se resetea el indice para que no haya problemas más adelante
df_no_dup.reset_index(inplace=True, drop=True)
df_no_dup.head()

Unnamed: 0,title,authors,categories,description
0,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...
1,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...
2,The One Tree,Stephen R. Donaldson,American fiction,Volume Two of Stephen Donaldson's acclaimed se...
3,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b..."
4,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...


In [12]:
#ver todas las categorías de la columna 'categories'
df_no_dup['categories'].unique()

array(['Fiction', 'Detective and mystery stories', 'American fiction',
       'Christian life', 'Authors, English', 'Africa, East',
       'Hyland, Morn (Fictitious character)', 'Adventure stories',
       'Arthurian romances', 'Fantasy fiction', 'English drama',
       'Country life', 'English fiction', 'Clergy',
       'Aubrey, Jack (Fictitious character)',
       'Detective and mystery stories, English', 'Black Death',
       'Human cloning', 'Science fiction', 'Great Britain',
       'American essays', 'China', 'Capitalism', 'Ireland',
       'Juvenile Fiction', "Children's stories, English",
       'Male friendship', 'Literary Collections',
       'Beresford, Tommy (Fictitious character)',
       'Imaginary wars and battles', 'Dysfunctional families',
       'Poirot, Hercule (Fictitious character)', 'Christmas stories',
       'Marple, Jane (Fictitious character)', 'Belgians',
       'Battle, Superintendent (Fictitious character)',
       'Baggins, Frodo (Fictitious character)', '

In [13]:
#total de tipos de categorias
df_no_dup['categories'].nunique()

531

In [14]:
#diccionario para reemplazar algunas categorias por otras
categoria_map = { "^science fiction":"Science Fiction", ".*science fiction.*":"Science Fiction",
    "^fiction":"Fiction", "^fantasy":"Fantasy", ".*fiction.*":"Fiction",".*fantasy.*": "Fantasy",
    ".*mystery.*": "Mystery",'Mysticism': 'Spirituality', ".*christian.*":'Religion', ".*religion.*":'Religion',
    ".*biography.*":'Biography', ".*history.*": "History", ".*children.*":"Children", 'Juvenile Nonfiction': 'Children',
    ".*short stories.*": "Fiction",".*adventure.*": "Adventure", ".*drama.*": "Drama", ".*poetry.*": "Poetry",
    ".*love.*": "Romance",".*courtship.*": "Romance",".*sex.*": "Romance",".*romance.*": "Romance", 
    ".*horror.*": "Horror", ".*self-help.*": "Self-Help", ".*psychology.*": "Psychology", ".*philosophy.*": "Psychology",
    ".*spiritual life.*": 'Spirituality',".*computer.*": 'Technology', ".*science.*": "Science", "^political":'Politics', 
    "^crime":'Crime', ".*crime.*":'Crime', "^literary":'Literary Studies', ".*novels.*": 'Graphic Novel', ".*health.*": 'Health', 
    ".*medical.*": 'Health', ".*family.*": 'Family', ".*humor.*": 'Humor', 'language arts & disciplines': 'Education', 
    ".*study.*": 'Education', '.*music.*': 'Arts', '.*art.*': 'Arts', ".*architecture.*": 'Arts', ".*education.*": 'Education', 
    'nature': 'Nature', 'animals': 'Nature', 'pets': 'Nature',".*travel.*": 'Travel', "cook": 'Cooking',
    ".*technology.*": 'Technology',".*games.*": 'Games', ".*sports.*": 'Sports', ".*business.*": 'Business',
    'social science': 'Sociology', 'sociology': 'Sociology'} 

In [15]:
#crear una columna con las categorías ya reemplazadas y de paso aprovecho para que la columna authors sea más "claro"
df_cln_category = df_no_dup.copy()
df_cln_category['categorias_limpias'] = df_cln_category['categories'].str.lower().replace(categoria_map, regex=True)
df_cln_category['authors'] = df_cln_category['authors'].str.replace(';', '; ')
df_cln_category.head()

Unnamed: 0,title,authors,categories,description,categorias_limpias
0,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,Fiction
1,Spider's Web,Charles Osborne; Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,Mystery
2,The One Tree,Stephen R. Donaldson,American fiction,Volume Two of Stephen Donaldson's acclaimed se...,Fiction
3,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",Fiction
4,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,Religion


In [16]:
#cantidad de caegorias despues del remplazo
df_cln_category['categorias_limpias'].nunique()

410

In [None]:
#crear dos nevas columnas, una que muestre las columnas que se van a usar para crear el modelo (text) 
# y otra donde nos enseñe el titulo del libro y sus autores ('title_author')
df_join = df_cln_category.copy()

df_join['text'] = df_cln_category[['title','authors','categorias_limpias', 'description']].apply(lambda x: ' '.join(x.values.astype(str)), axis=1)
df_join['title_author'] = df_cln_category[['title','authors']].apply(lambda x: ' de '.join(x.values.astype(str)), axis=1)
df_join.head()

Unnamed: 0,title,authors,categories,description,categorias_limpias,text,title_author
0,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,Fiction,Gilead Marilynne Robinson Fiction A NOVEL THAT...,Gilead de Marilynne Robinson
1,Spider's Web,Charles Osborne; Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,Mystery,Spider's Web Charles Osborne; Agatha Christie ...,Spider's Web de Charles Osborne; Agatha Christie
2,The One Tree,Stephen R. Donaldson,American fiction,Volume Two of Stephen Donaldson's acclaimed se...,Fiction,The One Tree Stephen R. Donaldson Fiction Volu...,The One Tree de Stephen R. Donaldson
3,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",Fiction,Rage of angels Sidney Sheldon Fiction A memora...,Rage of angels de Sidney Sheldon
4,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,Religion,The Four Loves Clive Staples Lewis Religion Le...,The Four Loves de Clive Staples Lewis


In [18]:
#eliminar libros repetidos
df_no_dup_title_author = df_join.drop_duplicates(subset='title_author')
#resetear index
df_no_dup_title_author.reset_index(inplace=True, drop=True)
#tamañ del dataframe
df_no_dup_title_author.shape

(6279, 7)

In [19]:
#columna que se va a usar para crear el modelo de recomendación
input = df_no_dup_title_author['text']
input

0       Gilead Marilynne Robinson Fiction A NOVEL THAT...
1       Spider's Web Charles Osborne; Agatha Christie ...
2       The One Tree Stephen R. Donaldson Fiction Volu...
3       Rage of angels Sidney Sheldon Fiction A memora...
4       The Four Loves Clive Staples Lewis Religion Le...
                              ...                        
6274    Journey to the East Hermann Hesse Adventure Th...
6275    The Monk Who Sold His Ferrari: A Fable About F...
6276    I Am that Sri Nisargadatta Maharaj; Sudhakar S...
6277    The Berlin Phenomenology Georg Wilhelm Friedri...
6278    'I'm Telling You Stories' Helena Grice; Tim Wo...
Name: text, Length: 6279, dtype: object

In [20]:
#función para tokenizar y limpiar datos
def clean_token(corpus, patron):
    stop_words = set(stopwords.words('english'))
    #se tokeniza teniendo en cuenta el patron pasado por parametro
    tokenizer = RegexpTokenizer(patron) 
    
    #lista para ir guardando los textos tokenizados y limpios
    cleaned_tokens= []

    for text in corpus:
        #hacer que los textos esten en minúscula, tokenizarlos y filtrarlos por las stop_words
        text = text.lower()
        tokens = tokenizer.tokenize(text)
        filtered_tokens = [token for token in tokens if token not in stop_words]
        cleaned_tokens.append(filtered_tokens)
        
    #guardarlos en una Serie de pandas
    clean_token = pd.Series(cleaned_tokens)
    return clean_token

In [21]:
#patrón para limpiar los textos
patron = '[^\\s^.^;^,^:,^(, ^)]+'
#usar funcion para limpiar y tokenizar los textos
X_cleaned = clean_token(input, patron)
X_cleaned

0       [gilead, marilynne, robinson, fiction, novel, ...
1       [spider's, web, charles, osborne, agatha, chri...
2       [one, tree, stephen, r, donaldson, fiction, vo...
3       [rage, angels, sidney, sheldon, fiction, memor...
4       [four, loves, clive, staples, lewis, religion,...
                              ...                        
6274    [journey, east, hermann, hesse, adventure, boo...
6275    [monk, sold, ferrari, fable, fulfilling, dream...
6276    [sri, nisargadatta, maharaj, sudhakar, dikshit...
6277    [berlin, phenomenology, georg, wilhelm, friedr...
6278    ['i'm, telling, stories', helena, grice, tim, ...
Length: 6279, dtype: object

In [22]:
#función para stemming
def text_stemmered(corpus_tokens):
    #SnowballStemmer para stemmatizar los textos
    stemmer = SnowballStemmer('english')
    #lista para ir guardando los textos ya stemmatizados
    list_stemmed_texts = []
    
    #stemmatizar los textos
    for texto in corpus_tokens:
        stems = [stemmer.stem(palabra) for palabra in texto]
        texto_stems = ' '.join(stems)
        list_stemmed_texts.append(texto_stems)
        
    #guardar los textos ya stemmatizados en una Serie    
    text_stemmered = pd.Series(list_stemmed_texts)
    return text_stemmered

In [23]:
#Stematizar los textos ya tokenizados y limpios
X_stemmered = text_stemmered(X_cleaned)
X_stemmered

0       gilead marilynn robinson fiction novel reader ...
1       spider web charl osborn agatha christi mysteri...
2       one tree stephen r donaldson fiction volum two...
3       rage angel sidney sheldon fiction memor mesmer...
4       four love clive stapl lewi religion lewi work ...
                              ...                        
6274    journey east hermann hess adventur book tell t...
6275    monk sold ferrari fabl fulfil dream & reach de...
6276    sri nisargadatta maharaj sudhakar dikshit psyc...
6277    berlin phenomenolog georg wilhelm friedrich he...
6278    i'm tell stori helena grice tim wood literari ...
Length: 6279, dtype: object

## Crear modelo

In [24]:
#se usa Tf-Idf para crear el modelo de recomendación
tf_idf = TfidfVectorizer(max_features=2000,)

#se transforman los textos 
X= tf_idf.fit_transform(X_stemmered)
X

<6279x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 174631 stored elements in Compressed Sparse Row format>

In [25]:
#crear serie que contengan los titulos y l autores de los libros
books = pd.Series(df_join.index, index=df_join['title_author'])
books.index = books.index.str.strip()
books

title_author
Gilead de Marilynne Robinson                                                                                      0
Spider's Web de Charles Osborne; Agatha Christie                                                                  1
The One Tree de Stephen R. Donaldson                                                                              2
Rage of angels de Sidney Sheldon                                                                                  3
The Four Loves de Clive Staples Lewis                                                                             4
                                                                                                               ... 
Journey to the East de Hermann Hesse                                                                           6432
The Monk Who Sold His Ferrari: A Fable About Fulfilling Your Dreams & Reaching Your Destiny de Robin Sharma    6433
I Am that de Sri Nisargadatta Maharaj; Sudhakar S. Dikshit 

In [26]:
#Función para buscar recomendaciones de libros
def recomendacion(book, books=books, X=X):
    #se indica el indice del libro dado por parametro y se busca en X
    id = books[book]
    query = X[id].toarray()
    #se usa cosine similarity para ver la similitud
    similari = cosine_similarity(query, X).flatten()
    #se muetran los más similares ordenados del menor a mayor comenzando por el indice 1 para que no repita el libro
    recomend = (-similari).argsort()[1:11]
    #se buscan los indices que tuvieron mayor similitud con el libro dado
    recomends = df_join['title_author'].iloc[recomend]
    print('LIBROS RECOMENDADOS')
    print(recomends)

## Probar modelo

In [27]:
recomendacion("Spider's Web de Charles Osborne; Agatha Christie")

LIBROS RECOMENDADOS
16      Witness for the Prosecution & Selected Plays d...
1546                     The Starry Rift de James Tiptree
5739    Preludes and Nocturnes de Neil Gaiman; Sam Kie...
5741                              The Wake de Neil Gaiman
49               A Murder is Announced de Agatha Christie
55              The Secret of Chimneys de Agatha Christie
6                     An Autobiography de Agatha Christie
72              The Listerdale Mystery de Agatha Christie
2466                            Rainbow Six de Tom Clancy
1428                      Marilyn Manson de Kurt Reighley
Name: title_author, dtype: object


## Guardar archivos importantes

In [28]:
#se crea dataframe con las columnas más importantes
df_final = pd.DataFrame(df_no_dup_title_author['title_author'])
df_final['text_stemmered'] = X_stemmered.values
df_final

Unnamed: 0,title_author,text_stemmered
0,Gilead de Marilynne Robinson,gilead marilynn robinson fiction novel reader ...
1,Spider's Web de Charles Osborne; Agatha Christie,spider web charl osborn agatha christi mysteri...
2,The One Tree de Stephen R. Donaldson,one tree stephen r donaldson fiction volum two...
3,Rage of angels de Sidney Sheldon,rage angel sidney sheldon fiction memor mesmer...
4,The Four Loves de Clive Staples Lewis,four love clive stapl lewi religion lewi work ...
...,...,...
6274,Journey to the East de Hermann Hesse,journey east hermann hess adventur book tell t...
6275,The Monk Who Sold His Ferrari: A Fable About F...,monk sold ferrari fabl fulfil dream & reach de...
6276,I Am that de Sri Nisargadatta Maharaj; Sudhaka...,sri nisargadatta maharaj sudhakar dikshit psyc...
6277,The Berlin Phenomenology de Georg Wilhelm Frie...,berlin phenomenolog georg wilhelm friedrich he...


In [29]:
#guardar X entrenado y df_final
joblib.dump(X, 'X_entrenado.pkl')
df_final.to_csv('df.csv', index=False)