In [1]:
import pandas as pd
import numpy as np
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Cleaning Dataset

## Definición de funciones

### Basic Preprocess

El preprocesado incluye:

- Strip
- Lowercase
- Numbers
- Punctuation and Symbols

In [8]:
def basic_preprocess(sentence):
    """Normalise raw text before tokenisation.

    Steps, in order: trim surrounding whitespace, lowercase, drop every
    character for which ``str.isdigit`` is true, then delete every character
    listed in ``string.punctuation``.

    Args:
        sentence: Text to clean (a ``str``).

    Returns:
        The cleaned string. Inner whitespace is left untouched, so removing
        digits or punctuation can leave doubled or trailing spaces.
    """
    lowered = sentence.strip().lower()
    without_digits = ''.join(filter(lambda ch: not ch.isdigit(), lowered))
    # A single translation pass deletes all punctuation characters at once.
    return without_digits.translate(str.maketrans('', '', string.punctuation))

### Preprocesado con todas las técnicas

La función incluye:
- Tokenizar
- Stopwords
- Lemmatizing

In [9]:
def preprocessing_techniques(sentence):
    """Tokenize a sentence, drop English stopwords and lemmatize the rest.

    Every remaining token is lemmatized first as a verb and then as a noun;
    the lemmas are joined back together with single spaces.

    Args:
        sentence: Text to process.

    Returns:
        A space-separated string of lemmatized, non-stopword tokens.
    """
    tokens = word_tokenize(sentence)

    # Stopword removal – not recommended for sentiment analysis.
    # The stopword set is built once and cached on the function object, so
    # row-wise use through ``Series.apply`` does not reload it on every call.
    stop_words = getattr(preprocessing_techniques, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        preprocessing_techniques._stop_words = stop_words

    # One lemmatizer instance is reused for all tokens instead of creating a
    # new one per token and per pass.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        # verb pass first, then noun pass on its result
        lemmatizer.lemmatize(lemmatizer.lemmatize(token, pos="v"), pos="n")
        for token in tokens
        if token not in stop_words
    ]

    return ' '.join(lemmas)

### Preprocess dejando stopwords

Realizamos el preprocesado dejando las stopwords.

La función sólo incluye: 

- Tokenizar
- Lemmatizing

In [10]:
def preprocessing_techniques_2(sentence):
    """Tokenize and lemmatize a sentence while keeping its stopwords.

    Every token is lemmatized first as a verb and then as a noun; the lemmas
    are joined back together with single spaces.

    Args:
        sentence: Text to process.

    Returns:
        A space-separated string of lemmatized tokens.
    """
    tokens = word_tokenize(sentence)

    # One lemmatizer instance is reused for all tokens instead of creating a
    # new one per token and per pass.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        # verb pass first, then noun pass on its result
        lemmatizer.lemmatize(lemmatizer.lemmatize(token, pos="v"), pos="n")
        for token in tokens
    ]

    return ' '.join(lemmas)

## Cleaning datasets

In [11]:
# Load the raw books dataset (path is relative to the notebook's folder).
data_book = pd.read_csv('../raw_data/books_with_blurbs.csv')

In [12]:
# Preview the first rows of the books dataset.
data_book.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb
0,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"Here, for the first time in paperback, is an o..."
1,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea..."
2,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,Winnie and Helen have kept each others worst s...
3,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,Historians and inquisitive laymen alike love t...
4,1881320189,Goodbye to the Buttermilk Sky,Julia Oliver,1994,River City Pub,This highly praised first novel by fiction wri...


In [6]:
# Count how many books each author has in the dataset.
data_book['Author'].value_counts()

Stephen King           257
Agatha Christie        203
William Shakespeare    162
Nora Roberts           149
Terry Pratchett        124
                      ... 
CIJI WARE                1
Anne Truitt              1
John Martin Taylor       1
Edgar-Allan Poe          1
Jane Johnson             1
Name: Author, Length: 24041, dtype: int64

In [13]:
# Inspect column dtypes and non-null counts.
data_book.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57510 entries, 0 to 57509
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ISBN       57510 non-null  object
 1   Title      57510 non-null  object
 2   Author     57510 non-null  object
 3   Year       57510 non-null  int64 
 4   Publisher  57510 non-null  object
 5   Blurb      57510 non-null  object
dtypes: int64(1), object(5)
memory usage: 2.6+ MB


### Cleaning Dataset Books

In [14]:
# Clean the book blurbs: basic cleaning first, then derive two variants from
# the cleaned text (stopwords removed + lemmatized, and lemmatized only).
data_book['base_cleaned_blur'] = data_book['Blurb'].apply(basic_preprocess)
data_book['full_preprocess_blur'] = data_book['base_cleaned_blur'].apply(preprocessing_techniques)
data_book['preprocess_with_stopw'] = data_book['base_cleaned_blur'].apply(preprocessing_techniques_2)

In [15]:
# Preview the books dataset with the three new cleaned-text columns.
data_book.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb,base_cleaned_blur,full_preprocess_blur,preprocess_with_stopw
0,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"Here, for the first time in paperback, is an o...",here for the first time in paperback is an out...,first time paperback outstanding military hist...,here for the first time in paperback be an out...
1,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea...",the fascinating true story of the worlds deadl...,fascinate true story world deadliest disease g...,the fascinate true story of the world deadlies...
2,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,Winnie and Helen have kept each others worst s...,winnie and helen have kept each others worst s...,winnie helen keep others worst secret fifty ye...,winnie and helen have keep each others worst s...
3,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,Historians and inquisitive laymen alike love t...,historians and inquisitive laymen alike love t...,historian inquisitive layman alike love ponder...,historian and inquisitive layman alike love to...
4,1881320189,Goodbye to the Buttermilk Sky,Julia Oliver,1994,River City Pub,This highly praised first novel by fiction wri...,this highly praised first novel by fiction wri...,highly praise first novel fiction writer julia...,this highly praise first novel by fiction writ...


### Cleaning Dataset Songs

In [16]:
# Load the raw songs dataset (path is relative to the notebook's folder).
data_songs = pd.read_csv('../raw_data/spotify_millsongdata.csv')

In [17]:
# Preview the first rows of the songs dataset.
data_songs.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [18]:
# Clean the song lyrics: basic cleaning first, then derive two variants from
# the cleaned text (stopwords removed + lemmatized, and lemmatized only).
data_songs['base_cleaned_text'] = data_songs['text'].apply(basic_preprocess)
data_songs['full_preprocess_text'] = data_songs['base_cleaned_text'].apply(preprocessing_techniques)
data_songs['preprocess_with_stopw'] = data_songs['base_cleaned_text'].apply(preprocessing_techniques_2)

In [19]:
# Preview the songs dataset with the three new cleaned-text columns.
data_songs.head()

Unnamed: 0,artist,song,link,text,base_cleaned_text,full_preprocess_text,preprocess_with_stopw
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA...",look at her face its a wonderful face \r\nand...,look face wonderful face mean something specia...,look at her face it a wonderful face and it me...
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen...",take it easy with me please \r\ntouch me gent...,take easy please touch gently like summer even...,take it easy with me please touch me gently li...
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...,ill never know why i had to go \r\nwhy i had ...,ill never know go put lousy rotten show boy to...,ill never know why i have to go why i have to ...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,making somebody happy is a question of give an...,make somebody happy question give take learn s...,make somebody happy be a question of give and ...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,making somebody happy is a question of give an...,make somebody happy question give take learn s...,make somebody happy be a question of give and ...


# Modelo

## Modelo General de BERT con full preprocess

### Preparación de los datasets

#### Dataset con full preprocess

**Creamos nuevo dataset con columnas seleccionadas**:
- Libros: ISBN, Title, full_preprocess_blur
- Música: artist, song, full_preprocess_text   

In [13]:
#X_book = data_book[['ISBN','Title', 'full_preprocess_blur']]

In [14]:
#X_song = data_songs[['artist', 'song', 'full_preprocess_text']]

#### Training/Validation Split

In [15]:
#from sklearn.model_selection import train_test_split

In [16]:
#X_book_train, X_book_test = train_test_split(X_book, test_size=0.2, random_state=20)

### Modelo General de BERT

Instalamos las librerías necesarias para el modelo de BERT

In [21]:
#!pip install torch
#!pip install transformers

Collecting torch
  Downloading torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting filelock (from torch)
  Downloading filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting sympy (from torch)
  Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting 

[0mCollecting transformers
  Downloading transformers-4.36.1-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m310.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting filelock (from transformers)
  Using cached filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.19.3->transformers)
  Downloading fsspec-2023.12.2-py3-none-any.whl.metadata (6.8 kB)
Downloading transformers-4.36.1-py3-none-any.whl (8.3 M

In [28]:
#!pip install torch

[0mCollecting torch
  Using cached torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting sympy (from torch)
  Using cached sympy-1.12-py3-none-any.whl (5.7 MB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-

In [18]:
'''import torch
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from torch.utils.data import DataLoader, Dataset, TensorDataset
from tqdm import tqdm'''

In [51]:
#definimos el modelo y tokenizador
#text = list(X_book_train['full_preprocess_blur'])
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#tokenizer_text = [tokenizer(b, return_tensors='pt', padding=True, truncation=True, max_length=250) for b in text]

# Añadir tokens especiales [CLS] y [SEP]
#input_ids = [t['input_ids'][0] for t in tokenizer_text]

In [52]:
#Cargamos modelo de Bert
#model = BertModel.from_pretrained('bert-base-uncased')

In [59]:
#Obtenemos representaciones de embedding
#embeddings = []

#for texts in text:
#    tokens = tokenizer(texts, return_tensors='pt')
#    with torch.no_grad():
#        outputs = model(**tokens)
#    last_hidden_state = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
#    embeddings.append(last_hidden_state)

Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (574) must match the size of tensor b (512) at non-singleton dimension 1

In [61]:
# Cargar modelo y tokenizador BERT
#model_name = 'bert-base-uncased'
#tokenizer = BertTokenizer.from_pretrained(model_name)
#model = BertModel.from_pretrained(model_name)

# Tokenizar y obtener embeddings para cada blurb
#def get_bert_embeddings(text):
#    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
#    with torch.no_grad():
#        outputs = model(**tokens)
#    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Aplicar tokenización y obtención de embeddings a todos los blurbs
#X_book_train['bert_embeddings'] = X_book_train['full_preprocess_blur'].apply(get_bert_embeddings)

# Reducción de dimensionalidad con PCA
#pca = PCA(n_components=50)
#reduced_embeddings = pca.fit_transform(X_book_train['bert_embeddings'].tolist())

# Clustering con KMeans
#num_clusters = 30  # Ajusta este valor según tus necesidades
#kmeans = KMeans(n_clusters=num_clusters, random_state=42)
#X_book_train['cluster'] = kmeans.fit_predict(reduced_embeddings)

# Resultados
#print(X_book_train[['Title', 'cluster']])

KeyboardInterrupt: 

## Word2Vec y Kmeans

In [20]:
# Re-check the books dataset before building the Word2Vec features.
data_book.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb,base_cleaned_blur,full_preprocess_blur,preprocess_with_stopw
0,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"Here, for the first time in paperback, is an o...",here for the first time in paperback is an out...,first time paperback outstanding military hist...,here for the first time in paperback be an out...
1,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea...",the fascinating true story of the worlds deadl...,fascinate true story world deadliest disease g...,the fascinate true story of the world deadlies...
2,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,Winnie and Helen have kept each others worst s...,winnie and helen have kept each others worst s...,winnie helen keep others worst secret fifty ye...,winnie and helen have keep each others worst s...
3,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,Historians and inquisitive laymen alike love t...,historians and inquisitive laymen alike love t...,historian inquisitive layman alike love ponder...,historian and inquisitive layman alike love to...
4,1881320189,Goodbye to the Buttermilk Sky,Julia Oliver,1994,River City Pub,This highly praised first novel by fiction wri...,this highly praised first novel by fiction wri...,highly praise first novel fiction writer julia...,this highly praise first novel by fiction writ...


In [21]:
def preprocessing_token(sentence):
    """Split a sentence into a list of tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(sentence)

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the books as a test set; random_state is fixed so the split is repeatable.
X_book_train, X_book_test = train_test_split(data_book, test_size=0.2, random_state=20)

In [23]:
# Tokenize the base-cleaned blurbs. ``assign`` returns new frames instead of
# writing a column into the frames produced by train_test_split, which avoids
# pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
X_book_train = X_book_train.assign(
    tokens=X_book_train['base_cleaned_blur'].apply(preprocessing_token)
)
X_book_test = X_book_test.assign(
    tokens=X_book_test['base_cleaned_blur'].apply(preprocessing_token)
)

In [24]:
# Preview the training split with its new `tokens` column.
X_book_train.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb,base_cleaned_blur,full_preprocess_blur,preprocess_with_stopw,tokens
55380,082176277X,"Perfect Partners (Zebra Bouquet Romances, No 4)",Karen Drogin,1999,Kensington Publishing Corporation,"For the first time in her career, attorney Che...",for the first time in her career attorney chel...,first time career attorney chelsie russell wan...,for the first time in her career attorney chel...,"[for, the, first, time, in, her, career, attor..."
19021,0345365933,Texas Sunrise,FERN MICHAELS,1994,Fawcett,"""As sizzling and sexy as its predecessors, ""Te...",as sizzling and sexy as its predecessors texas...,sizzle sexy predecessor texas sunrise share fi...,a sizzle and sexy a it predecessor texas sunri...,"[as, sizzling, and, sexy, as, its, predecessor..."
36493,0671456547,Truman,David McCullough,1992,Simon &amp; Schuster,An alternate cover edition of this ISBN can be...,an alternate cover edition of this isbn can be...,alternate cover edition isbn find pulitzer pri...,an alternate cover edition of this isbn can be...,"[an, alternate, cover, edition, of, this, isbn..."
12185,1588370011,Rooster,Beth Nixon Weaver,2001,Winslow Press,"Rooster has always called Kady his madrina, hi...",rooster has always called kady his madrina his...,rooster always call kady madrina godmother bea...,rooster have always call kady his madrina his ...,"[rooster, has, always, called, kady, his, madr..."
13546,274272494X,"Nord perdu ;: Suivi de, Douze France (Un endro...",Nancy Huston,1999,LemÃ©ac,"Nancy Huston, Canadienne anglophone, vit à Par...",nancy huston canadienne anglophone vit à paris...,nancy huston canadienne anglophone vit à paris...,nancy huston canadienne anglophone vit à paris...,"[nancy, huston, canadienne, anglophone, vit, à..."


In [25]:
# Convert the token columns to plain Python lists (one list of tokens per blurb).
X_book_train_list = X_book_train['tokens'].tolist()
X_book_test_list = X_book_test['tokens'].tolist()

In [32]:
# Keep only the first 100 tokenized blurbs as a small trial subset ("prueba" = trial).
X_train_prueba = X_book_train_list[0:100]

In [38]:
# Show the tokens of the first blurb in the trial subset.
X_train_prueba[0]

['for',
 'the',
 'first',
 'time',
 'in',
 'her',
 'career',
 'attorney',
 'chelsie',
 'russell',
 'wants',
 'to',
 'lose',
 'a',
 'case',
 'representing',
 'her',
 'parents',
 'in',
 'the',
 'custody',
 'battle',
 'over',
 'their',
 'twoyearold',
 'granddaughter',
 'alix',
 'has',
 'been',
 'a',
 'mistake',
 'and',
 'only',
 'added',
 'to',
 'her',
 'sorrow',
 'over',
 'her',
 'sister',
 'and',
 'brotherinlaws',
 'tragic',
 'death',
 'so',
 'when',
 'custody',
 'is',
 'awarded',
 'to',
 'alixs',
 'bachelor',
 'uncle',
 'griffin',
 'stuart',
 'chelsie',
 'is',
 'relieved',
 'and',
 'determined',
 'not',
 'to',
 'let',
 'her',
 'beloved',
 'niece',
 'slip',
 'out',
 'of',
 'her',
 'life',
 'and',
 'soon',
 'by',
 'uniting',
 'to',
 'bring',
 'tenderness',
 'into',
 'the',
 'little',
 'girls',
 'life',
 'chelsie',
 'and',
 'griffin',
 'form',
 'an',
 'emotional',
 'connection',
 'themselves',
 'a',
 'connection',
 'called',
 'love']

In [39]:
from gensim.models import Word2Vec

# Train 100-dimensional word vectors on the trial subset only; min_count=1
# keeps every word, including words that occur a single time.
# NOTE(review): with workers=4 the training is presumably not exactly
# repeatable between runs — confirm whether stable vectors are required.
word2vec = Word2Vec(sentences=X_train_prueba, vector_size=100, window=5, min_count=1, workers=4)

In [40]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def embed_sentence(word2vec, sentence):
    """Turn a tokenized sentence into a matrix of word vectors.

    Args:
        word2vec: Trained model whose ``wv`` attribute maps words to vectors.
        sentence: Iterable of word tokens.

    Returns:
        A NumPy array with one row per token found in ``word2vec.wv``, in
        sentence order. Tokens missing from the vocabulary are skipped, so
        the result is an empty array when no token is known.
    """
    vocabulary = word2vec.wv
    known_vectors = [vocabulary[token] for token in sentence if token in vocabulary]
    return np.array(known_vectors)

def embedding(word2vec, sentences):
    """Embed every sentence of a collection.

    Args:
        word2vec: Trained model passed through to ``embed_sentence``.
        sentences: Iterable of tokenized sentences.

    Returns:
        A list holding the ``embed_sentence`` result for each sentence, in
        the same order as the input.
    """
    return [embed_sentence(word2vec, tokens) for tokens in sentences]

In [42]:
# Embed the trial subset of training sentences (the test set is not embedded here).
X_book_train_embed = embedding(word2vec, X_train_prueba)

In [43]:
# Pad the embedded training sentences to a fixed length of 200 word vectors,
# appending zero vectors after the real ones (padding='post').
# NOTE(review): sentences longer than 200 tokens get cut; Keras presumably
# drops tokens from the start by default — confirm this is intended.
X_book_train_pad = pad_sequences(X_book_train_embed, dtype='float32', padding='post', maxlen=200)

In [49]:
# Inspect the padded embedding matrix of the first blurb.
X_book_train_pad[0]

array([[-0.03663874,  0.0364657 ,  0.01652502, ..., -0.02519999,
         0.00521476,  0.01505201],
       [-0.07521074,  0.08860131,  0.02068776, ..., -0.07300383,
        -0.00152524,  0.01981382],
       [-0.00637566,  0.02022692, -0.0017746 , ..., -0.01901261,
        -0.00263104,  0.01149691],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

In [50]:
# Build one feature vector per document by averaging its word vectors over
# the token axis. Averaging the whole padded matrix instead would collapse
# each document to a single scalar that is diluted by the zero padding.
results = []
for padded_doc in X_book_train_pad:
    # Rows that are entirely zero are the padding added by pad_sequences.
    real_rows = padded_doc[np.any(padded_doc != 0, axis=1)]
    if len(real_rows):
        results.append(real_rows.mean(axis=0))
    else:
        # No known word in this document: fall back to a zero vector.
        results.append(np.zeros(padded_doc.shape[1], dtype=padded_doc.dtype))

In [54]:
# Wrap the per-document features in a DataFrame (one row per document).
results_df = pd.DataFrame(results)

In [55]:
# Standardize the features before clustering
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
results_normalized = scaler.fit_transform(results_df)

In [56]:
from sklearn.cluster import KMeans

# Number of clusters to fit
num_clusters = 5

# n_init is passed explicitly (10 is the default reported by the warning this
# cell used to emit), which silences the warning and keeps the result stable
# if the library default changes.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(results_normalized)

  super()._check_params_vs_input(X, default_n_init=10)
