In [2]:
# Importaciones:
import pandas as pd
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Prueba modelo BERT

Pruebas con modelo BERT 'sentence-transformers/msmarco-MiniLM-L-12-v3'.

Este modelo crea embeddings de los textos y luego se comparan los embeddings para encontrar la similitud. 

## Data cleaning:

Funciones para pre-procesado ya creadas:

In [3]:
def basic_preprocess(sentence):
    """Normalise raw text: lowercase it and drop digits and punctuation.

    Args:
        sentence: Text to clean. Any non-string value (for example ``None``
            or the float ``NaN`` that pandas uses for missing cells) is
            treated as empty text.

    Returns:
        str: The lowercased text with every digit character and every
        character of ``string.punctuation`` removed, without leading or
        trailing whitespace. Inner whitespace is left untouched.
    """
    # Missing values have no str methods; map them to empty text instead of
    # raising AttributeError in the middle of a DataFrame/Series ``apply``.
    if not isinstance(sentence, str):
        return ''

    punctuation_chars = set(string.punctuation)

    # Single pass over the text instead of one ``replace`` per punctuation mark.
    cleaned = ''.join(
        char
        for char in sentence.lower()
        if not char.isdigit() and char not in punctuation_chars
    )

    # Strip last: removing digits/punctuation at the edges can expose new
    # leading or trailing whitespace (e.g. "chapter 12" -> "chapter ").
    return cleaned.strip()

In [3]:
def preprocessing_techniques(sentence):
    """Tokenize text, drop English stopwords and lemmatize what remains.

    Args:
        sentence: Text to process; this notebook passes the output of
            ``basic_preprocess``.

    Returns:
        str: The surviving tokens, lemmatized first as verbs and then as
        nouns, joined by single spaces.
    """
    tokens = word_tokenize(sentence)

    # Stopword removal (not recommended for sentiment analysis).
    stop_words = set(stopwords.words('english'))
    content_tokens = [token for token in tokens if token not in stop_words]

    # One lemmatizer for the whole sentence instead of a new instance per token.
    lemmatizer = WordNetLemmatizer()

    # Pass 1: lemmatize every token as a verb.
    verb_lemmas = [lemmatizer.lemmatize(token, pos="v") for token in content_tokens]

    # Pass 2: lemmatize the result as a noun.
    noun_lemmas = [lemmatizer.lemmatize(token, pos="n") for token in verb_lemmas]

    return ' '.join(noun_lemmas)

In [4]:
def preprocessing_techniques_2(sentence):
    """Tokenize and lemmatize text while keeping stopwords.

    Args:
        sentence: Text to process; this notebook passes the output of
            ``basic_preprocess``.

    Returns:
        str: Every token, lemmatized first as a verb and then as a noun,
        joined by single spaces.
    """
    tokens = word_tokenize(sentence)

    # One lemmatizer for the whole sentence instead of a new instance per token.
    lemmatizer = WordNetLemmatizer()

    # Pass 1: lemmatize every token as a verb.
    verb_lemmas = [lemmatizer.lemmatize(token, pos="v") for token in tokens]

    # Pass 2: lemmatize the result as a noun.
    noun_lemmas = [lemmatizer.lemmatize(token, pos="n") for token in verb_lemmas]

    return ' '.join(noun_lemmas)

### Dataset books:

In [5]:
# Load the books dataset; the path is relative to the working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV was written with its index -- consider index_col=0; confirm first.
book_df = pd.read_csv('../raw_data/books_with_blurbs.csv')

In [6]:
# Clean the book blurbs: the basic cleaning feeds both NLTK-based variants
book_df['base_cleaned_blur'] = book_df['Blurb'].apply(basic_preprocess)
for column_name, cleaner in (
    ('full_preprocess_blur', preprocessing_techniques),
    ('preprocess_with_stopw', preprocessing_techniques_2),
):
    book_df[column_name] = book_df['base_cleaned_blur'].apply(cleaner)

In [7]:
# Preview the first three rows to check the newly added cleaned columns.
book_df.head(3)

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb,base_cleaned_blur,full_preprocess_blur,preprocess_with_stopw
0,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"Here, for the first time in paperback, is an o...",here for the first time in paperback is an out...,first time paperback outstanding military hist...,here for the first time in paperback be an out...
1,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea...",the fascinating true story of the worlds deadl...,fascinate true story world deadliest disease g...,the fascinate true story of the world deadlies...
2,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,Winnie and Helen have kept each others worst s...,winnie and helen have kept each others worst s...,winnie helen keep others worst secret fifty ye...,winnie and helen have keep each others worst s...


In [8]:
# Save the cleaned books table, without the DataFrame index, to a path
# relative to the working directory.
book_df.to_csv('books_cleaned.csv', index=False)

### Dataset songs:

In [9]:
# Load the songs dataset; the path is relative to the working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV was written with its index -- consider index_col=0; confirm first.
songs_df = pd.read_csv('../raw_data/spotify_millsongdata.csv')

In [10]:
# Clean the song lyrics: the basic cleaning feeds both NLTK-based variants
songs_df['base_cleaned_text'] = songs_df['text'].apply(basic_preprocess)
for column_name, cleaner in (
    ('full_preprocess_text', preprocessing_techniques),
    ('preprocess_with_stopw', preprocessing_techniques_2),
):
    songs_df[column_name] = songs_df['base_cleaned_text'].apply(cleaner)

In [11]:
# Preview the first three rows to check the newly added cleaned columns.
songs_df.head(3)

Unnamed: 0,artist,song,link,text,base_cleaned_text,full_preprocess_text,preprocess_with_stopw
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA...",look at her face its a wonderful face \r\nand...,look face wonderful face mean something specia...,look at her face it a wonderful face and it me...
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen...",take it easy with me please \r\ntouch me gent...,take easy please touch gently like summer even...,take it easy with me please touch me gently li...
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...,ill never know why i had to go \r\nwhy i had ...,ill never know go put lousy rotten show boy to...,ill never know why i have to go why i have to ...


In [12]:
# Save the cleaned songs table, without the DataFrame index, to a path
# relative to the working directory.
songs_df.to_csv('songs_cleaned.csv', index=False)

## Traer el modelo:

Para poder usar el modelo, primero se debe instalar:

- "pip install sentence-transformers"
- "pip install torch"

Luego traemos el modelo desde esta librería.
Se debe realizar el pre-procesado requerido por el modelo:
- Tokenizar (model.tokenizer)
- Truncar los textos a un largo máximo de 512 tokens
- Incluir tokens especiales requeridos por el modelo

Luego del pre-procesado pasamos este texto tokenizado y truncado (si fuera necesario) al modelo -> Embeddings.

La similitud se calcula con la similitud del coseno entre los embeddings, un método que viene incluido en la librería.

In [13]:
# Importaciones
from sentence_transformers import SentenceTransformer, util

### Traer el modelo:

In [14]:
# Identifier of the pretrained sentence-embedding model used in this notebook.
model_name = 'sentence-transformers/msmarco-MiniLM-L-12-v3'
# Instantiate the model from that identifier.
# NOTE(review): presumably this fetches the weights on first use -- confirm.
model = SentenceTransformer(model_name)

### Pre-procesado 

In [4]:
# Disabled experiment: manual tokenize / truncate / add-special-tokens pipeline.
# NOTE(review): SentenceTransformer.encode is documented to take raw sentences
# and to run the tokenizer itself, so passing pre-tokenized dicts to it
# probably does not work as intended -- confirm before re-enabling this cell.

# # Process the 'base_cleaned_blur' column
# tokens_1 = [model.tokenize(blurb) for blurb in book_df['base_cleaned_blur']]

# # Truncate the tokens if necessary
# for i in range(len(tokens_1)):
#     if len(tokens_1[i]['input_ids']) > model.max_seq_length - 2:
#         tokens_1[i]['input_ids'] = tokens_1[i]['input_ids'][:model.max_seq_length - 2]
#         tokens_1[i]['token_type_ids'] = tokens_1[i]['token_type_ids'][:model.max_seq_length - 2]
#         tokens_1[i]['attention_mask'] = tokens_1[i]['attention_mask'][:model.max_seq_length - 2]

# # Add the special tokens [CLS] and [SEP] to each tensor
# tokens_special_1 = [
#     {
#         'input_ids': [101] + tokens['input_ids'].tolist() + [102],
#         'token_type_ids': [0] + tokens['token_type_ids'].tolist() + [0],
#         'attention_mask': [1] + tokens['attention_mask'].tolist() + [1],
#     }
#     for tokens in tokens_1
# ]

# # Encode the embeddings
# embedding_books = model.encode(tokens_special_1, convert_to_tensor=True)

# # Add the embeddings column to the dataframe
# book_df['embeddings_basic_preproc'] = embedding_books.tolist()