In [1]:
import string
from functools import lru_cache

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Cleaning Dataset

## Definición de funciones

### Basic Preprocess

El preprocesado incluye:

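- Strip: elimina los espacios en blanco al inicio y al final
- Lowercase: convierte el texto a minúsculas
- Numbers: elimina los dígitos
- Punctuation and Symbols: elimina los caracteres de `string.punctuation`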

In [2]:
def basic_preprocess(sentence):
    """Normalise a raw text string before tokenisation.

    The text is lowercased, every digit character and every ASCII
    punctuation character (``string.punctuation``) is deleted, and leading
    and trailing whitespace is stripped.

    Args:
        sentence: Text to clean. Non-string values (for example ``None`` or
            the float ``NaN`` pandas uses for missing text) are treated as
            empty text.

    Returns:
        str: The cleaned text; ``''`` for empty or non-string input.
    """
    # Missing cells in a pandas text column arrive as float NaN / None and
    # have no string methods, so map them to an empty string instead of
    # raising AttributeError in the middle of an .apply().
    if not isinstance(sentence, str):
        return ''

    sentence = sentence.lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())  # remove numbers

    # Delete all punctuation in a single pass instead of one replace() per symbol.
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))

    # Strip last: deleting digits/punctuation at the edges can expose
    # whitespace that was interior before (e.g. "42 apples" -> " apples").
    return sentence.strip()

### Preprocesado con todas las técnicas

La función incluye:
- Tokenizar
- Eliminar stopwords (en inglés)
- Lemmatizing (primero como verbos y después como sustantivos)

In [3]:
@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocessing_techniques(sentence):
    """Tokenize, drop English stopwords and lemmatize a sentence.

    Args:
        sentence (str): Text to process.

    Returns:
        str: The remaining tokens, each lemmatized as a verb and then as a
        noun, joined with single spaces.
    """
    tokens = word_tokenize(sentence)

    # Stopword removal -- not recommended for sentiment analysis.
    # The set is cached so it is not rebuilt for every row of an .apply().
    stop_words = _english_stop_words()
    kept_tokens = [token for token in tokens if token not in stop_words]

    # One lemmatizer per call instead of one per token; each token is
    # lemmatized as a verb first and the result again as a noun.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(lemmatizer.lemmatize(token, pos="v"), pos="n")
        for token in kept_tokens
    ]

    return ' '.join(lemmas)

### Preprocess dejando stopwords

Realizamos el preprocesado conservando las stopwords.

La función sólo incluye: 

- Tokenizar
- Lemmatizing

In [4]:
def preprocessing_techniques_2(sentence):
    """Tokenize and lemmatize a sentence, keeping its stopwords.

    Args:
        sentence (str): Text to process.

    Returns:
        str: Every token, lemmatized as a verb and then as a noun, joined
        with single spaces.
    """
    tokens = word_tokenize(sentence)

    # One lemmatizer per call instead of one per token; each token is
    # lemmatized as a verb first and the result again as a noun.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(lemmatizer.lemmatize(token, pos="v"), pos="n")
        for token in tokens
    ]

    return ' '.join(lemmas)

## Cleaning datasets

In [5]:
# Load the raw book data (one row per book, including its blurb) from a path
# relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks
# like a saved index -- consider index_col=0 after confirming nothing
# downstream relies on that column.
data_book = pd.read_csv('../raw_data/books_with_blurbs.csv')

In [6]:
# Preview the first rows of the raw book data before cleaning.
data_book.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb
0,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"Here, for the first time in paperback, is an o..."
1,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea..."
2,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,Winnie and Helen have kept each others worst s...
3,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,Historians and inquisitive laymen alike love t...
4,1881320189,Goodbye to the Buttermilk Sky,Julia Oliver,1994,River City Pub,This highly praised first novel by fiction wri...


### Cleaning Dataset Books

In [7]:
# Clean the book blurbs: apply the basic cleanup once, then derive both
# token-level variants (stopwords removed / stopwords kept) from that result.
base_blurbs = data_book['Blurb'].apply(basic_preprocess)
data_book['base_cleaned_blur'] = base_blurbs
data_book['full_preprocess_blur'] = base_blurbs.apply(preprocessing_techniques)
data_book['preprocess_with_stopw'] = base_blurbs.apply(preprocessing_techniques_2)

In [8]:
# Check the three derived blurb columns next to the original 'Blurb' text.
data_book.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb,base_cleaned_blur,full_preprocess_blur,preprocess_with_stopw
0,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"Here, for the first time in paperback, is an o...",here for the first time in paperback is an out...,first time paperback outstanding military hist...,here for the first time in paperback be an out...
1,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea...",the fascinating true story of the worlds deadl...,fascinate true story world deadliest disease g...,the fascinate true story of the world deadlies...
2,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,Winnie and Helen have kept each others worst s...,winnie and helen have kept each others worst s...,winnie helen keep others worst secret fifty ye...,winnie and helen have keep each others worst s...
3,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,Historians and inquisitive laymen alike love t...,historians and inquisitive laymen alike love t...,historian inquisitive layman alike love ponder...,historian and inquisitive layman alike love to...
4,1881320189,Goodbye to the Buttermilk Sky,Julia Oliver,1994,River City Pub,This highly praised first novel by fiction wri...,this highly praised first novel by fiction wri...,highly praise first novel fiction writer julia...,this highly praise first novel by fiction writ...


### Cleaning Dataset Songs

In [10]:
# Load the raw song data (artist, song, link and lyrics text) from a path
# relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks
# like a saved index -- consider index_col=0 after confirming nothing
# downstream relies on that column.
data_songs = pd.read_csv('../raw_data/spotify_millsongdata.csv')

In [11]:
# Preview the first rows of the raw song data before cleaning.
data_songs.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [12]:
# Clean the song lyrics: apply the basic cleanup once, then derive both
# token-level variants (stopwords removed / stopwords kept) from that result.
base_lyrics = data_songs['text'].apply(basic_preprocess)
data_songs['base_cleaned_text'] = base_lyrics
data_songs['full_preprocess_text'] = base_lyrics.apply(preprocessing_techniques)
data_songs['preprocess_with_stopw'] = base_lyrics.apply(preprocessing_techniques_2)

In [14]:
# Check the three derived lyrics columns next to the original 'text' column.
data_songs.head()

Unnamed: 0,artist,song,link,text,base_cleaned_text,full_preprocess_text,preprocess_with_stopw
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA...",look at her face its a wonderful face \r\nand...,look face wonderful face mean something specia...,look at her face it a wonderful face and it me...
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen...",take it easy with me please \r\ntouch me gent...,take easy please touch gently like summer even...,take it easy with me please touch me gently li...
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...,ill never know why i had to go \r\nwhy i had ...,ill never know go put lousy rotten show boy to...,ill never know why i have to go why i have to ...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,making somebody happy is a question of give an...,make somebody happy question give take learn s...,make somebody happy be a question of give and ...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,making somebody happy is a question of give an...,make somebody happy question give take learn s...,make somebody happy be a question of give and ...
