# 2.1.- Importamos nuestro Dataset

In [1]:
import itertools
import json
import os

import pandas as pd

In [2]:
# Dataset loading
def get_json_data(file):
    """Lazily yield one decoded JSON value per line of a JSON Lines file.

    Parameters
    ----------
    file : str or path-like
        Path to a text file holding one JSON document per line.

    Yields
    ------
    object
        The value returned by ``json.loads`` for each non-blank line.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # The context manager guarantees the handle is closed, including when the
    # consumer stops early (e.g. itertools.islice) and the generator is
    # finalized. JSON text is UTF-8, so do not rely on the platform default.
    with open(file, encoding="utf-8") as handle:
        for line in handle:
            if line.strip():  # tolerate blank lines / trailing newline
                yield json.loads(line)

# Load the first `n_samples` reviews from each category file
corpus_path = "./Corpus/"
file1 = corpus_path + "reviews_Home_and_Kitchen_5.json"
file2 = corpus_path + "reviews_Tools_and_Home_Improvement_5.json"
n_samples = 100_000

# One frame per source file, stacked row-wise under a fresh 0..N-1 index.
df = pd.concat(
    [pd.DataFrame(itertools.islice(get_json_data(path), n_samples)) for path in (file1, file2)],
    axis=0,
    ignore_index=True,
)

In [3]:
# Sanity check: preview the first rows of the combined frame
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,APYOBQE6M18AA,615391206,Martin Schwartz,"[0, 0]",My daughter wanted this book and the price on ...,5.0,Best Price,1382140800,"10 19, 2013"
1,A1JVQTAGHYOL7F,615391206,Michelle Dinh,"[0, 0]",I bought this zoku quick pop for my daughterr ...,5.0,zoku,1403049600,"06 18, 2014"
2,A3UPYGJKZ0XTU4,615391206,mirasreviews,"[26, 27]",There is no shortage of pop recipes available ...,4.0,"Excels at Sweet Dessert Pops, but Falls Short ...",1367712000,"05 5, 2013"
3,A2MHCTX43MIMDZ,615391206,"M. Johnson ""Tea Lover""","[14, 18]",This book is a must have if you get a Zoku (wh...,5.0,Creative Combos,1312416000,"08 4, 2011"
4,AHAI85T5C2DH3,615391206,PugLover,"[0, 0]",This cookbook is great. I have really enjoyed...,4.0,A must own if you own the Zoku maker...,1402099200,"06 7, 2014"


# 2.2.- Preparación del dataset

Lo primero, vamos a quedarnos con las columnas que nos valen para nuestro análisis de sentimiento.

In [4]:
# Columns that are not used further on; only `reviewText` and `overall` are kept.
cols_to_drop = ["reviewerID", "asin", "reviewerName", "helpful", "unixReviewTime", "reviewTime", "summary"]
# Rebind instead of mutating in place, and ignore labels that are already gone,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=cols_to_drop, errors="ignore")

Ahora vamos a definir nuestro criterio de sentimiento positivo y sentimiento negativo, tal como lo hicimos en el primer notebook 

In [5]:
def sentiment(row):
    """Map a review's ``overall`` rating to a binary sentiment label.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key ``'overall'`` (a DataFrame row, a dict,
        ...) whose value can be converted with ``int``.

    Returns
    -------
    int
        ``0`` when the truncated rating is below 4, ``1`` otherwise.
    """
    # Index with a scalar key: ``row[['overall']]`` built a one-element Series,
    # and calling ``int()`` on a Series is deprecated in recent pandas.
    rating = int(row['overall'])
    return 0 if rating < 4 else 1

In [6]:
# Label every review row; the function is passed directly, no wrapper lambda needed.
df['sentiment'] = df.apply(sentiment, axis=1)

In [7]:
# Preview the frame again to check the new `sentiment` column
df.head()

Unnamed: 0,reviewText,overall,sentiment
0,My daughter wanted this book and the price on ...,5.0,1
1,I bought this zoku quick pop for my daughterr ...,5.0,1
2,There is no shortage of pop recipes available ...,4.0,1
3,This book is a must have if you get a Zoku (wh...,5.0,1
4,This cookbook is great. I have really enjoyed...,4.0,1


# 2.3.- Separación en conjuntos de train - test

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Seeded, shuffled 75 % / 25 % split of the review text and its sentiment label,
# stratified on the label.
split_kwargs = dict(
    train_size=0.75,
    test_size=0.25,
    random_state=52,
    shuffle=True,
    stratify=df['sentiment'],
)
X_train, X_test, y_train, y_test = train_test_split(df['reviewText'], df['sentiment'], **split_kwargs)

# 2.4.- Definición del Pipeline

In [10]:
# Librerías
import nltk

# Required NLTK downloads
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tonyz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tonyz\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tonyz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Vamos a añadir todas las funciones ya vistas en el apartado 1 que van a formar nuestro **pipeline** para el **preprocesado del texto**. Como variante, se va a intentar dar uso a **generadores** para que no sea tan pesado en memoria (sobre todo en tests individuales, donde usaré pocas reviews).

In [11]:
# Tokenization
def tokenizer_word_level(reviews, tokenizer=nltk.RegexpTokenizer(r'\w+')):
    """Lazily tokenize each review.

    Parameters
    ----------
    reviews : iterable
        Review texts, consumed one at a time.
    tokenizer : object, optional
        Anything exposing ``tokenize(text)``; defaults to the regexp
        tokenizer built in the signature.

    Yields
    ------
    The result of ``tokenizer.tokenize(review)`` for each review, in order.
    """
    yield from map(tokenizer.tokenize, reviews)

# Lowercasing
def corpus_to_lower(reviews):
    """Lazily lowercase every token of every tokenized review.

    Parameters
    ----------
    reviews : iterable of iterables
        Tokenized reviews; each token must provide ``.lower()``.

    Yields
    ------
    list
        One list of lowercased tokens per review, in input order.
    """
    yield from ([token.lower() for token in tokens] for tokens in reviews)

# Stopword removal
def drop_stopwords(reviews, stopwords=nltk.corpus.stopwords.words('english')):
    """Lazily remove stopwords from every tokenized review.

    Parameters
    ----------
    reviews : iterable of iterables
        Tokenized reviews.
    stopwords : iterable of str, optional
        Tokens to discard (exact match); defaults to NLTK's English list.

    Yields
    ------
    list
        The tokens of each review that are not in ``stopwords``, order kept.
    """
    # Build the lookup once: testing membership against a list is linear in
    # its length and was repeated for every token of every review.
    stopword_set = frozenset(stopwords)
    for review in reviews:
        yield [word for word in review if word not in stopword_set]
        
# Stemming
def stemming_data(reviews, stemmer=nltk.stem.snowball.EnglishStemmer(ignore_stopwords=True)):
    """Lazily stem every token of every tokenized review.

    Parameters
    ----------
    reviews : iterable of iterables
        Tokenized reviews.
    stemmer : object, optional
        Anything exposing ``stem(word)``; defaults to the English Snowball
        stemmer built in the signature.

    Yields
    ------
    list
        ``stemmer.stem(word)`` for each word of each review, order kept.
    """
    for tokens in reviews:
        yield list(map(stemmer.stem, tokens))
        
# Lemmatization
def lemmatizing_data(reviews, lemmatizer=nltk.stem.WordNetLemmatizer()):
    """Lazily lemmatize every token of every tokenized review.

    Parameters
    ----------
    reviews : iterable of iterables
        Tokenized reviews.
    lemmatizer : object, optional
        Anything exposing ``lemmatize(word)``; defaults to a WordNet lemmatizer.

    Yields
    ------
    list
        ``lemmatizer.lemmatize(word)`` for each word of each review, order kept.
    """
    yield from ([lemmatizer.lemmatize(token) for token in tokens] for tokens in reviews)
        
# Removal of non-alphabetic tokens
def cleaning_no_alpha(reviews):
    """Lazily keep only the purely alphabetic tokens of every review.

    Parameters
    ----------
    reviews : iterable of iterables of str
        Tokenized reviews.

    Yields
    ------
    list
        The tokens of each review for which ``str.isalpha`` is true, order kept.
    """
    for tokens in reviews:
        yield list(filter(str.isalpha, tokens))

In [12]:
def pipeline_generator(reviews, to_lower=True, drop_sw=True, stemming=True, lemmatizing=False, only_alpha=True,
                      stopwords=nltk.corpus.stopwords.words('english'), lemmatizer=nltk.stem.WordNetLemmatizer(),
                      stemmer=nltk.stem.snowball.EnglishStemmer(ignore_stopwords=True)):
    """Chain the text-preprocessing generators into one lazy pipeline.

    Reviews are always tokenized first; the optional stages are then applied
    in this fixed order: lowercase, stopword removal, stemming, lemmatizing,
    non-alphabetic token removal. Stemming therefore runs before lemmatizing
    when both are enabled.

    Parameters
    ----------
    reviews : iterable
        Raw review texts.
    to_lower, drop_sw, stemming, lemmatizing, only_alpha : bool, optional
        Switches enabling the corresponding stage.
    stopwords, lemmatizer, stemmer : optional
        Forwarded to ``drop_stopwords``, ``lemmatizing_data`` and
        ``stemming_data`` respectively.

    Returns
    -------
    generator
        Nothing is processed until the returned generator is iterated.
    """
    # (enabled?, stage function, extra keyword arguments), in execution order.
    stages = (
        (to_lower, corpus_to_lower, {}),
        (drop_sw, drop_stopwords, {"stopwords": stopwords}),
        (stemming, stemming_data, {"stemmer": stemmer}),
        (lemmatizing, lemmatizing_data, {"lemmatizer": lemmatizer}),
        (only_alpha, cleaning_no_alpha, {}),
    )

    # Tokenization is mandatory; every enabled stage wraps the previous generator.
    corpus = tokenizer_word_level(reviews)
    for enabled, stage, options in stages:
        if enabled:
            corpus = stage(corpus, **options)

    return corpus

Nota: En el punto 1 usé lematización porque, a la hora de visualizar, vemos las palabras enteras y creo que es mejor para la parte de exploración. En este apartado vamos a usar stemming, que en el caso del inglés aporta más valor que la lematización.

# 2.5.- Limpieza y guardado de variables 

In [13]:
# Sentiment analysis is sensitive to stopword removal, so only a very light stopword list is used
my_stopwords = ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'the', 'or', 'and', 'is', 'are', 'but', 'a', 'at']

In [14]:
# First, check the contents and type of X_train before modifying it
print(X_train)
print(type(X_train))

64935     This is so cute. I love it. The antenna are ju...
36086     This is very good product, very easy to clean ...
16787     This is a popcorn lover's delight.  I could no...
158933    I am crazy do-it-yourself person around the ho...
114806    It does what it supposed to do. I  moved to an...
                                ...                        
69310     After purchasing a cheap supermarket store ove...
115490    Like the header says this is a must have tool ...
168503    I'm not sure there is a huge difference betwee...
116351    I've owned or used a lot of cordless drills, f...
136387    A bungee is a bungee, so check the price. If t...
Name: reviewText, Length: 150000, dtype: object
<class 'pandas.core.series.Series'>


In [15]:
# X_train / X_test are pandas Series: run the pipeline on the text only and
# rebuild each Series with its original index, name and dtype so the structure
# stays unchanged for later steps.

def _clean_reviews(reviews):
    """Return a Series like ``reviews`` with every text preprocessed.

    The whole Series is streamed through a single ``pipeline_generator`` call
    (rather than one pipeline per review plus an element-wise ``.iloc`` write
    into the Series being iterated), and each token list is re-joined with
    single spaces.
    """
    cleaned = [" ".join(tokens) for tokens in pipeline_generator(reviews, stopwords=my_stopwords)]
    return pd.Series(cleaned, index=reviews.index, name=reviews.name, dtype=reviews.dtype)

X_train = _clean_reviews(X_train)
X_test = _clean_reviews(X_test)

In [16]:
# Check the result
print(X_train)
print(type(X_train))

64935     this so cute love antenna just wire stick out ...
36086     this very good product very easi to clean good...
16787     this popcorn lover s delight could not be happ...
158933    am crazi do yourself person around hous am alw...
114806    does what suppos to do move to an apart where ...
                                ...                        
69310     after purchas cheap supermarket store oven the...
115490    like header say this must have tool if instal ...
168503    m not sure there huge differ between all wirel...
116351    ve own use lot of cordless drill from black de...
136387    bunge bunge so check price if this price suita...
Name: reviewText, Length: 150000, dtype: object
<class 'pandas.core.series.Series'>


In [17]:
# Persist the splits
# os.makedirs(..., exist_ok=True) replaces the `! mkdir Vars` shell call: it does
# not depend on the shell and does not error when the directory already exists.
os.makedirs("./Vars", exist_ok=True)
X_train.to_pickle("./Vars/X_train.pkl")
X_test.to_pickle("./Vars/X_test.pkl")
y_train.to_pickle("./Vars/y_train.pkl")
y_test.to_pickle("./Vars/y_test.pkl")