In [1]:
import pandas as pd
import gzip

In [2]:
# Datasets to load and concatenate. Each path must appear only once: listing
# the same file twice duplicates every review, and the copies can then end up
# on both sides of a later train/test split.
data = [
    'Resources/data/reviews_Digital_Music_5.json.gz',
]

# functions to read Amazon data into a pandas data frame
def parse(path):
    """Lazily yield one evaluated record per line of a gzip-compressed file.

    Args:
        path: Location of the gzip file; every line must be a Python
            expression (the notebook feeds it one dict literal per line).

    Yields:
        The object produced by evaluating each line, in file order.
    """
    # The context manager closes the file even when the consumer stops
    # iterating early or a line fails to evaluate.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY(review): eval() executes whatever code the file
            # contains. Only use it on trusted dumps; if the lines are strict
            # JSON, json.loads(l) would be the safe replacement.
            yield eval(l)

def getDF(path):
    """Load every record yielded by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in the order they are read, and that
    number becomes the row label of the resulting frame.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

# function to concatenate multiple Amazon datasets
def concatDF(data):
    """Load several review files and stack them into a single DataFrame.

    Args:
        data: Iterable of file paths understood by ``getDF``.

    Returns:
        A DataFrame with the rows of every dataset in the given order,
        relabelled 0..n-1, without the metadata columns dropped below.
        An empty DataFrame is returned when ``data`` contains no paths.

    Raises:
        KeyError: If the loaded data lacks one of the columns to drop.
    """
    frames = [getDF(dataset) for dataset in data]
    if not frames:
        return pd.DataFrame()
    # Concatenate once (not inside the loop) and rebuild the index so row
    # labels stay unique across datasets.
    df = pd.concat(frames, axis=0, ignore_index=True)
    # drop unneeded columns
    return df.drop(columns=['reviewerID', 'asin', 'reviewerName', 'helpful',
                            'summary', 'unixReviewTime', 'reviewTime'])

In [3]:
# Load and merge every dataset listed in `data` into a single frame.
df = concatDF(data)

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,reviewText,overall
0,"It's hard to believe ""Memory of Trees"" came ou...",5.0
1,"A clasically-styled and introverted album, Mem...",5.0
2,I never thought Enya would reach the sublime h...,5.0
3,This is the third review of an irish album I w...,5.0
4,"Enya, despite being a successful recording art...",4.0


In [5]:
# Keep only the first 1000 rows for the experiments below. .copy() detaches
# the subset from the full frame, so columns added in later cells are written
# to an independent frame instead of a slice (avoids SettingWithCopyWarning).
df = df.iloc[:1000].copy()
# Lower-case after slicing so only the retained rows are processed.
df['reviewText'] = df['reviewText'].str.lower()

In [6]:
# Confirm that reviewText is now lower-case.
df.head()

Unnamed: 0,reviewText,overall
0,"it's hard to believe ""memory of trees"" came ou...",5.0
1,"a clasically-styled and introverted album, mem...",5.0
2,i never thought enya would reach the sublime h...,5.0
3,this is the third review of an irish album i w...,5.0
4,"enya, despite being a successful recording art...",4.0


In [7]:
# Features are the review texts; targets are the `overall` scores cast to int.
x = df['reviewText']
y = df['overall'].astype(int)

from sklearn.model_selection import train_test_split
# Hold out 20% for testing. A fixed random_state makes the split (and every
# result derived from it) reproducible when the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with library defaults; it is fitted in the next cell.
vectorizer = TfidfVectorizer()

In [9]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer for the test texts. .toarray() turns the results into dense arrays.
vectorized_train = vectorizer.fit_transform(X_train).toarray()
vectorized_test = vectorizer.transform(X_test).toarray()

In [19]:
from sklearn.preprocessing import Normalizer
# Fit a Normalizer (library defaults) on the dense training matrix.
# NOTE(review): TfidfVectorizer's default output is presumably already
# L2-normalised per row, which would make this step a no-op — confirm.
norm = Normalizer().fit(vectorized_train)

# Apply the fitted normalizer to both splits.
norm_vectorized_train = norm.transform(vectorized_train)
norm_vectorized_test = norm.transform(vectorized_test)

In [10]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Fit on the full 1-5 rating scale instead of on y_train alone: a rating that
# happens to be missing from the training split would otherwise make
# transform(y_test) raise ValueError and shift the index -> rating mapping.
# NOTE(review): assumes `overall` only takes the integer values 1..5 — confirm.
rating_classes = [1, 2, 3, 4, 5]
label_encoder = LabelEncoder()
label_encoder.fit(rating_classes)

encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# num_classes pins the one-hot width to the 5 output units of the model even
# when the highest class is absent from a split.
y_train_categorical = to_categorical(encoded_y_train, num_classes=len(rating_classes))
y_test_categorical = to_categorical(encoded_y_test, num_classes=len(rating_classes))

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Feed-forward classifier: one 64-unit ReLU hidden layer on top of the TF-IDF
# features, followed by a 5-way softmax output.
model = Sequential([
    Dense(units=64, activation='relu', input_dim=vectorized_train.shape[1]),
    Dense(units=5, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                695616    
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 325       
Total params: 695,941
Trainable params: 695,941
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Train for 10 epochs on the normalised TF-IDF features. The test split is
# passed as validation data, so the validation metrics printed during training
# are computed on the same rows that serve as the test set.
model.fit(norm_vectorized_train, y_train_categorical, epochs=10, validation_data=(norm_vectorized_test, y_test_categorical))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b225cbf850>

In [34]:
sentence = ["i love coding in the great python"]
# The model was trained on Normalizer-transformed features
# (norm_vectorized_train), so send the sample through the same two steps:
# TF-IDF with the fitted vectorizer, then the fitted Normalizer.
test = norm.transform(vectorizer.transform(sentence).toarray())

In [35]:
# Softmax output for the sample sentence: one probability per output unit.
model.predict(test)

array([[0.01154668, 0.01791882, 0.03349778, 0.17669913, 0.7603376 ]],
      dtype=float32)

In [36]:
# Pick the most probable output unit and map its index back to the original
# rating through the fitted encoder, rather than assuming "index + 1" (which
# only holds when every rating 1..5 was present when the encoder was fitted).
prediction = model.predict(test).tolist()[0]
score = prediction.index(max(prediction))
print(label_encoder.inverse_transform([score])[0])

5


In [None]:
# Re-inspect the frame before building the token-based pipeline below.
df.head()

In [None]:
import nltk
def provideTokens(row):
    """Tokenize ``row`` with NLTK and keep only purely alphabetic tokens.

    Tokens for which ``str.isalpha()`` is false (those containing any digit,
    punctuation or other non-letter character) are discarded.
    """
    return [token for token in nltk.word_tokenize(row) if token.isalpha()]

In [None]:
# Store the alphabetic tokens of every review in a new column.
df['tokened_words'] = df['reviewText'].apply(provideTokens) 

In [None]:
# Check the new tokened_words column.
df.head()

In [None]:
from nltk.corpus import stopwords
# NLTK's English stop-word list, held in a set for membership tests.
stops = set(stopwords.words("english"))

def remove_stops(row):
    """Return the words of ``row`` that are not in ``stops``, order preserved."""
    return [word for word in row if word not in stops]

In [None]:
# Strip stop words from the token lists; the result feeds the sequence model.
df['train_me'] = df['tokened_words'].apply(remove_stops)

In [None]:
# Check the new train_me column.
df.head()

In [None]:
# Rebind x/y for the sequence model: features are now the stop-word-free
# token lists, targets are again the `overall` scores cast to int.
x = df['train_me']
y = df['overall'].astype(int)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap: passed as num_words to the Tokenizer and used as the first
# argument of the Embedding layer further down.
max_words = 6000
# Length every sequence is padded/truncated to by pad_sequences.
max_len = 120
# Shared tokenizer used by padSequences and by the sample prediction cell.
tokenizer = Tokenizer(num_words=max_words)
def padSequences(x, fit=None):
    """Encode texts as fixed-length integer sequences with the shared tokenizer.

    Args:
        x: Texts (or token lists) accepted by the Keras ``Tokenizer``.
        fit: Controls vocabulary fitting. ``None`` (default) fits on ``x``
            only if the tokenizer has no vocabulary yet, ``True`` always fits
            on ``x``, ``False`` never fits.

    Returns:
        The array produced by ``pad_sequences`` with ``maxlen=max_len``.
    """
    if fit is None:
        # Fit exactly once (on the first data passed in, i.e. the training
        # split). Fitting again on later data would add the test split to the
        # vocabulary and may renumber word_index after the training sequences
        # were already encoded.
        fit = not tokenizer.word_index
    if fit:
        tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return pad_sequences(sequences, maxlen=max_len)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% for testing. A fixed random_state makes the split (and every
# result derived from it) reproducible when the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Fit on the full 1-5 rating scale instead of on y_train alone: a rating that
# happens to be missing from the training split would otherwise make
# transform(y_test) raise ValueError and shift the index -> rating mapping.
# NOTE(review): assumes `overall` only takes the integer values 1..5 — confirm.
rating_classes = [1, 2, 3, 4, 5]
label_encoder = LabelEncoder()
label_encoder.fit(rating_classes)

encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# num_classes pins the one-hot width to the 5 output units of the model even
# when the highest class is absent from a split.
y_train_categorical = to_categorical(encoded_y_train, num_classes=len(rating_classes))
y_test_categorical = to_categorical(encoded_y_test, num_classes=len(rating_classes))

In [None]:
# Encode and pad both splits; the training split is processed first.
X_train_padded = padSequences(X_train)
X_test_padded = padSequences(X_test)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM

# Sequence classifier: 20-dimensional embeddings over a max_words vocabulary,
# a 15-unit LSTM with 50% dropout, and a 5-way softmax output.
model = Sequential([
    Embedding(max_words, 20),
    LSTM(15, dropout=.5),
    Dense(units=5, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train the LSTM for 50 epochs on (at most) the first 1000 training rows.
# No validation data is passed; verbose=2 selects Keras' one-line-per-epoch log.
model.fit(
    X_train_padded[:1000],
    y_train_categorical[:1000],
    epochs=50,
    verbose=2
)

In [None]:
# Continue training on rows 4000-4999 of the training split. The slice is
# empty whenever the training split has 4000 rows or fewer (e.g. when the
# frame is truncated to 1000 rows before an 80/20 split), so guard the call
# instead of handing Keras an empty batch.
extra_X = X_train_padded[4000:5000]
extra_y = y_train_categorical[4000:5000]
if len(extra_X) > 0:
    model.fit(
        extra_X,
        extra_y,
        epochs=50,
        verbose=2
    )
else:
    print("Skipping second training pass: training split has no rows in [4000:5000]")

In [None]:
# Count how many of the first 1000 one-hot training labels equal the vector
# for the last class, i.e. how dominant that class is in the training data.
# A dedicated name is used for the reference vector so the global `test`
# (the model input used by the prediction cells) is not overwritten.
top_class_one_hot = [0., 0., 0., 0., 1.]
counter = sum(
    1
    for row in y_train_categorical[:1000]
    if len(row) == len(top_class_one_hot)
    and all(value == expected for value, expected in zip(row, top_class_one_hot))
)
print(counter)

In [None]:
sentence = ["this is a terrible bad toy"]
# Run the sentence through the same preprocessing as the training reviews
# (lower-case -> NLTK alphabetic tokens -> stop-word removal) so the tokens
# looked up in the tokenizer match the ones it was fitted on.
sentence_tokens = [remove_stops(provideTokens(text.lower())) for text in sentence]
sequence = tokenizer.texts_to_sequences(sentence_tokens)
test = pad_sequences(sequence, maxlen=max_len)


In [None]:
# Show the class probabilities, then map the most probable output unit back
# to its original rating through the fitted encoder, rather than assuming
# "index + 1" (which only holds when every rating 1..5 was present when the
# encoder was fitted).
prediction = model.predict(test).tolist()[0]
print(prediction)
score = prediction.index(max(prediction))
print(label_encoder.inverse_transform([score])[0])