In [169]:
import pandas as pd
import numpy as np
import sqlite3
# Display settings: pandas floats are shown 20 characters wide with thousands
# separators and four decimals; NumPy floats are shown with three decimals.
pd.set_option("display.float_format", "{:20,.4f}".format)
np.set_printoptions(formatter={"float": "{0:0.3f}".format})

In [43]:
## Bring in Data
# Read the full `review_text` table from the local SQLite file into a DataFrame.
# The try/finally guarantees the connection is closed even when the query
# raises (e.g. the table is missing), instead of leaking the handle.
con = sqlite3.connect("pitchfork-data.db")
try:
    review_data = pd.read_sql_query("SELECT * from review_text", con)
finally:
    con.close()

In [44]:
# Make sure the score column is numeric (float64) before any arithmetic on it.
review_data["score"] = review_data["score"].astype("float64")

In [45]:
# Average score across all reviews.
review_data.loc[:, "score"].mean()

7.018776620918074

In [46]:
# Preview the first five rows of the loaded table.
review_data.head(n=5)

Unnamed: 0,abstract,author,author_type,link,review,score
0,"A decade after their last album, Grandaddy pic...",Ian Cohen,Contributor,http://www.pitchfork.com/reviews/albums/22950-...,Some Grandaddy songs are about technology. Nea...,6.0
1,The new album from New York’s Immolation is a ...,Saby Reyes-Kulkarni,Contributor,http://www.pitchfork.com/reviews/albums/22956-...,"In the early ’90s, death metal luminaries like...",7.7
2,The 20th anniversary remaster of Smith's final...,Matt LeMay,Contributor,http://www.pitchfork.com/reviews/albums/22947-...,About two minutes into Either/Or opener “Speed...,10.0
3,"Yoni Wolf's prog-rap project is rejuvenated, a...",Ian Cohen,Contributor,http://www.pitchfork.com/reviews/albums/22945-...,The end of WHY? had never been too far from Yo...,7.7
4,The Brazilian songwriter Erasmo Carlos remains...,Michael J. Agovino,Contributor,http://www.pitchfork.com/reviews/albums/22908-...,"Over the last half century, few countries have...",8.0


In [148]:
from sklearn.model_selection import train_test_split

# Seed passed as `random_state` to the train/test split further down.
seed = 123

# Model input: the short abstract of each review.
x = review_data["abstract"]

# Target: the review score rounded to an integer and used as a class label.
# A binary target (score >= 7.1) used to be assigned to Y just before this
# line and was overwritten immediately; that dead assignment is removed.
# NOTE: Series.round() rounds exact halves to the nearest even integer
# (e.g. 6.5 -> 6, 7.5 -> 8).
Y = review_data["score"].round().astype(int)

In [149]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [150]:
import json
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer

# Vocabulary cap handed to the Tokenizer; the same value is reused later as
# the width of the model's input layer.
max_words = 10000

# Build the tokenizer and learn word frequencies / word IDs from the review
# abstracts held in `x`.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x)

In [151]:
# `word_index` is the tokenizer's word -> integer ID mapping.
dictionary = tokenizer.word_index

# Persist the mapping as JSON so the same IDs can be reused outside this
# notebook.
with open('dictionary.json', 'w') as dictionary_file:
    dictionary_file.write(json.dumps(dictionary))

In [152]:
def convert_text_to_index_array(text):
    """Translate one text into the list of word IDs stored in ``dictionary``.

    The text is split into tokens with ``kpt.text_to_word_sequence`` and each
    token is looked up in the module-level ``dictionary``.  The result has one
    ID per token, so its length follows the text; nothing is padded or
    truncated here.

    Raises:
        KeyError: if a token is not a key of ``dictionary``.
    """
    tokens = kpt.text_to_word_sequence(text)
    return [dictionary[token] for token in tokens]

# Convert every abstract into its list of word IDs.
# The lists have different lengths, so they are kept as a plain Python list:
# wrapping a ragged list in np.asarray() raises ValueError on NumPy >= 1.24,
# and sequences_to_matrix() accepts a list of sequences directly.
allWordIndices = [convert_text_to_index_array(text) for text in x]

# Binary bag-of-words matrix: one row per abstract, one column per word ID,
# with 1 marking that the word occurs in the abstract.
clean_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')

In [155]:
# Map each distinct target value in Y to a consecutive integer class index.
encoder = LabelEncoder().fit(Y)
encoded_Y = encoder.transform(Y)

# Expand the class indices into one-hot rows for the softmax output layer.
dummy_y = np_utils.to_categorical(encoded_Y)

In [158]:
# Hold out 20% of the rows for testing; `seed` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    clean_x,
    dummy_y,
    test_size=0.2,
    random_state=seed,
)

## Multiclass tutorial

In [161]:
# define baseline model

# Dropout is used below but no earlier cell imports it (only Dense is
# imported), so on a fresh kernel this cell raised NameError.
from keras.layers import Dropout

# The output layer must have one unit per one-hot column of the target;
# take the width from y_train instead of hard-coding it.
n_classes = y_train.shape[1]

# create model: two hidden Dense layers, each followed by 50% dropout,
# ending in a softmax over the score classes.
model = Sequential()
model.add(Dense(512, input_shape=(max_words,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(n_classes, activation='softmax'))

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [162]:
# Train for five epochs in mini-batches of 32, with 10% of the training rows
# held out for validation.
model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
    shuffle=True,
    verbose=1,
)

Train on 12170 samples, validate on 1353 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x16b2b6590>

In [163]:
from sklearn.metrics import mean_absolute_error
# Predict once and reuse the probabilities for both metrics below.
test_probabilities = model.predict(X_test)

# Element-wise MAE between the one-hot targets and the predicted class
# probabilities. Kept for continuity, but note it is NOT in score units.
score = mean_absolute_error(y_test, test_probabilities)

# MAE in rounded-score points: take the most probable class for each row and
# map class indices back to the original integer scores via `encoder`.
predicted_scores = encoder.inverse_transform(np.argmax(test_probabilities, axis=1))
actual_scores = encoder.inverse_transform(np.argmax(y_test, axis=1))
score_in_points = mean_absolute_error(actual_scores, predicted_scores)

score, score_in_points

In [176]:
# Predicted class probabilities for test row 101.
model.predict(X_test)[101, :]

array([0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.985, 0.014,
       0.000, 0.000], dtype=float32)

In [177]:
# One-hot target for the same test row (101), for comparison.
y_test[101, :]

array([0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 1.000, 0.000,
       0.000, 0.000])