In [1]:
import pandas as pd
import numpy as np
import sqlite3
pd.options.display.float_format = '{:20,.4f}'.format
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

## Bring in data from the web scraper

In [2]:
## Bring in Data
con = sqlite3.connect("pitchfork-data.db")
review_data = pd.read_sql_query("SELECT * from review_text", con)
con.close()

In [3]:
review_data["score"] = review_data["score"].astype(float)
review_data["score"].mean()

7.03038379530912

In [4]:
review_data.head()

Unnamed: 0,abstract,author,author_type,link,review,score
0,"A decade after their last album, Grandaddy pic...",Ian Cohen,Contributor,http://www.pitchfork.com/reviews/albums/22950-...,Some Grandaddy songs are about technology. Nea...,6.0
1,The new album from New York’s Immolation is a ...,Saby Reyes-Kulkarni,Contributor,http://www.pitchfork.com/reviews/albums/22956-...,"In the early ’90s, death metal luminaries like...",7.7
2,The 20th anniversary remaster of Smith's final...,Matt LeMay,Contributor,http://www.pitchfork.com/reviews/albums/22947-...,About two minutes into Either/Or opener “Speed...,10.0
3,"Yoni Wolf's prog-rap project is rejuvenated, a...",Ian Cohen,Contributor,http://www.pitchfork.com/reviews/albums/22945-...,The end of WHY? had never been too far from Yo...,7.7
4,The Brazilian songwriter Erasmo Carlos remains...,Michael J. Agovino,Contributor,http://www.pitchfork.com/reviews/albums/22908-...,"Over the last half century, few countries have...",8.0


## Split Data into X and Y and Preprocess

In [5]:
x = review_data["review"]
Y = review_data["score"]

In [6]:
import json
import keras
from keras.preprocessing.text import Tokenizer

# only work with the n most popular words found in our dataset
max_words = 5000

# create a new Tokenizer and feed reviews to it
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x)

Using TensorFlow backend.


In [7]:
# Tokenizers come with a convenient list of words and IDs
# Save it so we can reference it later
dictionary = tokenizer.word_index
with open('dictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)

In [8]:
import keras.preprocessing.text as kpt

def convert_text_to_index_array(text):
    # one really important thing that `text_to_word_sequence` does
    # is make all texts the same length -- in this case, the length
    # of the longest text in the set.
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]

# for each review, change each token to its ID in the Tokenizer's word_index 
allWordIndices = []
for text in x:
    wordIndices = convert_text_to_index_array(text)
    allWordIndices.append(wordIndices)

# now we have a list of all tweets converted to index arrays.
# cast as an array for future usage.
allWordIndices = np.asarray(allWordIndices)

# create one-hot matrices out of the indexed tweets
clean_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')

In [9]:
from sklearn.model_selection import train_test_split
seed = 123
X_train, X_test, y_tr_raw, y_te_raw = train_test_split(clean_x, Y, test_size=0.2, random_state=seed)

## Regression


In [151]:
from keras.models import Sequential
from keras.layers import Dense

def regression_model(activation, kernal):
    regressor = Sequential()
    regressor.add(Dense(512, activation=activation, kernel_initializer=kernal, input_shape=(max_words,)))
    regressor.add(Dense(256, activation=activation, kernel_initializer=kernal))
    regressor.add(Dense(12, activation=activation, kernel_initializer=kernal))
    regressor.add(Dense(1, activation=activation, kernel_initializer=kernal))
    regressor.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mae'])
    return regressor

regression_model("linear", "uniform").fit(X_train, y_tr_raw,
  batch_size=32,
  epochs=5,
  verbose=1,
  validation_split=0.25,
  shuffle=True)

Train on 10692 samples, validate on 3565 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x13579ad50>

In [155]:
scored = regression_model("linear", "uniform").predict(X_test)

In [156]:
scored

array([[0.034],
       [-0.001],
       [0.017],
       ..., 
       [0.011],
       [0.001],
       [0.007]], dtype=float32)

In [158]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_te_raw, scored)

7.0295280350707232

## Multiclass tutorial

In [10]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

def myround(x, prec=1, base=.5):
    return round(base * round(float(x)/base),prec)

#Convert to multi-class dummy variables 
y_tr, y_te = [myround(x) for x in pd.Series(y_tr_raw)],[myround(x) for x in y_te_raw]

In [11]:
y_train = np.array(pd.get_dummies(y_tr))
y_test = np.array(pd.get_dummies(y_te))

In [12]:
len(y_train[0]), len(y_test[0])

(21, 21)

In [26]:
# create model
model = Sequential()
model.add(Dense(512, kernel_initializer='glorot_normal', input_shape=(max_words,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, kernel_initializer='glorot_normal', activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(len(y_train[0]), kernel_initializer='glorot_normal', activation='softmax'))
# Compile model
model.compile(loss='categorical_crossentropy', optimizer='Nadam', metrics=['accuracy'])

In [28]:
model.fit(X_train, y_train,
  batch_size=32,
  epochs=10,
  verbose=1,
  validation_split=0.25,
  shuffle=True)

Train on 10692 samples, validate on 3565 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x18dc4ad50>

In [29]:
predictions = model.predict(X_test)

In [30]:
def scoring_function(prediction):
    arg_max = (prediction.argmax())
    score = (prediction.argmax()+1.0)/2 - \
                (prediction[:arg_max].sum()*3) + \
                (prediction[arg_max+1:].sum()*2)
    return round(score, 1)

In [31]:
scored = pd.Series([scoring_function(x) for x in predictions])

In [32]:
scored.mean()

7.4592426367461435

In [33]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_te_raw, scored)

0.79618513323983164

## Grid Search

In [20]:
# Use scikit-learn to grid search the weight initialization
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# Function to create model, required for KerasClassifier
def create_model(init_mode='uniform', optimizer = 'adam'):
   # create model
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,), kernel_initializer=init_mode, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, kernel_initializer=init_mode, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(len(y_train[0]), kernel_initializer=init_mode, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

model = KerasClassifier(build_fn=create_model, epochs=3, batch_size=10, verbose=2)

In [21]:
# define the grid search parameters
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 
             'glorot_uniform', 'he_normal', 'he_uniform']

optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']

param_grid = dict(init_mode=init_mode, optimizer = optimizer)

In [None]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Epoch 1/3
Epoch 1/3
Epoch 1/3
Epoch 1/3
Epoch 1/3
 - 27s - loss: 2.2673 - acc: 0.2009
Epoch 2/3
Epoch 1/3
 - 32s - loss: 2.2619 - acc: 0.1948
Epoch 2/3
 - 39s - loss: 2.2642 - acc: 0.2031
Epoch 2/3
Epoch 1/3
Epoch 1/3
 - 52s - loss: 2.2388 - acc: 0.2042
Epoch 3/3
 - 56s - loss: 2.2312 - acc: 0.2019
Epoch 3/3
 - 60s - loss: 2.2329 - acc: 0.1976
Epoch 3/3
 - 98s - loss: 2.1473 - acc: 0.2319
Epoch 2/3
 - 109s - loss: 2.1529 - acc: 0.2270
Epoch 2/3
 - 66s - loss: 2.2323 - acc: 0.1977
 - 116s - loss: 2.1442 - acc: 0.2397
Epoch 2/3
 - 65s - loss: 2.2286 - acc: 0.2059
 - 110s - loss: 2.1347 - acc: 0.2301
Epoch 2/3
 - 63s - loss: 2.2323 - acc: 0.2076
 - 111s - loss: 2.1484 - acc: 0.2263
Epoch 2/3
 - 93s - loss: 1.9179 - acc: 0.3198
Epoch 3/3
 - 85s - loss: 1.9431 - acc: 0.3131
Epoch 3/3
 - 69s - loss: 1.7751 - acc: 0.3527
Epoch 3/3
 - 78s - loss: 1.9081 - acc: 0.3292
Epoch 3/3
Epoch 1/3
 - 65s - loss: 1.7681 - acc: 0.3590
Epoch 3/3
Epoch 1/3
Epoch 1/3
 - 82s - loss: 1.7586 - acc: 0.3899
 - 89s

Epoch 3/3
Epoch 1/3
Epoch 1/3
Epoch 1/3
Epoch 1/3
 - 85s - loss: 2.2228 - acc: 0.2156
Epoch 2/3
 - 73s - loss: 2.2510 - acc: 0.2036
Epoch 2/3
 - 98s - loss: 1.6661 - acc: 0.4109
 - 81s - loss: 2.2417 - acc: 0.2067
Epoch 2/3
 - 83s - loss: 2.2451 - acc: 0.2102
Epoch 2/3
 - 143s - loss: 2.1811 - acc: 0.2272
Epoch 2/3
 - 148s - loss: 2.2150 - acc: 0.2048
Epoch 2/3
 - 64s - loss: 2.2309 - acc: 0.2000
Epoch 3/3
 - 62s - loss: 2.2249 - acc: 0.2059
Epoch 3/3
 - 160s - loss: 2.1737 - acc: 0.2204
Epoch 2/3
 - 107s - loss: 1.9450 - acc: 0.3102
Epoch 3/3
 - 61s - loss: 2.2292 - acc: 0.2058
Epoch 3/3
Epoch 1/3
 - 62s - loss: 2.2296 - acc: 0.1999
 - 63s - loss: 2.2255 - acc: 0.2050
 - 62s - loss: 2.2280 - acc: 0.2101
 - 95s - loss: 1.6634 - acc: 0.4073
 - 140s - loss: 1.8360 - acc: 0.3208
Epoch 3/3
 - 135s - loss: 1.8998 - acc: 0.2885
Epoch 3/3
Epoch 1/3
 - 97s - loss: 2.2918 - acc: 0.2098
Epoch 2/3
Epoch 1/3
 - 124s - loss: 1.8289 - acc: 0.3231
Epoch 3/3
Epoch 1/3
Epoch 1/3
 - 82s - loss: 2.3014 -

In [24]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.362909 using {'init_mode': 'glorot_normal', 'optimizer': 'Nadam'}


In [25]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.206986 (0.022439) with: {'init_mode': 'uniform', 'optimizer': 'SGD'}
0.306446 (0.013955) with: {'init_mode': 'uniform', 'optimizer': 'RMSprop'}
0.333731 (0.008312) with: {'init_mode': 'uniform', 'optimizer': 'Adagrad'}
0.309743 (0.008006) with: {'init_mode': 'uniform', 'optimizer': 'Adadelta'}
0.355825 (0.009553) with: {'init_mode': 'uniform', 'optimizer': 'Adam'}
0.329382 (0.006198) with: {'init_mode': 'uniform', 'optimizer': 'Adamax'}
0.337869 (0.008821) with: {'init_mode': 'uniform', 'optimizer': 'Nadam'}
0.203128 (0.017587) with: {'init_mode': 'lecun_uniform', 'optimizer': 'SGD'}
0.314091 (0.006023) with: {'init_mode': 'lecun_uniform', 'optimizer': 'RMSprop'}
0.330294 (0.004184) with: {'init_mode': 'lecun_uniform', 'optimizer': 'Adagrad'}
0.308129 (0.013032) with: {'init_mode': 'lecun_uniform', 'optimizer': 'Adadelta'}
0.355685 (0.009150) with: {'init_mode': 'lecun_uniform', 'optimizer': 'Adam'}
0.323701 (0.003333) with: {'init_mode': 'lecun_uniform', 'optimizer': 'Adamax'}
0.354