In [18]:
import pandas as pd
import numpy as np
import sqlite3
from comet_ml import Experiment
import pickle
pd.options.display.float_format = '{:20,.4f}'.format
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

## Bring in data from the web scraper

In [2]:
## Bring in Data
con = sqlite3.connect("pitchfork-data.db")
review_data = pd.read_sql_query("SELECT * from review_text", con)
con.close()

In [3]:
review_data["score"] = review_data["score"].astype(float)
review_data["score"].mean()

7.03038379530912

In [4]:
review_data.head()

Unnamed: 0,abstract,author,author_type,link,review,score
0,"A decade after their last album, Grandaddy pic...",Ian Cohen,Contributor,http://www.pitchfork.com/reviews/albums/22950-...,Some Grandaddy songs are about technology. Nea...,6.0
1,The new album from New York’s Immolation is a ...,Saby Reyes-Kulkarni,Contributor,http://www.pitchfork.com/reviews/albums/22956-...,"In the early ’90s, death metal luminaries like...",7.7
2,The 20th anniversary remaster of Smith's final...,Matt LeMay,Contributor,http://www.pitchfork.com/reviews/albums/22947-...,About two minutes into Either/Or opener “Speed...,10.0
3,"Yoni Wolf's prog-rap project is rejuvenated, a...",Ian Cohen,Contributor,http://www.pitchfork.com/reviews/albums/22945-...,The end of WHY? had never been too far from Yo...,7.7
4,The Brazilian songwriter Erasmo Carlos remains...,Michael J. Agovino,Contributor,http://www.pitchfork.com/reviews/albums/22908-...,"Over the last half century, few countries have...",8.0


## Split Data into X and Y and Preprocess

In [5]:
x = review_data["review"]
Y = review_data["score"]

In [6]:
import json
import keras
from keras.preprocessing.text import Tokenizer

# only work with the n most popular words found in our dataset
max_words = 10000

# create a new Tokenizer and feed reviews to it
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x)

Using TensorFlow backend.


In [7]:
# Tokenizers come with a convenient list of words and IDs
# Save it so we can reference it later
dictionary = tokenizer.word_index
with open('dictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)

In [8]:
import keras.preprocessing.text as kpt

def convert_text_to_index_array(text):
    # one really important thing that `text_to_word_sequence` does
    # is make all texts the same length -- in this case, the length
    # of the longest text in the set.
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]

# for each review, change each token to its ID in the Tokenizer's word_index 
allWordIndices = []
for text in x:
    wordIndices = convert_text_to_index_array(text)
    allWordIndices.append(wordIndices)

# now we have a list of all tweets converted to index arrays.
# cast as an array for future usage.
allWordIndices = np.asarray(allWordIndices)

# create one-hot matrices out of the indexed tweets
clean_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')

In [9]:
from sklearn.model_selection import train_test_split
seed = 123
X_train, X_test, y_tr_raw, y_te_raw = train_test_split(clean_x, Y, test_size=0.2, random_state=seed)

## Regression


In [10]:
from keras.models import Sequential
from keras.layers import Dense
experiment = Experiment(api_key="GkMDLZGX3LFP8DPGfIBtYqzcV")

def regression_model(activation, kernal):
    regressor = Sequential()
    regressor.add(Dense(512, activation=activation, kernel_initializer=kernal, input_shape=(max_words,)))
    regressor.add(Dense(256, activation=activation, kernel_initializer=kernal))
    regressor.add(Dense(12, activation=activation, kernel_initializer=kernal))
    regressor.add(Dense(1, activation=activation, kernel_initializer=kernal))
    regressor.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mae'])
    return regressor

regression_model("linear", "uniform").fit(X_train, y_tr_raw,
  batch_size=32,
  epochs=5,
  verbose=1,
  validation_split=0.25,
  shuffle=True)



Comet.ml support for Ipython Notebook is limited at the moment, automatic monitoring and stdout capturing is deactivated

Experiment is live on comet.ml https://www.comet.ml/timdmulligan/general/38783a6e047847f4979bc93e9cfc9ad1

Train on 10692 samples, validate on 3565 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x12b705d10>

In [155]:
scored = regression_model("linear", "uniform").predict(X_test)

In [156]:
scored

array([[0.034],
       [-0.001],
       [0.017],
       ..., 
       [0.011],
       [0.001],
       [0.007]], dtype=float32)

In [158]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_te_raw, scored)

7.0295280350707232

## Multiclass tutorial

In [16]:
from sklearn.preprocessing import LabelEncoder

from keras.utils import np_utils

def myround(x, prec=1, base=.5):
    return round(base * round(float(x)/base),prec)

#Convert to multi-class dummy variables 
y_tr, y_te = [myround(x) for x in pd.Series(y_tr_raw)],[myround(x) for x in y_te_raw]

In [None]:
output = open('data/y_tr.pkl', 'wb')
pickle.dump(y_tr, output)
output = open('data/X_train.pkl', 'wb')
pickle.dump(X_train, output)
output.close()

In [50]:
y_train = np.array(pd.get_dummies(y_tr))
y_test = np.array(pd.get_dummies(y_te))

In [51]:
len(y_train[0]), len(y_test[0])

(21, 21)

In [52]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# create model
model = Sequential()
model.add(Dense(512, kernel_initializer='glorot_normal',
                input_shape=(max_words,),
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, kernel_initializer='glorot_normal', activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(len(y_train[0]), kernel_initializer='glorot_normal', activation='softmax'))
# Compile model
model.compile(loss='categorical_crossentropy', optimizer='Nadam', metrics=['accuracy'])

In [53]:
model.fit(X_train, y_train,
  batch_size=32,
  epochs=5,
  verbose=1,
  validation_split=0.25,
  shuffle=True)

Train on 10692 samples, validate on 3565 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x135974f50>

In [54]:
predictions = model.predict(X_test)

In [55]:
def scoring_function(prediction):
    arg_max = (prediction.argmax())
    score = (prediction.argmax()+1.0)/2 - \
                (prediction[:arg_max].sum()*2) + \
                (prediction[arg_max+1:].sum()*2)
    return round(score, 1)

In [56]:
scored = pd.Series([scoring_function(x) for x in predictions])

In [57]:
scored.mean()

7.526816269284707

In [58]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_te_raw, scored)

0.77102384291725101

In [59]:
model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

model.save_weights('model.h5')

## Grid Search

In [12]:
# Use scikit-learn to grid search the weight initialization
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# Function to create model, required for KerasClassifier
def create_model(init_mode='uniform', optimizer = 'adam'):
   # create model
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,), kernel_initializer=init_mode, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, kernel_initializer=init_mode, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(len(y_train[0]), kernel_initializer=init_mode, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

model = KerasClassifier(build_fn=create_model, epochs=3, batch_size=32, verbose=0)

In [13]:
# define the grid search parameters
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 
             'glorot_uniform', 'he_normal', 'he_uniform']

# optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']

param_grid = dict(init_mode = init_mode)

In [None]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Process PoolWorker-1:
Process PoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
    self.run()
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
    self._target(*self._args, **self._kwargs)
  File "//anaconda/lib/python2.7/multiprocessing/pool.py", line 102, in worker
  File "//anaconda/lib/python2.7/multiprocessing/pool.py", line 102, in worker
    task = get()
    task = get()
  File "//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 362, in get
  File "//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
Process PoolWorker-6:
Process PoolWorker-5:


Traceback (most recent call last):
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 1132, in get_records
  File "//anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 1132, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "//anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 313, in wrapped
    return f(*args, **kwargs)
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "//anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 313, in wrapped
  File "//anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 358, in _fixed_getinnerframes
    return f(*args, **kwargs)
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "//anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 358, in _fixed_getinnerframes
  File "//anaconda/lib/python2.7/i


KeyboardInterrupt
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.





Traceback (most recent call last):


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 1132, in get_records
  File "//anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 1132, in get_records
  File "//anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 1132, in get_records
  File "//anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 1132, in get_records
  File "//anaconda/lib/python2.7/site-packages/IPython/core/ultratb.py", line 1132, in get_records
Traceback (most recent call last):
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "//anaconda/lib

IndexError: string index out of range

IndexError: string index out of range

IndexError: string index out of range

IndexError: string index out of range

IndexError: string index out of range

IndexError: string index out of range

ERROR:tornado.general:Uncaught exception, closing connection.
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "//anaconda/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "//anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "//anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "//anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 412, in execute_request
    self._abort_queues()
  File "//anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 628, in _abort_queues
    self._abort_queue(stream)
  File "//anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 634, in _abort_queue
    iden

ERROR:tornado.general:Uncaught exception, closing connection.
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "//anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "//anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "//anaconda/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "//anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "//anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "//anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 412, in execute_re

ERROR:tornado.general:Uncaught exception, closing connection.
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "//anaconda/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "//anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "//anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "//anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 412, in execute_request
    self._abort_queues()
  File "//anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 628, in _abort_queues
    self._abort_queue(stream)
  File "//anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 634, in _abort_queue
    iden

  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
    self._target(*self._args, **self._kwargs)
    self._target(*self._args, **self._kwargs)
    self._target(*self._args, **self._kwargs)
    self._target(*self._args, **self._kwargs)
    self._target(*self._args, **self._kwargs)
  File "//anaconda/lib/python2.7/multiprocessing/pool.py", line 102, in worker
  File "//anaconda/lib/python2.7/multiprocessing/pool.py", line 102, in worker
  File "//anaconda/lib/python2.7/multiprocessing/pool.py", line 102, in worker
  File "//anaconda/lib/python2.7/multiprocessing/pool.py", line 102, in worker
  File "//anaconda/lib/python2.7/multiprocessing/pool.py", line 102, in worker
  File "//ana

Process PoolWorker-29:
Process PoolWorker-31:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
    self.run()
    self.run()
    self.run()
    self.run()
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
    self.run()
  File "//anaconda/lib/python2.7/multiprocessing/process.py

    task = get()
    task = get()
    task = get()
    task = get()
    task = get()
    task = get()
  File "//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
  File "//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 362, in get
  File "//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
  File "//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
  File "//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
  File "//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
    racquire()
    racquire()
    racquire()
    return recv()
    racquire()
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
Process PoolWorker-47:
Process PoolWorker-45:
Process PoolWorker-50:
Process PoolWorker-46:
Process Poo

In [None]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']