In [None]:
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Lambda
from keras.layers import LSTM
from keras.models import Sequential
from keras.utils import np_utils

import numpy as np
import os
import pandas as pd
import string
import sys
import tensorflow as tf
import time

In [None]:
# Load the Kickstarter "most backed" projects table into a DataFrame
data = pd.read_csv(filepath_or_buffer="../input/kickstarter-project-statistics/most_backed.csv")

In [None]:
# Characters that clean_strings deletes from every text field: ASCII punctuation,
# the non-breaking space (\xa0), the \x03 control character, newlines, and the
# non-ASCII symbols/letters listed literally below (seen in the project blurbs).
punctuation = string.punctuation + "\xa0" + "\x03" + "\n" + "£©«®°²³´¹º»½Ç×Üàáäåæèéïöøüēπφ​‒–—‘’“”•…℃™Ⓡ◡★♥　ヒロー世浮絵️！"

In [None]:
# function to clean the text fields (blurb, title, category) of the dataset
def clean_strings(str_list, chars_to_remove=None):
    """Normalise an iterable of strings for character-level modelling.

    Each entry is processed in this order:
      1. leading/trailing whitespace is stripped,
      2. every character contained in ``chars_to_remove`` is deleted,
      3. the remainder is lower-cased.

    Deletion happens before lower-casing, so an upper-case letter listed in
    ``chars_to_remove`` is removed while its lower-case form is kept unless it
    is listed as well. Whitespace that only becomes leading/trailing after the
    deletion step is not stripped again.

    Args:
        str_list: iterable of ``str`` values (e.g. a DataFrame column).
        chars_to_remove: optional string whose characters are deleted from
            every entry. When omitted, the module-level ``punctuation``
            string is used, which matches the previous behaviour.

    Returns:
        A new list of cleaned strings, in the same order and of the same
        length as the input.

    Raises:
        AttributeError: if an entry is not a string (it has no ``strip``).
    """
    if chars_to_remove is None:
        chars_to_remove = punctuation
    # Build the deletion table once and reuse it for every entry
    table = str.maketrans("", "", chars_to_remove)
    return [entry.strip().translate(table).lower() for entry in str_list]

# Apply the same cleaning to the blurb, title and category columns
cleaned_blurb, cleaned_name, cleaned_category = (
    clean_strings(data[column]) for column in ("blurb", "title", "category")
)

In [None]:
# Build one string per project of the form "<title> <category> " (trailing space kept).
# NOTE(review): the cleaned blurbs are not part of these strings — confirm that is intended.
new_list = [name + " " + category + " " for name, category in zip(cleaned_name, cleaned_category)]

In [None]:
# Concatenate all per-project strings into a single training corpus.
# Each entry already ends with a space and entries are joined with a space,
# so consecutive projects end up separated by two spaces.
joined_text = " ".join(new_list)

In [None]:
# Vocabulary: every distinct character of the corpus in sorted order, plus a
# lookup from each character to its position in that ordering
chars = sorted(set(joined_text))
char_to_int = {ch: idx for idx, ch in enumerate(chars)}

In [None]:
# Corpus length in characters and vocabulary size (both are used below)
num_ch, num_ch_unique = len(joined_text), len(chars)

In [None]:
# Build the training set: every window of 50 consecutive characters is an
# input, and the character immediately after the window is its target
seq_length = 50
dataX, dataY = [], []
for offset in range(num_ch - seq_length):
    window_end = offset + seq_length
    window = joined_text[offset:window_end]
    # store the window and its target as integer codes
    dataX.append([char_to_int[ch] for ch in window])
    dataY.append(char_to_int[joined_text[window_end]])

In [None]:
# Number of training windows (input sequences) that were generated
n_patterns = len(dataX)

In [None]:
# Arrange the integer-encoded windows as (samples, timesteps, 1 feature) for
# Keras, then divide by the vocabulary size so every input value lies in [0, 1)
X = np.reshape(dataX, (n_patterns, seq_length, 1)) / float(num_ch_unique)

In [None]:
# One-hot encode the integer target characters so they can be trained
# against a softmax output with categorical cross-entropy
y = np_utils.to_categorical(dataY)

In [1]:
# Now, time for the model!
# Three stacked 256-unit LSTM layers, each followed by 30% dropout; the sizes
# and rates are more arbitrary than anything.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.3),
    LSTM(256, return_sequences=True),
    Dropout(0.3),
    LSTM(256),
    Dropout(0.3),
    # Halve the activations (intended as a temperature reduction).
    # NOTE(review): this scales the input of the Dense layer rather than its
    # logits, so the Dense weights can compensate during training — confirm intent.
    Lambda(lambda x: x / 2),
    # One softmax output per target class
    Dense(y.shape[1], activation="softmax"),
])
# adam optimizer for speed
model.compile(loss="categorical_crossentropy", optimizer="adam")

NameError: name 'Sequential' is not defined

In [None]:
# Write a checkpoint whenever the training loss reaches a new minimum, so the
# best weights can be chosen afterwards (and nothing is lost when the kernel crashes)
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor="loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Train for 98 epochs in mini-batches of 64, checkpointing through callbacks_list
model.fit(x=X, y=y, batch_size=64, epochs=98, callbacks=callbacks_list)

In [None]:
# Commented out, but can be used to load in the best weight model to date instead of retraining
# fname = "../input/weights-updated/weights-improvement-41-1.6265.hdf5"
# model.load_weights(fname)
# model.compile(loss="categorical_crossentropy", optimizer="adam")

In [None]:
# Inverse of char_to_int: needed to turn integer codes / predicted class
# indices back into characters
int_to_char = {idx: ch for ch, idx in char_to_int.items()}

# Pick a random training window as the seed. randint's upper bound is
# exclusive, so len(dataX) makes every window (including the last) eligible.
start = np.random.randint(0, len(dataX))
# Work on a copy so that extending the pattern never modifies dataX itself
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# Generate 1000 characters, one at a time
for _ in range(1000):
    # Same shape and scaling as the training inputs: (1, timesteps, 1), divided by vocabulary size
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(num_ch_unique)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: take the most probable next character
    index = int(np.argmax(prediction))
    sys.stdout.write(int_to_char[index])
    # Slide the window: drop the oldest character, append the new one
    pattern = pattern[1:] + [index]
print("\nDone.")