In [1]:
import sqlite3 as sql
import numpy as np
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Notebook-wide configuration.
NUM_WORDS = 10000  # vocabulary size handed to the tokenizer / input width of the network
SEED = 113  # seed for the NumPy shuffle before the train/test split
DB_PATH = "movie_lines.db"

# NOTE: the name `data` is rebound to the tokenised sequences further down;
# only `cursor` is used to talk to the database afterwards.
data = sql.connect(DB_PATH)
cursor = data.cursor()

Using TensorFlow backend.


## Gather data

In [2]:
# Collect the dialogue of every movie as one space-joined string per movie.
cursor.execute("SELECT COUNT(movie_id) FROM movies")
count = cursor.fetchone()[0]

# NOTE(review): iterating range(count) assumes movie_id values are exactly
# 0 .. count-1 with no gaps -- confirm against the database schema.
movie_lines = []
for movie_id in range(count):
    # Bind the id as a parameter instead of formatting it into the SQL text.
    cursor.execute("SELECT line_text FROM lines WHERE movie_id = ?", (movie_id,))
    lines = " ".join(row[0] for row in cursor.fetchall())
    movie_lines.append(lines)
    

In [3]:
# Fit a word-level vocabulary on the full corpus; words outside the kept
# vocabulary are mapped to the "<UNK>" token.
tokenizer = Tokenizer(oov_token="<UNK>", num_words=NUM_WORDS)
tokenizer.fit_on_texts(movie_lines)

In [4]:
# One list of word indices per movie. The lists have different lengths, so the
# object array is filled element by element: calling np.array() directly on
# ragged nested lists raises ValueError on recent NumPy releases.
sequences = tokenizer.texts_to_sequences(movie_lines)
data = np.empty(len(sequences), dtype=object)
for row, sequence in enumerate(sequences):
    data[row] = sequence

## Gather labels

In [5]:
# Read every genre name and assign each one an integer class id.
cursor.execute("SELECT name FROM genres")
all_genres = np.array([row[0] for row in cursor.fetchall()])

label_encoder = LabelEncoder()
genre_ints = label_encoder.fit_transform(all_genres)

# Lookup table: genre name -> encoded integer.
genre_dict = dict(zip(all_genres, genre_ints))

In [6]:
# For every movie, fetch the names of the genres linked to it.
# A movie without linked genres yields an empty list.
movie_genres = []
for movie_id in range(count):
    # The movie id is bound as a parameter rather than formatted into the SQL.
    cursor.execute("""
        SELECT genres.name
        FROM movies, genres, movie_genre_linking
        WHERE movies.movie_id = movie_genre_linking.movie_id
          AND genres.genre_id = movie_genre_linking.genre_id
          AND movies.movie_id = ?
        """, (movie_id,))
    genres = [row[0] for row in cursor.fetchall()]
    movie_genres.append(genres)

In [8]:
labels = np.array(movie_genres_int)

In [7]:
movie_genres_int = []
for entry in movie_genres:
    int_encoded = []
    for genre in entry:
        int_encoded.append(genre_dict[genre])
    movie_genres_int.append(int_encoded)

## Randomize and Split

In [9]:
# Based on keras.datasets.imdb implementation of shuffling
# Apply one seeded random permutation to inputs and labels together so that
# each movie stays paired with its genres.
np.random.seed(SEED)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data, labels = data[indices], labels[indices]

In [10]:
# Number of (already shuffled) movies used for training; the remainder is the test set.
TRAIN_SIZE = 462

# Fail early with a clear message instead of silently producing an empty or
# misaligned split.
assert len(data) == len(labels), "data and labels must have one entry per movie"
assert 0 < TRAIN_SIZE < len(data), "TRAIN_SIZE must leave a non-empty test set"

train_data, test_data = data[:TRAIN_SIZE], data[TRAIN_SIZE:]
train_labels, test_labels = labels[:TRAIN_SIZE], labels[TRAIN_SIZE:]

## One-Hot Encode

## Build Network (*VERY* much a prototype)

In [11]:
# From https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/3.5-classifying-movie-reviews.ipynb
def vectorize_sequences(sequences, dimension=NUM_WORDS):
    """Multi-hot encode a collection of integer index sequences.

    Returns a float array of shape (len(sequences), dimension) where row i
    holds 1.0 at every index listed in sequences[i] and 0.0 everywhere else.
    """
    multi_hot = np.zeros((len(sequences), dimension))
    for row, active_indices in enumerate(sequences):
        # Fancy indexing switches on all listed columns of this row at once.
        multi_hot[row, active_indices] = 1.
    return multi_hot

# Inputs: one multi-hot bag-of-words row per movie (default width NUM_WORDS).
x_train, x_test = vectorize_sequences(train_data), vectorize_sequences(test_data)

# Targets: one multi-hot row per movie with one column per genre.
num_genres = len(all_genres)
y_train = vectorize_sequences(train_labels, dimension=num_genres)
y_test = vectorize_sequences(test_labels, dimension=num_genres)

In [12]:
# Sanity check: (samples, vocabulary size) and (samples, number of genres).
print(x_train.shape, y_train.shape, sep="\n")

(462, 10000)
(462, 24)


In [15]:
from keras import models
from keras import layers

# Small fully connected prototype: two hidden ReLU layers and one sigmoid
# output unit per genre column of y_train.
model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(NUM_WORDS,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(y_train.shape[1], activation="sigmoid"),
])






In [16]:
# NOTE(review): with multi-hot targets, 'accuracy' is presumably computed per
# label slot, so a model predicting mostly zeros can still score high --
# confirm against the Keras metrics documentation before trusting the number.
model.compile(
    optimizer='rmsprop',
    loss="binary_crossentropy",
    metrics=['accuracy'],
)



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [28]:
# Train on the training split.
model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=16,
)

# Per-genre sigmoid outputs for the held-out movies.
predictions = model.predict(x_test)

# [loss, accuracy] on the held-out movies; shown as the cell output.
results = model.evaluate(x_test, y_test)
results

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.516097576964286, 0.9002688157942987]

## How to proceed?

1. One-hot encode every genre combination (e.g. ['action', 'drama', 'crime'] is indexed and encoded as a single class)
2. Rework the data to utilize embeddings instead
3. Eliminate all but the first genre entry for each movie
  - **Problem**: Genres are sorted alphabetically (so the first entry is not necessarily the most representative one), and a few movies have no genres (empty lists)
4. Some other way I have not considered?
  - Can densely-connected neural networks predict multiple labels for one entry (i.e. movie)?
  - [This article](https://medium.com/@vijayabhaskar96/multi-label-image-classification-tutorial-with-keras-imagedatagenerator-cd541f8eaf24) suggests using the Keras functional API instead of the Sequential model
  - [This answer](https://stackoverflow.com/a/44165755) seems to be a more intuitive option where predictions are given thresholds

In [None]:
## Other Next Steps

- Consolidate Cornell data generation into a cornell.py module