In [0]:
from google.colab import files
files.upload()

{}

In [0]:
!unzip train_posters.zip
!unzip test_posters.zip

Archive:  train_posters.zip
replace train_posters/100114.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
Archive:  test_posters.zip
replace test_posters/100388.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import os
import cv2
import matplotlib.pyplot as plt

# Read csv data
# Reorder the labels to match the order of the images

csv_data = pd.read_csv('train_data.csv').as_matrix()

csv_data = csv_data[csv_data[:,1].argsort()] # csv_data[:,1].argsort() returns indices that sort movies by imdb # 
genres = csv_data[:,-1]


test_data_csv = pd.read_csv('test_data.csv').as_matrix()

# One-hot encode genres column

train_labels = to_categorical(np.array(genres))
print("Label for first training example: {}".format(genres[0]))
print("One-hot encoded label for first training example: {}".format(train_labels[0]))

Using TensorFlow backend.


Label for first training example: 3
One-hot encoded label for first training example: [0. 0. 0. 1.]


In [0]:
train_data = 'train_posters'
test_data = 'test_posters'

def preprocess_training_data():
    train_images = []
    image_num = 0

    for ind,i in enumerate(csv_data[:,1]):

        path = os.path.join(train_data,str(i) + ".jpg")
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(img, (64,64))

        train_images.append(np.array(img)/255)
    return train_images


def preprocess_test_data():
    test_images = []
    for ind,i in enumerate(test_data_csv[:,1]):

        path = os.path.join(test_data,str(i) + ".jpg")
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(img, (64,64))
        test_images.append(np.array(img)/255)
            
    return test_images
    
preprocessed_train = preprocess_training_data()
preprocessed_test = preprocess_test_data()


x_train = np.array(preprocessed_train).reshape(-1,64,64,1)
y_train = train_labels

x_test = np.array(preprocessed_test).reshape(-1,64,64,1)

In [0]:
# Display training example #1371 Death Note, a well-acclaimed Hollywood adaption of the Death Note anime (jk)
# Display movie poster and associated label and title

# Feel free to change the train_ind and see how the preprocessing affect the images
train_ind = 1371
plt.imshow(preprocessed_train[train_ind])
print(csv_data[:,1][train_ind])
print(y_train[train_ind])
print(csv_data[:,(3,4,5)])

In [0]:
# Displaying test example #200 
# Remember there is no genre label or title associated with this image
# We are trying to predict the labels! 

plt.imshow(preprocessed_test[200])
print("Test example #200")

In [0]:
# Naive Bayes
train_title = csv_data[:,2]
test_title = test_data_csv[:,2]

from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
    ])
text_clf.fit(train_title, genre_list)
text_clf.predict(test_title)

array([3, 1, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 1, 3, 2, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 1, 3,
       3, 3, 2, 3, 3, 3, 0, 3, 3, 3, 2, 3, 0, 0, 1, 2, 2, 2, 3, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 0, 3, 3, 3,
       2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 0,
       3, 3, 3, 3, 3, 1, 3, 1, 3, 3, 3, 2, 3, 3, 2, 0, 3, 3, 0, 3, 3, 1,
       3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 0, 3, 3, 3, 2, 0, 1, 3, 3, 3,
       3, 3, 3, 3, 1, 3, 1, 0, 3, 0, 1, 3, 3, 3, 1, 3, 3, 3, 3, 0, 0, 3,
       1, 0, 3, 0, 3, 3, 3, 2, 3, 3, 2, 3, 1, 2, 3, 3, 2, 0, 2, 2, 2, 3,
       3, 2, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 2, 3, 3, 2, 3, 1, 3, 3, 3, 1,
       3, 3, 1, 1, 3, 3, 3, 2, 3, 3, 0, 3, 3, 1, 3, 1, 3, 3, 3, 1, 3, 0,
       0, 3, 3, 3, 0, 3, 3, 3, 3, 3, 0, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 0,
       3, 3, 3, 3, 1, 1, 0, 0, 3, 3, 0, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3,
       3, 2, 3, 2, 3, 0, 3, 2, 3, 3, 3, 3, 1, 3, 3,

In [0]:
# Naive Bayes with Cross Validation

import random
import numpy as np

test_list = list(range(3094))
#print(test_list)
train_list = list(range(3094))
#print(train_list)

test_rand = random.sample(test_list, 343)
train_rand = list(set(train_list) - set(test_rand))

#print(test_rand)
#print(train_rand)

test_genre = csv_data[test_rand,-1]
train_genre = csv_data[train_rand,-1]
#print(test_genre)
#print(train_genre)

test_title_cross = csv_data[test_rand,2]
train_title_cross = csv_data[train_rand,2]

test_genre = genres[test_rand].tolist()
train_genre = genres[train_rand].tolist()

from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
    ])
text_clf.fit(train_title_cross, train_genre)
bayes_predict = text_clf.predict(test_title_cross)

# Check accuracy
import numpy as np

docs_test = test_title_cross
predicted = text_clf.predict(docs_test)
acc = np.mean(predicted == test_genre)  

print('Bayes Accuracy =',acc)
print('Bayes prediction:',bayes_predict)

# Decide not to use Naive Bayes because it made the accuracy worse

Bayes Accuracy = 0.40524781341107874
Bayes prediction: [3 3 2 3 3 3 3 3 3 3 3 3 3 3 0 3 3 0 3 0 3 0 3 3 2 1 0 3 3 3 3 1 3 0 3 3 2
 2 3 3 3 3 3 1 0 3 2 3 3 2 3 3 0 3 3 3 1 3 3 3 0 3 0 0 3 0 3 3 3 3 2 3 1 3
 3 3 3 0 2 1 0 2 3 3 3 3 0 2 3 0 3 3 0 3 3 2 3 3 3 2 3 3 3 3 2 3 3 1 3 3 3
 0 3 3 1 0 3 2 3 3 0 3 3 3 0 2 3 0 3 3 3 2 3 2 3 3 0 1 1 3 0 3 3 3 3 3 0 0
 1 3 3 3 0 3 3 3 2 2 3 2 3 3 3 2 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 2 2 3 0 3 1 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 0 3 1 3 3 3 3 2 3
 0 3 3 3 2 3 3 3 3 1 3 3 3 3 3 3 1 3 3 3 3 3 3 3 2 2 3 3 3 3 2 3 3 3 3 2 2
 3 2 3 3 2 3 2 3 3 3 0 2 3 3 3 3 3 3 2 3 3 3 1 1 3 2 3 1 3 3 3 3 3 3 1 3 3
 3 3 2 2 2 2 0 3 3 3 3 3 0 0 3 3 2 3 3 3 3 3 3 3 3 2 2 3 1 3 2 0 2 3 3 1 3
 3 3 3 3 0 3 3 3 3 2]
