# Imports

In [1]:
import sys
from google.colab import drive
drive.mount('/content/gdrive')
sys.path.append('/content/gdrive/My Drive/Colab Notebooks/Multi_Label_Text_Classification')
base_dir = 'gdrive/My Drive/Colab Notebooks/Multi_Label_Text_Classification/'

ModuleNotFoundError: No module named 'google.colab'

In [None]:
!pip3 install --quiet "tensorflow>=1.7"
!pip3 install --quiet tensorflow-hub

In [None]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import glob
import functools 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno
import re
import os.path
import math
from sklearn.model_selection import StratifiedShuffleSplit
#from sklearn.cross_validation import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from scipy import sparse
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
#from skmultilearn.problem_transform import LabelPowerset
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import scipy

import nltk
from wordcloud import WordCloud
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import unicodedata

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
nltk.download('wordnet')

sns.set_style("whitegrid")
sns.set_context("talk", font_scale=0.8)

from helper_functions import *
rdm_seed = 29
np.random.seed(rdm_seed)
import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ReduceLROnPlateau
from keras.models import load_model
import keras.optimizers

# MODELING

**Loading the input**

In [None]:
#mydata_train = pd.read_csv('./../Data/preprocessed/movies_genres_train_preprocessed.csv')
#mydata_test = pd.read_csv('./../Data/preprocessed/movies_genres_test_preprocessed.csv')
#mydata = pd.read_csv('../Data/movies_genres.csv', delimiter='\t')

mydata_train = pd.read_csv(base_dir+'Data/preprocessed/movies_genres_train_preprocessed.csv')
mydata_test = pd.read_csv(base_dir+'Data/preprocessed/movies_genres_test_preprocessed.csv')
mydata = pd.read_csv(base_dir+'Data/movies_genres.csv', delimiter='\t')

In [None]:
train_X, train_y = mydata_train['plot'], mydata_train.drop(['title', 'plot', 'plot_lang'], axis=1)
test_X, test_y = mydata_test['plot'], mydata_test.drop(['title', 'plot', 'plot_lang'], axis=1)

category_columns = train_y.columns

In [None]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
embed = hub.Module(module_url)

## Obtain Plot Embedding

In [None]:
# embed_movie_plots(train_X, train_or_test='train')
# embed_movie_plots(test_X, train_or_test='test')

train_files = glob.glob(base_dir+"Data/preprocessed/embed_vector/*train*.npy")
train_vector_set = []
for file in train_files:
  train_vector_set.append(np.load(file))
train_vector = np.concatenate(train_vector_set)

test_files = glob.glob(base_dir+"Data/preprocessed/embed_vector/*test*.npy")
test_vector_set = []
for file in test_files:
    test_vector_set.append(np.load(file))
test_vector = np.concatenate(test_vector_set)

## LabelPowerset
We use a Neural Network model to make prediction among one of the 1505 unique genre combinations in our training data set. 
* Input Layer consists of 512 features
* Output Layer consists of 1505 nodes representing the each of the unique genre combinations  
  * We use softmax activation function since the classifier has to output one among the 1505 combinations
* Hidden Layers - number of nodes in the hidden layer has to be in between the number of input and output nodes for optimal performance. We select 1024 neurons
* Dropout of 20%. To avoid overfit, we randomly drop out 20% of the neurons in the hidden layer

In [None]:
# Creating a LUT for the 1505 labels
train_y_labels= train_y.groupby(list(category_columns)).ngroup()
y_labels_lut = train_y.copy(deep=True) 
y_labels_lut['Labels'] = train_y_labels
y_labels_lut = y_labels_lut.drop_duplicates()
y_labels_lut = y_labels_lut.reset_index(drop=True).set_index('Labels').sort_index()


# One-hot encoding the output labels
num_classes = y_labels_lut.shape[0]
train_y_onehot = np_utils.to_categorical(train_y_labels, num_classes = num_classes)

In [None]:
def gen_model(optimizer):
  model = Sequential()
  model.add(Dense(1024, activation='relu', input_shape=(512,)))
  model.add(Dropout(0.5))
  model.add(Dense(1505, activation='softmax'))
  model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
  return model
lr_reduction = ReduceLROnPlateau(monitor='val_acc', 
                                            patience=2, 
                                            verbose=1, 
                                            factor=0.5, 
                                            min_lr=0.0001)

**Stochastic Gradient Descent Optimizer** 

In [None]:
epochs, batch_size = 20, 128
model = gen_model(keras.optimizers.SGD(lr=1))
model.fit(train_vector, train_y_onehot,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.3,
          callbacks=[lr_reduction])

In [None]:
y_pred = model.predict(test_vector)
y_pred_label = pd.DataFrame(np.argmax(y_pred, axis=1))
predictions = pd.merge(y_pred_label, y_labels_lut, how='left', left_on=0, right_on='Labels')[category_columns]
accuracy(test_y, predictions)

**Adam Optimizer**

In [None]:
epochs, batch_size = 20, 128
model = gen_model(keras.optimizers.Adam(lr=0.001))
model.fit(train_vector, train_y_onehot,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.3,
          callbacks=[lr_reduction])

In [None]:
y_pred = model.predict(test_vector)
y_pred_label = pd.DataFrame(np.argmax(y_pred, axis=1))
predictions = pd.merge(y_pred_label, y_labels_lut, how='left', left_on=0, right_on='Labels')[category_columns]
accuracy(test_y, predictions)

**RMSProp Optimizer**

In [None]:
epochs, batch_size = 20, 128
model = gen_model(keras.optimizers.RMSprop(lr=0.001))
model.fit(train_vector, train_y_onehot,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.3,
          callbacks=[lr_reduction])

In [None]:
y_pred = model.predict(test_vector)
y_pred_label = pd.DataFrame(np.argmax(y_pred, axis=1))
predictions = pd.merge(y_pred_label, y_labels_lut, how='left', left_on=0, right_on='Labels')[category_columns]
accuracy(test_y, predictions)

**Adagrad**

In [None]:
epochs, batch_size = 20, 128
model = gen_model(keras.optimizers.Adagrad(lr=0.01))
model.fit(train_vector, train_y_onehot,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.3,
          callbacks=[lr_reduction])

In [None]:
y_pred = model.predict(test_vector)
y_pred_label = pd.DataFrame(np.argmax(y_pred, axis=1))
predictions = pd.merge(y_pred_label, y_labels_lut, how='left', left_on=0, right_on='Labels')[category_columns]
accuracy(test_y, predictions)

**Adadelta**

In [None]:
epochs, batch_size = 20, 128
model = gen_model(keras.optimizers.Adadelta(lr=1.0))
model.fit(train_vector, train_y_onehot,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.3,
          callbacks=[lr_reduction])

In [None]:
y_pred = model.predict(test_vector)
y_pred_label = pd.DataFrame(np.argmax(y_pred, axis=1))
predictions = pd.merge(y_pred_label, y_labels_lut, how='left', left_on=0, right_on='Labels')[category_columns]
accuracy(test_y, predictions)

**Observations/Conclusions**
* Predictions using Sentence Embedding with Neural Networks doesnt really produce predictions as accurate as the simple ML models which used TF-IDF vectorizer
* Adam Optimizer seems to perform best among the ones tried with a F1 score of 0.62

## Binary Relevance
Here we build an predictor for each genre separately. In other words, the output layer will have 28 nodes - each corresponding to a genre. We will use a threshold for each genre to make predictions whether the plot falls into that genre or not

In [None]:
prob_thresh = (train_y.sum()/train_y.shape[0]).clip(upper=0.5)
prob_thresh

In [None]:
def gen_model_genre(optimizer):
  model = Sequential()
  model.add(Dense(800, activation='relu', input_shape=(512,)))
  model.add(Dropout(0.25))
  model.add(Dense(27, activation='sigmoid'))
  model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
  return model
lr_reduction = ReduceLROnPlateau(monitor='val_acc', 
                                            patience=2, 
                                            verbose=1, 
                                            factor=0.5, 
                                            min_lr=0.0001)

In [None]:
epochs, batch_size = 20, 128
model = gen_model_genre(keras.optimizers.Adam(lr=0.001))
model.fit(train_vector, train_y,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.3,
          callbacks=[lr_reduction])

In [None]:
y_pred = model.predict(test_vector)
predictions = pd.DataFrame(index=test_y.index, columns=test_y.columns)
for i in range(y_pred.shape[0]):
  predictions.iloc[i,:] = (y_pred[i,:]>prob_thresh).map({True:1, False:0})
accuracy(test_y, predictions)