In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import libraries 

import time
import numpy as np
import pandas as pd
import os
import pickle as pkl
from matplotlib import pyplot as plt
from collections import defaultdict
import nltk
import re
import json
import csv
import seaborn as sns
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize

from collections import Counter

%matplotlib inline
pd.set_option('display.max_colwidth', 300)

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.optimizers import SGD
from keras.models import load_model

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten, Lambda
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model, load_model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, regularizers, optimizers
from keras.callbacks import History, CSVLogger

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve, average_precision_score, auc
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocessing the data

In [None]:
#Lemmatizing setup (Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words,
#normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma)
#Download the POS-tagger model and the WordNet corpus, then create the shared lemmatizer instance.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
def nltk2wn_tag(nltk_tag):
    """Translate an NLTK part-of-speech tag into a WordNet POS constant.

    Only the first character of the tag is inspected: 'J' maps to
    adjective, 'V' to verb, 'N' to noun and 'R' to adverb. Every other
    tag (including an empty one) falls back to noun.
    """
    pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Unknown prefixes are treated as nouns.
    return pos_by_prefix.get(nltk_tag[:1], wordnet.NOUN)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
#Tokenize w/ lemmatization AFTER removing stopwords
#https://machinelearningmastery.com/clean-text-machine-learning-python/
def tokenize(plot, stop_words, lemmatize = False):
    """Normalize a plot summary and split it into sentences of word tokens.

    The text is lowercased, numbers are replaced by the tag "DG", runs of
    sentence punctuation are collapsed, hyphens become spaces, underscores
    and abbreviation periods are dropped, and all remaining punctuation
    except '.', '?', '!' and apostrophes is removed. The cleaned text is
    split into sentences and word tokens with NLTK, and stop words are
    filtered out.

    Parameters
    ----------
    plot : str
        Raw plot summary.
    stop_words : iterable of str
        Tokens to drop from every sentence. Tokens are compared after the
        text has been lowercased.
    lemmatize : bool, default False
        When True, each filtered sentence is POS-tagged and every token is
        lemmatized with the module-level ``lemmatizer``; ``nltk2wn_tag``
        converts the tags.

    Returns
    -------
    list of list of str
        One inner list of tokens per sentence.
    """
    # Build the lookup set once; testing membership against a list is a
    # linear scan repeated for every token.
    stop_set = set(stop_words)

    plot = plot.lower() #lowercase
    # Generic tag for numbers. NOTE: the trailing class [:,/.\d]* also consumes
    # a '.' or ',' that directly follows the digits, so a sentence ending in a
    # number loses its final period before sentence splitting.
    plot = re.sub(r"[-+]?[.\d]*[\d]+[:,/.\d]*", "DG", plot)
    plot = re.sub(r"([!?.]){2,}", r"\1", plot) #collapse repeated punctuation to the last mark
    plot = plot.replace('-', ' ') #separating hyphenated words
    plot = plot.replace('_', '') #remove underscores
    plot = re.sub(r'(?<!\w)([a-zA-Z])\.', r'\1', plot) #remove periods from abbreviations
    # Raw string: the same pattern written as a plain literal relies on invalid
    # escape sequences (\w, \s, ...), which newer Python versions warn about.
    plot = re.sub(r"[^\w\s\.\?\!']", '', plot) #remove punctuation besides sentence completers and apostrophes

    sentences = nltk.sent_tokenize(plot)
    words = [
        [token for token in nltk.word_tokenize(sentence) if token not in stop_set]
        for sentence in sentences
    ]

    if not lemmatize:
        return words

    tagged_sentences = [nltk.pos_tag(sentence) for sentence in words]
    return [
        [lemmatizer.lemmatize(token, pos = nltk2wn_tag(tag)) for token, tag in sentence]
        for sentence in tagged_sentences
    ]

In [None]:
# Fetch the Punkt tokenizer data and the stop-word corpus, then keep the English stop-word list in `stop`.
nltk.download('punkt')
nltk.download('stopwords')
stop = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Tokenize and lemmatize every plot summary, timing the whole pass.
# NOTE(review): `trail1` is not created by any cell shown above this one; it must
# already hold a 'plots' column of strings -- confirm where it is loaded.
start = time.time()
trail1['tokenized_words'] = trail1['plots'].apply(
    lambda plot_text: tokenize(plot_text, stop, lemmatize = True)
)
end = time.time()
print("Total Time to tokenize plots:", end - start, "seconds")

Total Time to tokenize plots: 1122.765466928482 seconds


In [None]:
# Concatenate the per-sentence token lists of each summary into one flat token list.
trail1['flattened_tokens'] = trail1['tokenized_words'].map(
    lambda sentences: [token for sentence in sentences for token in sentence]
)

In [None]:
# Preview the first rows, now including the tokenized_words and flattened_tokens columns.
trail1.head()

Unnamed: 0,plots,movie_name,genres,tokenized_words,flattened_tokens
0,"Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.",Taxi Blues,Drama,"[[shlykov, hard, work, taxi, driver, lyosha, saxophonist, develop, bizarre, love, hate, relationship, despite, prejudice, realize, n't, different, .]]","[shlykov, hard, work, taxi, driver, lyosha, saxophonist, develop, bizarre, love, hate, relationship, despite, prejudice, realize, n't, different, .]"
1,"The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole...",The Hunger Games,Science Fiction,"[[nation, panem, consist, wealthy, capitol, twelve, poor, district, .], [punishment, past, rebellion, district, must, provide, boy, girl, age, DG, DG, select, lottery, annual, hunger, game, .], [tribute, must, fight, death, arena, sole, survivor, reward, fame, wealth, .], [first, reap, DG, year,...","[nation, panem, consist, wealthy, capitol, twelve, poor, district, ., punishment, past, rebellion, district, must, provide, boy, girl, age, DG, DG, select, lottery, annual, hunger, game, ., tribute, must, fight, death, arena, sole, survivor, reward, fame, wealth, ., first, reap, DG, year, old, p..."
2,"The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole...",The Hunger Games,Action,"[[nation, panem, consist, wealthy, capitol, twelve, poor, district, .], [punishment, past, rebellion, district, must, provide, boy, girl, age, DG, DG, select, lottery, annual, hunger, game, .], [tribute, must, fight, death, arena, sole, survivor, reward, fame, wealth, .], [first, reap, DG, year,...","[nation, panem, consist, wealthy, capitol, twelve, poor, district, ., punishment, past, rebellion, district, must, provide, boy, girl, age, DG, DG, select, lottery, annual, hunger, game, ., tribute, must, fight, death, arena, sole, survivor, reward, fame, wealth, ., first, reap, DG, year, old, p..."
3,"The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole...",The Hunger Games,Drama,"[[nation, panem, consist, wealthy, capitol, twelve, poor, district, .], [punishment, past, rebellion, district, must, provide, boy, girl, age, DG, DG, select, lottery, annual, hunger, game, .], [tribute, must, fight, death, arena, sole, survivor, reward, fame, wealth, .], [first, reap, DG, year,...","[nation, panem, consist, wealthy, capitol, twelve, poor, district, ., punishment, past, rebellion, district, must, provide, boy, girl, age, DG, DG, select, lottery, annual, hunger, game, ., tribute, must, fight, death, arena, sole, survivor, reward, fame, wealth, ., first, reap, DG, year, old, p..."
4,"Poovalli Induchoodan is sentenced for six years prison life for murdering his classmate. Induchoodan, the only son of Justice Maranchery Karunakara Menon was framed in the case by Manapally Madhavan Nambiar and his crony DYSP Sankaranarayanan to take revenge on idealist judge Menon who had e...",Narasimham,Action,"[[poovalli, induchoodan, sentence, six, year, prison, life, murder, classmate, .], [induchoodan, son, justice, maranchery, karunakara, menon, frame, case, manapally, madhavan, nambiar, crony, dysp, sankaranarayanan, take, revenge, idealist, judge, menon, earlier, give, jail, sentence, manapally,...","[poovalli, induchoodan, sentence, six, year, prison, life, murder, classmate, ., induchoodan, son, justice, maranchery, karunakara, menon, frame, case, manapally, madhavan, nambiar, crony, dysp, sankaranarayanan, take, revenge, idealist, judge, menon, earlier, give, jail, sentence, manapally, co..."


In [None]:
# Sanity check: selecting a single column yields a pandas Series.
type(trail1["movie_name"])

pandas.core.series.Series

In [None]:
#Binarize labels
# MultiLabelBinarizer expects one iterable of genre names per row. A bare string
# would be iterated character by character, so wrap it in a one-element list;
# list-like entries are passed through unchanged.
genre_lists = trail1["genres"].map(
    lambda genres: [genres] if isinstance(genres, str) else list(genres)
)

mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(genre_lists)
trail1["binarized_labels"] = labels.tolist()

# The binarizer was just fitted on these same rows, so transforming them again
# would reproduce `labels`; take an independent copy instead.
y = labels.copy()


TypeError: ignored

In [None]:
# Genre classes learned by the binarizer; the columns of the label matrix follow this order.
mlb.classes_

array(['Action', 'Adventure', 'Comedy', 'Crime Fiction', 'Drama',
       'Family Film', 'Horror', 'Romance Film', 'Science Fiction',
       'Thriller'], dtype=object)

In [None]:
# (number of rows, number of genre classes)
labels.shape

(36050, 10)

In [None]:
#Pickle Data
# Save the processed frame (tokens and binarized labels included) to ./trail1.pkl.
trail1.to_pickle("./trail1.pkl")

In [None]:
# Reload the frame from ./trail1.pkl and preview its first two rows.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
trail1 = pd.read_pickle("./trail1.pkl")
trail1.head(2)

Unnamed: 0,plots,movie_name,genres,tokenized_words,flattened_tokens,binarized_labels
0,"Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.",Taxi Blues,[Drama],"[[shlykov, hard, work, taxi, driver, lyosha, saxophonist, develop, bizarre, love, hate, relationship, despite, prejudice, realize, n't, different, .]]","[shlykov, hard, work, taxi, driver, lyosha, saxophonist, develop, bizarre, love, hate, relationship, despite, prejudice, realize, n't, different, .]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
1,"The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole...",The Hunger Games,"[Science Fiction, Action, Drama]","[[nation, panem, consist, wealthy, capitol, twelve, poor, district, .], [punishment, past, rebellion, district, must, provide, boy, girl, age, DG, DG, select, lottery, annual, hunger, game, .], [tribute, must, fight, death, arena, sole, survivor, reward, fame, wealth, .], [first, reap, DG, year,...","[nation, panem, consist, wealthy, capitol, twelve, poor, district, ., punishment, past, rebellion, district, must, provide, boy, girl, age, DG, DG, select, lottery, annual, hunger, game, ., tribute, must, fight, death, arena, sole, survivor, reward, fame, wealth, ., first, reap, DG, year, old, p...","[1, 0, 0, 0, 1, 0, 0, 0, 1, 0]"


# Token Analysis
Iterate over the tokenized words and build dictionaries that track how often each token occurs, the number of words per sentence, and the number of sentences per plot summary.

In [None]:
# Frequency tables filled in a single pass over the tokenized plot summaries:
#   word_dict             token               -> occurrences across all summaries
#   sent_per_summary_dict sentences in a plot -> number of plots with that many sentences
#   word_per_sent_dict    tokens in a sentence -> number of sentences with that many tokens
word_counter = Counter()
sent_per_summary_counter = Counter()
word_per_sent_counter = Counter()

rows = len(trail1['tokenized_words'])
print(rows)#number of plot summaries

# Iterate over the column values directly: positional order is independent of the
# index labels and avoids a Series lookup for every sentence.
for sentences in trail1['tokenized_words']:
    sent_per_summary_counter[len(sentences)] += 1
    for sentence in sentences:
        word_per_sent_counter[len(sentence)] += 1
        word_counter.update(sentence)

# Expose plain dicts (keys in first-seen order) under the names the following cells use.
word_dict = dict(word_counter)
sent_per_summary_dict = dict(sent_per_summary_counter)
word_per_sent_dict = dict(word_per_sent_counter)

36050


In [None]:

# Vocabulary size and overall token count, both derived from the word frequency table.
n_unique_tokens = len(word_dict)
n_total_tokens = sum(word_dict.values())
print(n_unique_tokens) #number of unique words
print(n_total_tokens) #total number of words

131861
7172308


In [None]:
# How many distinct words remain after discarding the rarest ones.
count = sum(1 for frequency in word_dict.values() if frequency == 1) # words seen exactly once
twoOrOne = sum(1 for frequency in word_dict.values() if frequency < 3) # words seen once or twice
print(len(word_dict) - count) # words that appear more than once
print(len(word_dict) - twoOrOne) # words that appear more than twice

76241
60642


In [None]:
# Summary statistics of sentence length (tokens per sentence).
print(len(word_per_sent_dict)) #number of distinct sentence lengths
print(sum(word_per_sent_dict.values())) #number of sentences in all plots
print(max(word_per_sent_dict)) #largest sentence length
total = sum(word_per_sent_dict.values())
weight_sum = sum(length * frequency for length, frequency in word_per_sent_dict.items())
print(weight_sum/total) #average sentence length
#print(word_per_sent_dict)

111
587257
273
12.213235431846705


In [None]:
# Summary statistics of summary length (sentences per plot summary).
print(len(sent_per_summary_dict)) #number of distinct sentence counts per summary
print(max(sent_per_summary_dict)) #highest number of sentences in one summary
total = sum(sent_per_summary_dict.values())
weight_sum = sum(n_sentences * frequency for n_sentences, frequency in sent_per_summary_dict.items())
print(weight_sum/total) #average sentence count per summary
#print(sent_per_summary_dict)

152
321
16.290069348127602


# Load GloVe Word Embeddings

In [None]:
#Load GloVe Word Embeddings
#compute an index mapping words to known embeddings, by parsing the data dump of pre-trained embeddings
embeddings_index = {}
GLOVE_DIR = '/content/drive/MyDrive/deep LEarning Project_/'
# Open with an explicit encoding so parsing does not depend on the platform default,
# and use a context manager so the file is closed even if a line fails to parse.
with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [None]:
#create average word vector. This will later be used in place of unknown words
# Every vector in the GloVe file is already parsed into embeddings_index, so stack
# those arrays instead of reading the file two more times. Rows keep the
# insertion order of the dict.
vecs = np.stack(list(embeddings_index.values())).astype(np.float32, copy=False)
n_vec, hidden_dim = vecs.shape

average_vec = np.mean(vecs, axis=0)
#print(average_vec)