In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [2]:
omdb = pd.read_csv('../../data/preprocessed/omdb_cleaned.csv')
movies = pd.read_csv('../../data/preprocessed/movies_id_updated.csv')
mapping = movies[['id', 'imdbID']].rename(columns={'id':'movieID'})
keywords = pd.read_csv('keywordDict.csv', header=None, sep=';')

In [3]:
keywords = keywords.dropna()

In [4]:
keywords[1] = keywords[1].apply(lambda x: x[1:-1]) 

In [5]:
keywords[1] = keywords[1].apply(lambda x: x.split(','))

In [6]:
keywords

Unnamed: 0,0,1
0,tt0114709,"[ 'toy', 'rivalry', 'cowboy', 'cgi-animatio..."
1,tt0113497,"[ 'board-game', 'adventurer', 'fight', 'gam..."
2,tt0107050,"[ 'original-story', 'neighbor', 'minnesota',..."
3,tt0114885,"[ 'black-american', 'husband-wife-relationshi..."
4,tt0113041,"[ 'fatherhood', 'doberman', 'dog', 'mansion..."
...,...,...
9418,tt0953318,"[ 'faked-death', 'bullying', 'imaginary-frie..."
9419,tt0960731,"[ 'surrealism', '2000s', 'year-1974', '1970..."
9420,tt0025464,"[ 'governor', 'gambling', 'electric-chair', ..."
9421,tt1024715,"[ 'sex-addict', 'sex-in-airplane', 'falling-..."


In [7]:
keywords.iloc[0,1][0]

" 'toy'"

In [8]:
keywords[1]

0       [ 'toy',  'rivalry',  'cowboy',  'cgi-animatio...
1       [ 'board-game',  'adventurer',  'fight',  'gam...
2       [ 'original-story',  'neighbor',  'minnesota',...
3       [ 'black-american',  'husband-wife-relationshi...
4       [ 'fatherhood',  'doberman',  'dog',  'mansion...
                              ...                        
9418    [ 'faked-death',  'bullying',  'imaginary-frie...
9419    [ 'surrealism',  '2000s',  'year-1974',  '1970...
9420    [ 'governor',  'gambling',  'electric-chair', ...
9421    [ 'sex-addict',  'sex-in-airplane',  'falling-...
9422    [ 'time-machine',  'black-adder',  'time-trave...
Name: 1, Length: 9348, dtype: object

In [9]:
pd.DataFrame(keywords[1].value_counts())

Unnamed: 0,1
[ 'independent-film'],3
[ 'f-rated'],2
[ 'based-on-novel'],2
"[ 'female-frontal-nudity', 'woman-in-a-bathtub', 'sorority-house', 'told-in-flashback', 'strip-club', 'woman-in-jeopardy', 'bathtub', 'blood-splatter', 'flashback-within-a-flashback', 'basement', 'voyeur', 'police-detective', 'ouija-board', 'serial-killer', 'bear-trap', 'stabbed-to-death', 'murder-suspect', 'shot-to-death', 'woman-in-shower', 'massacre', 'police', 'bra-and-panties', 'slasher', 'possessed-woman', 'demonic-possession', 'sequel', 'seance', 'cult-film', 'second-part', 'murdered-with-a-hook', 'stabbed-in-the-stomach', 'murderess', 'breasts', 'blood', 'neighbor', 'telephone', 'key', 'rain', 'female-police-detective', 'fireplace', 'black-bra', 'white-panties', 'flashlight', 'attic', 'corpse', 'fear', 'screaming', 'breaking-down-a-door', 'stabbed-in-the-neck', 'bathroom', 'violence', 'sorority', 'in-joke', 'rainstorm', 'stripper', 'police-investigation', 'evil-spirit', 'gore', 'head-in-toilet', 'shotgun', 'question-mark-at-the-end', 'ambiguous-ending', 'power-drill', 'murder-of-family', 'meat-hook', 'knife', 'drill', 'blonde', 'murder', 'ghost', 'possession', 'independent-film']",1
"[ 'cat', 'ufo', 'human-alien-relationship', 'talking-cat', 'alien', 'night', 'cat-movie', 'collar', 'military', 'scientist', 'spaceship', 'gold', 'spy', 'alien-cat', 'alien-visitation', 'visit', 'opening-a-door', 'scene-during-opening-credits', 'scene-before-opening-credits', 'double-barreled-shotgun', 'telekinesis', 'pledge-of-allegiance', 'u.s.-citizenship', 'citizenship', 'courtroom', 'parachute-caught-in-a-tree', 'helicopter-crash', 'parachute', 'flare-gun', 'farm', 'tractor', 'flying-upside-down', 'hanging-from-a-helicopter', 'pet-cat', 'kidnapping', 'airport', 'military-base', 'dog', 'disguise', 'jeep', 'hangar', 'military-uniform', 'limousine', 'film-projector', 'video-surveillance', 'time-freeze', 'shrinking', 'freeze-frame', 'pistol', 'held-at-gunpoint', 'gold-bar', 'alien-race', 'alien-contact', 'alien-creature', 'human-alien-team', 'female-scientist', 'tuna', 'general', 'pool-ball', 'pool-hustler', 'shotgun', 'farmhouse', 'playing-pool', 'pool-hall', 'sedative', 'doctor', 'football-game', 'animal-in-title', 'beer-can', 'fingerprint', 'bookie', 'fingers-in-ears', 'drink-thrown-into-someone's-face', 'surveillance', 'beer-drinking', 'apartment', 'film-camera', 'army-officer', 'soldier', 'supernatural-power', 'u.s.-army', 'laboratory', 'bet', 'garbage-truck', 'dumpster', 'armored-car', 'psychokinesis', 'neighbor', 'talking-animal', 'date', 'baseball-game', 'watching-sports-on-tv', 'army-base', 'flying', 'motorcycle', 'five-word-title', 'outer-space', 'kerchief', 'betting-shop', 'blindfold', ...]",1
...,...
"[ 'strip-search', 'jewish', 'nazi', 'trial', 'war-crime', 'genocide', 'shoah', 'courtroom', 'x-rayed-skeleton', 'jew', 'holocaust', 'mossad', 'based-on-play', 'independent-film']",1
"[ 'sex', 'hollywood', 'party', 'birthday', 'friend', 'birthday-party', 'film-within-a-film', 'volkswagen-passat', 'volkswagen', 'automobile', 'car', 'reference-to-cuba-gooding-jr', 'satire', 'film-executive', 'adult-filmmaking', 'oven', 'slobbering', 'sighing', 'grunting', 'moaning', 'refrigerator', 'beer', 'coin-purse', 'pager', 'crawling-on-the-floor', 'anxiety', 'tucson-arizona', 'gangster', 'reference-to-santa-claus', 'africa', 'kitchen', 'play-rehearsal', 'rehearsal', 'drinking-blood', 'dead-wolf', 'sister-sister-relationship', 'rendezvous', 'post-it', 'food', 'reference-to-george-bernard-shaw', 'university-of-toronto', 'desert', 'holiday-inn', 'southwest-airlines', 'men's-bathroom', 'urination', 'reference-to-kirk-douglas', 'parking-garage', 'jew', 'faith-the-magazine', 'apple-computer', 'reference-to-joseph-goebbels', 'brownie', 'dream', 'job-interview', 'erection', 'adult-bookstore', 'robbery', 'flatiron-building-manhattan-new-york-city', 'magazine-writer', 'writer', 'veterinarian', 'hashish', 'tuxedo', 'plastic-bag-over-head', 'videotaping-sex', 'film-set', 'falling-off-a-chair', 'backstage', 'berkshire-massachusetts', 'food-court', 'restaurant', 'cafe', 'death', 'hand-job', 'theatre-production', 'theatre-audience', 'reference-to-harold-pinter', 'reference-to-the-duchess-of-malfi-the-stage-play', 'pilates', 'drunkenness', 'drinking', 'drink', 'pajamas', 'hunger', 'milk-and-cookies', 'republican', 'democrat', 'manhattan-new-york-city', 'new-york-city', 'reference-to-harvey-weinstein', 'money', 'dating', 'interracial-sex', 'exhibitionism', 'reference-to-al-pacino', 'painting', 'painter', 'artist', 'screenwriter', ...]",1
"[ 'catholic', 'adolescence', 'toilet-stall', 'pubic-hair', 'penis', 'underwater-scene', 'female-nudity', 'bare-butt', 'mutual-masturbation', 'erection', 'sex', 'first-gay-sexual-experience', 'reference-to-devil', 'written-by-director', 'crotch-grab', '13-year-old-boy', 'male-objectification', 'social-masturbation', 'taking-a-shower', 'sexual-awakening', 'taking-off-pants', 'teenage-sexuality', 'erotic-dream', 'shower-room', 'shared-shower', 'group-shower', 'showering-in-underwear', 'showering-together', 'erotic-fantasy', 'teenager', 'barefoot-male', 'taking-off-clothes', 'taking-off-shirt', 'cmnm', 'cmnm-scene', 'clothed-male-naked-male', 'female-objectification', 'man-wears-underwear', 'man-in-a-shower', 'man-wears-a-swimsuit', 'male-full-frontal-nudity', 'male-pubic-hair', 'hate', 'chess', 'hit-with-a-hockey-stick', 'student-athlete', 'perfectionist', 'award', 'directorial-debut', 'farmer', 'silent-retreat', 'cupcake', 'jumping-into-water', 'falling-into-water', 'saved', 'hell', 'imitating-someone', 'river', 'bonfire', 'picnic', 'brutality', 'writing-on-a-toilet-stall-wall', 'bathroom', 'loss-of-best-friend', 'body-versus-mind', 'catholic-guilt', 'holy-communion', 'fantasy-sequence', 'washing-pajamas', 'washing-bedsheets', 'school-expulsion', 'ale', 'beer', 'factory-worker', 'reference-to-don-camillo', 'reading-aloud', 'reflection-in-a-car-window', 'son-kisses-mother-goodbye', 'autobiographical', 'diving', 'boat', 'beach', 'cruelty', 'hit-on-the-leg', 'guest-house', 'knitting', 'reference-to-rousseau', 'ground-hockey', 'hockey', 'wellington-boots', 'tackling-someone', 'wrestling', 'peer-pressure', 'self-abuse', 'absolution', 'false-accusation-of-lying', 'false-accusation', 'reading-from-the-bible', 'worrying', 'virgin-mary-statue', ...]",1
"[ 'soviet-propaganda', 'russian-civil-war', 'russian-revolution', 'red-army', 'bolshevik', 'war-hero', 'character-name-as-title', 'surname-as-title', 'equestrian', 'warrior', 'military', 'siberia', 'communist-propaganda', 'uniform', 'one-word-title', 'cult-film', 'year-1918', 'russian-soldier', 'czech-legion', 'based-on-novel', 'chapaev-character']",1


In [13]:
mlb = MultiLabelBinarizer()

In [14]:
keywords_enc = mlb.fit_transform(keywords[1])

In [15]:
len(keywords_enc)

9348

In [17]:
keywords_enc.shape

(9348, 94431)

In [18]:
keywords_enc.sum()

998110