In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import unicodedata
import sys
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [20]:
# Genres
def get_genres():
    """Build a multi-hot genre matrix with one row per movie.

    Reads ``genres.csv`` (uses its ``movieID`` and ``genre`` columns) and
    ``movies_id_updated.csv`` (uses ``id`` and ``imdbID``), attaches the
    imdbID to every genre row, collects the genres of each imdbID into a
    list and encodes those lists with ``MultiLabelBinarizer``.

    Returns:
        pandas.DataFrame sorted by ``imdbID`` with the column ``imdbID``
        followed by integer-labelled 0/1 indicator columns (``0 .. k-1``),
        one per genre class found by the binarizer.
    """
    genres = pd.read_csv('../../data/raw/genres.csv', sep=',')
    movies = pd.read_csv('../../data/preprocessed/movies_id_updated.csv')
    mapping = movies[['id', 'imdbID']].rename(columns={'id': 'movieID'})
    genres_grouped = (
        genres.merge(mapping, on='movieID')
        .groupby('imdbID')['genre']
        .apply(list)
        .reset_index(name='genres')
    )
    mlb = MultiLabelBinarizer()
    genres_encoded = mlb.fit_transform(genres_grouped['genres'])
    # Both frames carry a fresh 0..n-1 index, so the join is row-aligned.
    genres_grouped = genres_grouped.join(pd.DataFrame(genres_encoded))
    genres_grouped = genres_grouped.sort_values('imdbID').drop(columns=['genres'])
    # Return the encoded frame; the raw `genres` table was returned before,
    # which discarded all of the work above.
    return genres_grouped

In [21]:
# Languages
# Count threshold: a language indicator column is kept only if its column sum
# (number of rows in which the label is set) is strictly greater than this.
ts_lang = 10
# NOTE(review): `omdb` is not defined in this cell or in any cell visible here;
# it must already exist in the kernel. Presumably omdb['Language'] holds one
# iterable of language labels per row - TODO confirm (plain strings would be
# binarized character by character).
mlb = MultiLabelBinarizer()
lg = pd.DataFrame(mlb.fit_transform(omdb['Language']))
# One-column boolean frame (column 0), indexed by lg's column labels:
# True where that column's sum exceeds ts_lang.
lg_index = pd.DataFrame(lg.sum()>ts_lang)
# Keep omdb's '0' column (NOTE(review): presumably the movie identifier -
# verify) and join only the language columns flagged True above.
language = omdb[['0']].join(lg[lg_index[lg_index[0]].index])

In [22]:
def get_actors(threshold=20):
    """Build a multi-hot matrix of frequently occurring actors per movie.

    Reads ``actors.csv`` (uses its ``actorID`` and ``movieID`` columns) and
    ``movies_id_updated.csv`` (uses ``id`` and ``imdbID``). Only actors whose
    ``actorID`` occurs strictly more than ``threshold`` times in
    ``actors.csv`` are kept; movies without any such actor do not appear in
    the result.

    Args:
        threshold: An actor is kept when its number of rows in
            ``actors.csv`` is greater than this value.

    Returns:
        pandas.DataFrame with the column ``imdbID`` followed by
        integer-labelled 0/1 indicator columns (``0 .. k-1``), one per kept
        actor class found by the binarizer.
    """
    actors = pd.read_csv('../../data/raw/actors.csv', sep=',')
    movies = pd.read_csv('../../data/preprocessed/movies_id_updated.csv')
    mapping = movies[['id', 'imdbID']].rename(columns={'id': 'movieID'})

    # Filter on the value_counts Series itself: wrapping it in a DataFrame and
    # selecting the 'actorID' column breaks on pandas >= 2.0, where the counts
    # column is named 'count'.
    actor_counts = actors['actorID'].value_counts()
    frequent_ids = actor_counts[actor_counts > threshold].index
    actors_selected = actors[actors['actorID'].isin(frequent_ids)]

    # Attach the imdbID and collect the kept actors of every movie in a list.
    actors_grouped = (
        actors_selected.merge(mapping, on='movieID')
        .groupby('imdbID')['actorID']
        .apply(list)
        .reset_index(name='actors')
    )
    mlb = MultiLabelBinarizer()
    actors_enc = pd.DataFrame(mlb.fit_transform(actors_grouped['actors']))
    actors_grouped = actors_grouped.join(actors_enc).drop(columns=['actors'])
    return actors_grouped

In [23]:
def actors(threshold=14):
    """Group frequently occurring actors per movie and multi-hot encode them.

    Same pipeline as ``get_actors`` but the list column ``actors`` is kept in
    the result next to the indicator columns.

    The inputs are loaded inside the function. The previous body subscripted
    the module-level name ``actors`` - which is this function itself - and an
    undefined global ``mapping``, so every call raised.

    Args:
        threshold: An actor is kept when its ``actorID`` occurs strictly
            more than this many times in ``actors.csv``.

    Returns:
        pandas.DataFrame with the columns ``imdbID`` and ``actors`` (list of
        kept actorIDs for that movie) followed by integer-labelled 0/1
        indicator columns (``0 .. k-1``).
    """
    actors_raw = pd.read_csv('../../data/raw/actors.csv', sep=',')
    movies = pd.read_csv('../../data/preprocessed/movies_id_updated.csv')
    mapping = movies[['id', 'imdbID']].rename(columns={'id': 'movieID'})

    # Filter on the value_counts Series directly; this works on every pandas
    # version, independent of how the counts column is named.
    actor_counts = actors_raw['actorID'].value_counts()
    frequent_ids = actor_counts[actor_counts > threshold].index
    actors_selected = actors_raw[actors_raw['actorID'].isin(frequent_ids)]

    # Attach the imdbID and collect the kept actors of every movie in a list.
    actors_grouped = (
        actors_selected.merge(mapping, on='movieID')
        .groupby('imdbID')['actorID']
        .apply(list)
        .reset_index(name='actors')
    )
    mlb = MultiLabelBinarizer()
    actors_enc = pd.DataFrame(mlb.fit_transform(actors_grouped['actors']))
    actors_grouped = actors_grouped.join(actors_enc)
    return actors_grouped

In [24]:
# get the threshold first keywords per movie
def get_keywords(threshold=10):
    """Multi-hot encode the first ``threshold`` keywords of every movie.

    Reads ``keywordDict.csv`` (``;``-separated, no header): column 0 is used
    as the imdbID, column 1 as a bracketed, comma-separated keyword string.
    Rows containing missing values are dropped.

    Args:
        threshold: Maximum number of leading keywords kept per row.

    Returns:
        pandas.DataFrame with the column ``imdbID`` followed by
        integer-labelled 0/1 indicator columns (``0 .. k-1``), one per
        keyword class found by the binarizer.
    """
    keywords = pd.read_csv('keywordDict.csv', header=None, sep=';').dropna().copy()

    # Drop the enclosing bracket characters, split on commas and strip the
    # whitespace around each token so the same keyword maps to the same class
    # regardless of its position in the list; then keep the leading ones.
    keywords[1] = keywords[1].apply(
        lambda raw: [kw.strip() for kw in raw[1:-1].split(',')][:threshold]
    )

    # Explode and regroup so that several rows of the same id end up in a
    # single keyword list.
    keywords = keywords.explode(1)
    keywords_grouped = keywords.groupby(0)[1].apply(list).reset_index(name='keywords')
    keywords_grouped = keywords_grouped.rename(columns={0: 'imdbID'})

    mlb = MultiLabelBinarizer()
    keywords_enc = pd.DataFrame(mlb.fit_transform(keywords_grouped['keywords']))
    keywords_grouped = keywords_grouped.join(keywords_enc).drop(columns=['keywords'])
    return keywords_grouped

In [19]:
def get_newkeywords(threshold=200):
    """Multi-hot encode, per movie, the keywords that are frequent overall.

    Reads ``../ContentbasedFiltering/keywordDict.csv`` (``;``-separated, no
    header): column 0 is used as the imdbID, column 1 as a bracketed,
    comma-separated keyword string. Rows containing missing values are
    dropped. A keyword is kept when it occurs strictly more than
    ``threshold`` times across all rows; movies without any kept keyword do
    not appear in the result.

    Args:
        threshold: Minimum (exclusive) number of occurrences of a keyword.

    Returns:
        pandas.DataFrame with the column ``imdbID`` followed by
        integer-labelled 0/1 indicator columns (``0 .. k-1``), one per kept
        keyword class found by the binarizer.
    """
    newkeywords = pd.read_csv(
        '../ContentbasedFiltering/keywordDict.csv', header=None, sep=';'
    ).dropna().copy()

    # Drop the enclosing bracket characters, split on commas and strip the
    # whitespace around each token so identical keywords are counted together.
    newkeywords[1] = newkeywords[1].apply(
        lambda raw: [kw.strip() for kw in raw[1:-1].split(',')]
    )
    newkeywords = newkeywords.explode(1)

    # Filter on the value_counts Series directly: wrapping it in a DataFrame
    # and selecting column 1 breaks on pandas >= 2.0, where the counts column
    # is named 'count'.
    keyword_counts = newkeywords[1].value_counts()
    frequent_keywords = keyword_counts[keyword_counts > threshold].index
    newkeywords_selected = newkeywords[newkeywords[1].isin(frequent_keywords)]
    newkeywords_selected = newkeywords_selected.rename(columns={0: 'imdbID'})

    newkeywords_grouped = (
        newkeywords_selected.groupby('imdbID')[1]
        .apply(list)
        .reset_index(name='newkeywords')
    )
    mlb = MultiLabelBinarizer()
    newkeywords_enc = pd.DataFrame(mlb.fit_transform(newkeywords_grouped['newkeywords']))
    newkeywords_grouped = newkeywords_grouped.join(newkeywords_enc).drop(columns=['newkeywords'])
    return newkeywords_grouped

In [25]:
# flag, per plot, which stemmed words that occur more than `threshold` times overall it contains
def get_plots(threshold=100):
    """Multi-hot encode, per movie, the frequent stemmed words of its plot.

    Reads ``plot.csv`` (uses its ``imdbID`` and ``Plot`` columns) and drops
    rows with missing values. Every plot is stripped of Unicode punctuation,
    tokenized, lower-cased, filtered against the English stop words plus a
    custom list, and Porter-stemmed. A stem is kept when it occurs strictly
    more than ``threshold`` times across all plots; movies without any kept
    stem do not appear in the result.

    Args:
        threshold: Minimum (exclusive) number of occurrences of a stem.

    Returns:
        pandas.DataFrame with the column ``imdbID`` followed by
        integer-labelled 0/1 indicator columns (``0 .. k-1``), one per kept
        stem class found by the binarizer.
    """
    plots = pd.read_csv('../../data/preprocessed/plot.csv').dropna().copy()

    # Translation table mapping every code point of a Unicode punctuation
    # category (P*) to None, i.e. deleting it in str.translate.
    punctuation = dict.fromkeys(
        i for i in range(sys.maxunicode)
        if unicodedata.category(chr(i)).startswith('P')
    )
    # A set gives constant-time membership tests in the per-token filter.
    stop_words = set(stopwords.words('english')) | {
        'find', 'one', 'two', 'three', 'four', 'set', 'film', 'come', 'get',
        'take', 'must', 'make', 'go', 'high', 'former', 'look', 'movie', 'us',
        'use', 'whose', 'stop', 'sent', 'series', 'another', 'arrive', 'ii',
        'bring', 'see', 'big', 'keep', 'cause', 'because', 'he', 'leave',
    }
    porter = PorterStemmer()

    def _to_stems(text):
        # Order matters: lower-case, drop stop words, then stem.
        tokens = (tok.lower() for tok in word_tokenize(text.translate(punctuation)))
        return [porter.stem(tok) for tok in tokens if tok not in stop_words]

    plots['Plot'] = plots['Plot'].apply(_to_stems)
    plots = plots.explode('Plot')

    # Filter on the value_counts Series directly: wrapping it in a DataFrame
    # and selecting the 'Plot' column breaks on pandas >= 2.0, where the
    # counts column is named 'count'.
    stem_counts = plots['Plot'].value_counts()
    frequent_stems = stem_counts[stem_counts > threshold].index
    plots_selected = plots[plots['Plot'].isin(frequent_stems)]

    plots_grouped = (
        plots_selected.groupby('imdbID')['Plot']
        .apply(list)
        .reset_index(name='plots')
    )
    mlb = MultiLabelBinarizer()
    plots_enc = pd.DataFrame(mlb.fit_transform(plots_grouped['plots']))
    plots_grouped = plots_grouped.join(plots_enc).drop(columns=['plots'])
    return plots_grouped

In [26]:
# flag, per movie, which directors with more than `threshold` entries directed it
def get_directors(threshold=5):
    """Build a multi-hot matrix of frequently occurring directors per movie.

    Reads ``directors.csv`` (uses its ``directorID`` and ``movieID`` columns)
    and ``movies_id_updated.csv`` (uses ``id`` and ``imdbID``). Only
    directors whose ``directorID`` occurs strictly more than ``threshold``
    times in ``directors.csv`` are kept; movies without any such director do
    not appear in the result.

    Args:
        threshold: A director is kept when its number of rows in
            ``directors.csv`` is greater than this value.

    Returns:
        pandas.DataFrame with the column ``imdbID`` followed by
        integer-labelled 0/1 indicator columns (``0 .. k-1``), one per kept
        director class found by the binarizer.
    """
    directors = pd.read_csv('../../data/raw/directors.csv', sep=',')
    movies = pd.read_csv('../../data/preprocessed/movies_id_updated.csv')
    mapping = movies[['id', 'imdbID']].rename(columns={'id': 'movieID'})

    # Filter on the value_counts Series itself: wrapping it in a DataFrame and
    # selecting the 'directorID' column breaks on pandas >= 2.0, where the
    # counts column is named 'count'.
    director_counts = directors['directorID'].value_counts()
    frequent_ids = director_counts[director_counts > threshold].index
    directors_selected = directors[directors['directorID'].isin(frequent_ids)]

    # Attach the imdbID and collect the kept directors of every movie in a list.
    directors_grouped = (
        directors_selected.merge(mapping, on='movieID')
        .groupby('imdbID')['directorID']
        .apply(list)
        .reset_index(name='directors')
    )
    mlb = MultiLabelBinarizer()
    directors_enc = pd.DataFrame(mlb.fit_transform(directors_grouped['directors']))
    directors_grouped = directors_grouped.join(directors_enc).drop(columns=['directors'])
    return directors_grouped