In [4]:
# install.packages('ggraph')
# install.packages('igraph')

In [5]:
library('data.table')
library('dplyr')
library('tidyr')
library('ggplot2')
library('scales')
library('igraph')
library('ggraph')

In [6]:
# Read the six input tables from the working directory.
# The literal string \N in the files is treated as a missing value (NA).
# quote = "" switches off quote handling for the tables that pass it.
titles     <- fread(input = "title_ratings",       na.strings = "\\N", quote = "")
aka        <- fread(input = "aka_filtered",        na.strings = "\\N", quote = "")
crew       <- fread(input = "crew_filtered",       na.strings = "\\N")
principals <- fread(input = "principals_filtered", na.strings = "\\N", quote = "")
ep_series  <- fread(input = "episodes_filtered",   na.strings = "\\N")
names      <- fread(input = "names_filtered",      na.strings = "\\N")

ERROR: Error in fread("title_ratings", quote = "", na.strings = "\\N"): File 'title_ratings' does not exist or is non-readable. getwd()=='/home/rastislav/Documents/skola/erasmus/visualisation/imdb/Data-Visualization'


In [None]:
# titles and their ratings
# Show the first three rows of every loaded table.

# titles together with their ratings
head(titles, n = 3)
# title names in other languages (probably not relevant)
head(aka, n = 3)
# writers and directors listed per title
head(crew, n = 3)
# roles a person has in a given title
head(principals, n = 3)
# mapping from each episode to its parent tvSeries
head(ep_series, n = 3)
# people's names together with the titles they starred in
head(names, n = 3)

In [None]:
# First three rows of the episode -> series mapping, then its row count.
ep_series %>% head(3)
ep_series %>% count()

## Further filtering

### Series episodes split

In [None]:
# Partition `titles` by type: rows whose titleType is "tvEpisode" go to
# `episodes`, every other (non-NA) type goes to `series`.
series <- filter(titles, titleType != 'tvEpisode')
episodes <- filter(titles, titleType == 'tvEpisode')

# Row counts of the two partitions.
series %>% count()
episodes %>% count()

### Keep only series with at least 1000 votes and the elements related to them

In [None]:
# Keep only the series rows that have at least 1000 votes.
series <- filter(series, numVotes >= 1000)

In [None]:
# Keep only the episode links whose parent is one of the retained series.
ep_series <- filter(ep_series, parentTconst %in% series$tconst)

In [None]:
# Keep only the episodes that are still present in the episode -> series links.
episodes <- filter(episodes, tconst %in% ep_series$tconst)

In [None]:
# Restrict the related tables to the retained series and episodes.
# `principals` is overwritten in place; the other three results are stored
# under new *_filtered names.
# NOTE(review): the later transformation cells operate on `crew` and `names`,
# not on `crew_filtered` / `names_filtered` - confirm this is intended.
principals <- filter(principals, tconst %in% c(series$tconst, episodes$tconst))
names_filtered <- filter(names, nconst %in% principals$nconst)
aka_filtered <- filter(aka, titleId %in% c(series$tconst, episodes$tconst))
crew_filtered <- filter(crew, tconst %in% c(series$tconst, episodes$tconst))

### Data transformations

In [None]:
# series and episodes: titleType becomes a factor, and the comma-separated
# genres string is split into a list column (one character vector per row).
series <- series %>%
  mutate(
    titleType = as.factor(titleType),
    genres = strsplit(genres, split = ',')
  )
episodes <- episodes %>%
  mutate(
    titleType = as.factor(titleType),
    genres = strsplit(genres, split = ',')
  )
head(episodes)

In [None]:
# crew: split the comma-separated directors and writers strings into
# list columns.
crew <- crew %>%
  mutate(directors = strsplit(directors, split = ',')) %>%
  mutate(writers = strsplit(writers, split = ','))
head(crew, n = 3)

In [None]:
# principals: category -> factor; characters -> list column of names.
# The characters string is stripped of '[', ']' and '"' in one pass and then
# split on commas. Inside the bracket expression ']' is listed first so that
# it is taken literally. This replaces four separate gsub() calls, two of
# which were exact duplicates; the resulting values are unchanged.
# NOTE(review): a character name that itself contains a comma is also split
# here - confirm that is acceptable for the data.
principals <- principals %>%
  mutate(
    category = as.factor(category),
    characters = strsplit(gsub("[][\"]", "", characters), split = ",")
  )
head(principals, 3)

In [None]:
# ep_series: convert the season and episode numbers to integers.
ep_series <- ep_series %>%
  mutate(
    seasonNumber = as.integer(seasonNumber),
    episodeNumber = as.integer(episodeNumber)
  )
head(ep_series, n = 3)

In [None]:
# names: split the comma-separated profession and known-for-title strings
# into list columns.
names <- names %>%
  mutate(primaryProfession = strsplit(primaryProfession, split = ',')) %>%
  mutate(knownForTitles = strsplit(knownForTitles, split = ','))
head(names, n = 3)

In [None]:
# Run the garbage collector after the transformation cells above to free up
# memory. As the only expression in the cell, the value returned by gc() is
# also what the cell displays.
# NOTE(review): no rm() call appears before this point in the visible cells,
# so the amount reclaimed may be small - confirm this call is still needed.
gc()

## Project

### People + aggregate data of the series they played in

In [None]:
# Join every principal credit to the series it belongs to (matching on
# tconst); with all = FALSE only credits that have a matching series row,
# and series that have at least one credit, are kept.
title_person <- merge(
  x = principals,
  y = series,
  by = "tconst",
  all = FALSE
)
head(title_person, n = 10)

In [None]:
# Per-person rating aggregates over all rows of title_person:
#   meanRating     - plain mean of averageRating
#   averageVotes   - mean of numVotes
#   sumVotes       - total of numVotes
#   weightedRating - sum(averageRating * numVotes) divided by sumVotes
# The division happens in a separate transform() step once sumVotes exists.
name_ratings <- title_person %>%
  group_by(nconst) %>%
  summarise(
    meanRating = mean(averageRating),
    averageVotes = mean(numVotes),
    weightedRating = sum(averageRating * numVotes),
    sumVotes = sum(numVotes)
  ) %>%
  transform(weightedRating = weightedRating / sumVotes)

In [None]:
# Same per-person aggregates as above, restricted to rows whose category is
# exactly 'actor'.
# NOTE(review): any other category value (for example a separate 'actress'
# category, if the data has one) is excluded by this filter - confirm intent.
actor_rating <- title_person %>%
  filter(category == 'actor') %>%
  group_by(nconst) %>%
  summarise(
    meanRating = mean(averageRating),
    averageVotes = mean(numVotes),
    weightedRating = sum(averageRating * numVotes),
    sumVotes = sum(numVotes)
  ) %>%
  transform(weightedRating = weightedRating / sumVotes)
head(actor_rating, n = 3)

### Actors working well together

In [None]:
# Graph packages used by the plotting cell below. The package that provides
# ggraph() is named 'ggraph' (the previous 'gggraph' does not match the
# package loaded at the top of the notebook).
library(igraph)
library(ggraph)

In [None]:
# Circular dendrogram layout of `mygraph` with bundled connections drawn
# between the `from` and `to` nodes; leaf nodes are drawn as points pushed
# slightly outwards (coordinates scaled by 1.05).
# NOTE(review): `mygraph`, `from` and `to` are not defined in the visible
# cells - they must be created before this cell can run.
ggraph(graph = mygraph, layout = 'dendrogram', circular = TRUE) +
  geom_conn_bundle(
    data = get_con(from = from, to = to),
    colour = "skyblue",
    alpha = 0.2,
    tension = 0
  ) +
  geom_node_point(aes(x = x * 1.05, y = y * 1.05, filter = leaf)) +
  theme_void()