In [None]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats

# Default plot size used by every figure rendered in this notebook
options(repr.plot.width = 18, repr.plot.height = 10)

# Install 'countrycode' only when it is not already available, so that
# re-running the notebook does not download the package again every time
if (!requireNamespace("countrycode", quietly = TRUE)) {
    install.packages("countrycode")
}
library(countrycode) # used later to turn country codes into country names
library(tidyverse) # metapackage of all tidyverse packages

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

list.files(path = "../input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the titles table (one row per movie / TV show) into a tibble
netflix <- read_csv(file = '../input/netflix-tv-shows-and-movies/titles.csv')

# **Cleaning the dataset**
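Every entry has a list of genres and a list of production countries in the format `['entry1', 'entry2']`. Empty lists (`[]`) are treated as missing values, and the remaining lists are converted to the comma-separated format `entry1,entry2` (brackets, quotes and spaces removed).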

In [15]:
# Cleaning
# Step 1: empty lists ("[]") become NA.
# Step 2: brackets, quotes and whitespace are stripped, so "['a', 'b']" becomes "a,b".
netflix <- netflix %>%
    mutate(across(c(genres, production_countries), ~ na_if(.x, "[]"))) %>%
    mutate(across(c(genres, production_countries),
                  ~ gsub("\\[|\\]|\\'|\\s", "", as.character(.x))))

# Step 3: replace the upper-case type codes with display labels
netflix <- netflix %>%
    mutate(type = gsub("MOVIE", "Movie", as.character(type)),
           type = gsub("SHOW", "TV Show", as.character(type)))

# Content Type: Pie Chart

A pie chart showing the distribution of content types, i.e. TV shows vs. movies.

In [16]:
# Content Type
# A stacked single bar in polar coordinates renders as a pie chart;
# each slice is labelled with its share of all titles.
netflix %>%
    count(type) %>%
    ggplot(aes(x = "", y = n, fill = type)) +
    geom_col() +
    geom_text(
        aes(label = scales::percent(n / sum(n), accuracy = 0.01)),
        position = position_stack(vjust = 0.5),
        size = 6
    ) +
    coord_polar(theta = "y") +
    scale_fill_manual(values = c("#8F00FF", "#FF8000")) +
    labs(title = 'Content Type', x = "", y = "", fill = "Type") +
    theme(
        # Title and legend typography
        plot.title = element_text(size = 30, face = "bold"),
        legend.title = element_text(size = 20, face = "bold"),
        legend.text = element_text(size = 20),
        legend.spacing.x = unit(2, "line"),
        legend.position = "bottom",
        # Axes, grid and backgrounds carry no information in a pie chart
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        plot.background = element_blank()
    )
    

# Distribution of Movies/TV Shows over age-certifications: Bar Chart

The bar chart shows that Netflix has more titles rated TV-MA and R (18+) than titles with ratings suitable for kids.

In [17]:
# Age - Certification
# Count titles per certification, drop the missing-certification row,
# and order the bars from most to least frequent.
netflix %>%
    count(age_certification) %>%
    drop_na() %>%
    ggplot(aes(x = reorder(age_certification, -n), y = n)) +
    geom_col(aes(fill = n)) +
    scale_fill_continuous(low = "#FF9B9B", high = "#FF3F3F") +
    labs(title = 'Age - Certification', x = "", y = "Number of Movies / TV Shows") +
    theme(
        plot.title = element_text(size = 30, face = "bold"),
        axis.title = element_text(size = 20),
        axis.text = element_text(size = 15),
        panel.background = element_blank(),
        # The fill gradient only repeats the bar height, so no legend is shown
        legend.position = "none"
    )

# Number of Titles Released per Year: A histogram

A huge spike is observed in the number of titles on Netflix released from 2019 onward, peaking during the COVID-19 pandemic.

In [18]:
# Release Year vs. number of titles
# One bin per year; bars are stacked by content type.
netflix %>%
    ggplot(aes(x = release_year, fill = type)) +
    geom_histogram(binwidth = 1, color = "black", alpha = 0.9) +
    labs(title = 'Release Year', x = "Release Year", y = "Number of Movies / TV Shows", fill = "Type") +
    scale_fill_manual(values = c("#0073C2FF", "#EFC000FF")) +
    # The plot is not faceted, so the strip/panel-spacing settings were dropped,
    # and the argument list no longer ends with a dangling comma.
    theme(
        plot.title = element_text(size = 30, face = "bold"),
        axis.text = element_text(size = 15),
        axis.title = element_text(size = 20),
        legend.title = element_text(size = 20),
        legend.text = element_text(size = 15)
    )

# Distribution of Netflix Titles by Genre: Bar Plot

In [20]:
# Cleaning Genres
# Split the comma-separated genre list into one row per (title, genre), then
# spread it into one indicator column per genre: 1 when the title has the
# genre, 0 otherwise.
netflix_genre <- netflix %>%
    separate_rows(genres, sep = ",") %>%
    pivot_wider(names_from = genres, values_from = genres,
                values_fn = function(x) 1, values_fill = 0) %>%
    # Titles with a missing genre list produce a column literally named "NA".
    # any_of() removes it when present and does nothing when it is absent,
    # so the cell no longer fails on data without missing genres.
    select(-any_of("NA"))

In [21]:
# Genres Dataframe
# Sum the genre indicator columns per content type and reshape to long form:
# one row per (genre, type) with the number of titles in `count`.
#
# The genre columns are the ones that do not exist in the original `netflix`
# table, so they are selected by exclusion instead of relying on a fixed
# first/last genre name or on fixed column positions.
netflix_genre <- netflix_genre %>%
    group_by(type) %>%
    summarize(across(!any_of(names(netflix)), sum), .groups = "drop") %>%
    pivot_longer(cols = -type, names_to = "genre", values_to = "count") %>%
    # Same columns and column order as before: genre, type, count (numeric)
    select(genre, type, count) %>%
    as.data.frame()

In [22]:
# Genres Plot
# Horizontal stacked bars, one per genre, split by content type.
netflix_genre %>%
    ggplot(aes(x = reorder(genre, count), y = count, fill = type)) +
    geom_col() +
    coord_flip() +
    scale_fill_manual(values = c("#0073C2FF", "#EFC000FF")) +
    labs(title = 'Genre Distribution', x = "", y = "Number of Movies / TV Shows", fill = "Type") +
    theme(
        plot.title = element_text(size = 30, face = "bold"),
        axis.title = element_text(size = 20),
        axis.text = element_text(size = 15, face = "bold"),
        legend.title = element_text(size = 20),
        legend.text = element_text(size = 15),
        legend.position = "top",
        panel.background = element_blank()
    )


# Runtimes Histogram 

In [23]:
# Movie & TV Show Runtime Histogram
# Titles with a runtime of 0 are excluded. filter() also drops rows whose
# runtime is missing; logical bracket indexing would instead turn them into
# all-NA rows and create a spurious "NA" facet.
netflix %>%
    filter(runtime != 0) %>%
    ggplot(aes(x = runtime)) +
    geom_histogram(binwidth = 2, color = '#158301', fill = '#47BD4C') +
    labs(title = 'Movies & TV Shows: Runtimes', x = "Runtime", y = "Number of Movies / TV Shows") +
    facet_wrap(~type) +
    theme(
        plot.title = element_text(size = 30, face = "bold"),
        axis.text = element_text(size = 15),
        strip.text.x = element_text(size = 20),
        axis.title = element_text(size = 20),
        panel.spacing = unit(3, "lines")
    )


# IMDB rating histogram

In [24]:
# Keep only titles that have IMDB data. The plots below use `imdb_score`, so
# rows with a missing score are dropped as well as rows with missing votes.
netflix_imdb <- drop_na(netflix, imdb_score, imdb_votes)

# IMDB Ratings distribution, one panel per content type

netflix_imdb %>%
    ggplot(aes(x = imdb_score)) +
    geom_histogram(binwidth = 0.1, color = '#F20046', fill = '#FF4949') +
    labs(title = 'Movies & TV Shows: IMDB Rating', x = "IMDB Rating", y = "Number of Movies / TV Shows") +
    facet_wrap(~type) +
    theme(
        plot.title = element_text(size = 30, face = "bold"),
        axis.text = element_text(size = 15),
        strip.text.x = element_text(size = 20),
        axis.title = element_text(size = 20),
        panel.spacing = unit(3, "lines")
    )

# Runtimes vs. IMDB rating: Scatter Plot

In [25]:
# Runtimes vs Average IMDB rating
# One point per (content type, runtime): the mean IMDB score of the titles
# with that runtime, coloured by how many titles went into the mean.
#
# Grouping by both `type` and `runtime` matters because the plot is faceted by
# type. Grouping by runtime alone mixes movies and shows in each average and
# returns one duplicated row per title.

netflix_imdb %>%
    # Drop zero runtimes and any title without a score so the mean is defined
    filter(runtime != 0, !is.na(imdb_score)) %>%
    group_by(type, runtime) %>%
    summarize(avg_score = mean(imdb_score), count = n(), .groups = "drop") %>%
    ggplot() +
    geom_point(mapping = aes(x = runtime, y = avg_score, color = count, alpha = count), size = 5) +
    guides(alpha = "none", color = guide_colorbar(barwidth = unit(5, "cm"), direction = "horizontal")) +
    labs(title = 'Runtime vs. Average IMDB Rating', x = "Runtime", y = "Average IMDB Rating",
        color = "Number of Movies/TV Shows") +
    facet_wrap(~type) +
    scale_color_continuous(high = "#02007E", low = "#42C0FF") +
    theme(
        plot.title = element_text(size = 30, face = "bold"),
        axis.text = element_text(size = 15),
        strip.text.x = element_text(size = 20),
        axis.title = element_text(size = 20),
        panel.spacing = unit(3, "lines"),
        legend.title = element_text(size = 20),
        legend.text = element_text(size = 15),
        legend.position = "top"
    )

# Production Countries Distribution

In [26]:
# Countries

# Special case: the country name "Lebanon" is replaced by the code "LB".
# The replacement is limited to `production_countries` (a title or
# description equal to "Lebanon" is left alone) and also applies when
# "Lebanon" is one entry of a multi-country list.
netflix <- netflix %>%
    mutate(production_countries = gsub("Lebanon", "LB", production_countries, fixed = TRUE))

# Number of titles per production country; a title with several countries is
# counted once for each of them.
netflix_country <- netflix %>%
    separate_rows(production_countries, sep = ",") %>%
    drop_na(production_countries) %>%
    count(region = production_countries) %>%
    # "SU" and "XX" are excluded before the code-to-name conversion
    filter(!region %in% c("SU", "XX")) %>%
    # Convert codes to country names; custom_match overrides the default name
    # for the listed codes
    mutate(region = countrycode(region, origin = 'iso2c', destination = "country.name",
                                custom_match = c('US' = 'USA',
                                                 'GB' = "UK",
                                                 'VA' = 'Vatican',
                                                 'PS' = 'Palestine',
                                                 'CZ' = 'Czech Republic',
                                                 'HK' = 'China',
                                                 'CD' = "Democratic Republic of the Congo"))) %>%
    # Codes that countrycode() cannot convert become NA; drop them
    drop_na(region) %>%
    # Several codes can map to the same name (e.g. CN and HK -> "China").
    # Merge them so each region appears once and a later join on `region`
    # does not duplicate map polygons.
    group_by(region) %>%
    summarize(n = sum(n), .groups = "drop")

In [27]:
# Attach the per-country title counts to the world-map polygons; the inner
# join keeps only regions that appear in netflix_country.
netflix_mapdata <- map_data("world") %>%
    inner_join(netflix_country, by = "region")

# Choropleth: each country polygon is filled by its number of titles
netflix_mapdata %>%
    ggplot(aes(x = long, y = lat, group = group)) +
    geom_polygon(aes(fill = n), color = "black") +
    scale_fill_continuous(type = "viridis") +
    guides(fill = guide_colorbar(barwidth = unit(8, "cm"), direction = "horizontal")) +
    labs(title = "Production Countries", fill = "Number of Movies/TV Shows") +
    theme(
        plot.title = element_text(size = 30, face = "bold"),
        legend.title = element_text(size = 20),
        legend.text = element_text(size = 15),
        legend.position = "top",
        # Longitude/latitude axes add nothing to the map
        axis.text = element_blank(),
        axis.title = element_blank(),
        panel.background = element_blank()
    )