In [2]:
# Import the needed dependencies below.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import altair as alt # graphical representation tool
alt.renderers.enable('notebook')

ValueError: 
To use the 'notebook' renderer, you must install the vega package
and the associated Jupyter extension.
See https://altair-viz.github.io/getting_started/installation.html
for more information.


In [None]:
netflix_movies = pd.read_csv('netflix_titles.csv')
netflix_stocks = pd.read_csv('NFLX.csv')

In [None]:
netflix_movies = netflix_movies[netflix_movies['type']=='Movie'].dropna()
netflix_movies.head()

In [None]:
netflix_stocks.head()

Now, that we have our DataFrames ready; we can continue to doing some investigative work. Let's check the descriptive statistics for both stocks and a couple different segmentations for movies with pd.DataFrame.describe().

In [None]:
netflix_movies.date_added = pd.to_datetime(netflix_movies.date_added)
movies_over_time = netflix_movies.copy().groupby('release_year')['title'].count().reset_index()
movies_over_time = movies_over_time.rename(columns={0:'count'})

median_year = int(netflix_movies ['release_year'].median()) #2016
mode_year = int(netflix_movies ['release_year'].mode())
print("this is the median:", median_year, ", this is the mode:", mode_year)
movies_over_time.describe()

In [None]:
netflix_stocks['Date'] = pd.to_datetime(netflix_stocks['Date'])

netflix_stocks.describe()

We get to see our maxes and mins & highs and lows. We have a pretty good idea of the distribution for the movies and the average closing dollar amount. Could Netflix releases over the years correlated with the stock price? Can we tell if our answer is statistically significant?

If you think about releases, there are four elements of releases that might catch a movie lover's attention. You have:
1. Main film location:  language matters but relativity to the elements in the movie and being familiar with the references is quentissential in most instances. 
2. Director: we all have our favorite director. It's the movie that was most memorable or the one that made you laugh the most. You loved the cinamatic touches they added. You strap in for a good movie when you hear what's on for movie night.
3. Actor: Leonardo DiCaprio gets my buy in on any movie. We all have that person of a few! Is their a primary actor driving the market?
4. Genre: I love a good horror movie, don't you? Not your forte? Let's see if the genres of the movie releases and the stock price have a relationship.

These ideas are all speculative, but I invite my readers to stay the course. The insights and graphs are at the bottom. Continue reading if you'd like.

In [None]:
# Assuming the first movie listed is where this movie was filmed the most, we will replace the lists of movies
# with the first country and use that to find the top 10 countries that have movies are currently on Netflix 

movies_with_countries = netflix_movies.copy()
new=movies_with_countries['country'].str.split(',', n=5, expand=True)
countries = [('1st',0), ('2nd',1),('3rd',2),('4th',3),('5th',4),('6th',5)]
for i in countries:
    movies_with_countries[i[0]+" country"] = new[i[1]]
new_mwc = movies_with_countries.drop(columns=['show_id','type','director','cast'])
distribution_df = new_mwc[['1st country',
                                         'title']].groupby(
    '1st country'
).count().reset_index()
distribution_df.head()

We could say Hollywood knows how to make a good movie? Or should we say that one director is taking center stage and we just can't get enough?

In [None]:
directors = movies_with_countries.copy()
directors = directors[directors['country']=='United States'][['director','title','rating','listed_in']]
directors.head()


new_dir = directors['director'].str.split(',', n=2, expand=True)
split = [('1st',0), ('2nd',1)]

# does genre matter?
for i in split:
    directors[i[0]+" Named Director"] = new_dir[i[1]]
top_directors = directors[['1st Named Director','title','rating','listed_in']].groupby(
    '1st Named Director'
)['title'].count().reset_index()


top_directors[top_directors['title']==top_directors.title.max()]

Let's be honest, Jay Karas is taking the market by storm with his work in comedy, so this one isn't too much of a surprise! Let's look for other trends that are in the data. Later, we will investigate if there is a correlation between the movies released and the stock price. We can't say conclusively that there is a direct cause and effect relationship; it's interested to find patterns in data, just for fun!

Now, we have to ask, *does the year a movie was released matter?* I can imagine that there have been times that the film industry took a hit and not too many movies reached production. We can only speculate about the greater implications. It could just mean that the most popular year had the best movies. We decide what this metric becomes in the end! 

In [None]:
# does genre matter?
genre = []
for i in genre:
    directors[i[0]+" genre"] = genre[i[1]]
top_directors = directors[['director','title','rating','listed_in']].groupby(
    'director'
)['title'].count().reset_index()

In [None]:
titles_per_country = alt.Chart(distribution_df,
                              height=300,
                              width=500,
                              title='Number of Titles per Country').transform_window(
    rank='rank(title)',
    sort=[alt.SortField('title', order='descending')]
).transform_filter(
    alt.datum.rank <=10
).mark_bar(color='red').encode(
    y=alt.Y('title:Q',
           axis=alt.Axis(title='Number of Movies',
                         tickCount=4,
                         labelPadding=10,
                        titlePadding=10)),
    x=alt.X('1st country:N',
           sort=alt.EncodingSortField('title',order='descending'),
           title=None,
           axis=alt.Axis(
                        labelAngle=-45,
               labelFontSize=15
           )),
    color=alt.condition(
        alt.datum.rank == 1,
        alt.value('red'),
        alt.value('black')
    ),
    tooltip=['1st country:N', 'title']
)

titles_by_director = alt.Chart(top_directors,
                              height=300,
                              width=500,
                              title='Number of Titles by Director').transform_window(
    rank='rank(title)',
    sort=[alt.SortField('title', order='descending')]
).transform_filter(
    alt.datum.rank <=10
).mark_bar(color='red').encode(
    y=alt.Y('title:Q',
           axis=alt.Axis(title='Number of Movies',
                         tickCount=4,
                         labelPadding=10,
                        titlePadding=10)),
    x=alt.X('1st Named Director:N',
           sort=alt.EncodingSortField('title',order='descending'),
           title=None,
           axis=alt.Axis(
                        labelAngle=-45,
               labelFontSize=15
           )),
    color=alt.condition(
        alt.datum.rank == 1,
        alt.value('red'),
        alt.value('black')
    ),
    tooltip=['1st Named Director:N', 'title']
)

titles_by_year = alt.Chart(movies_over_time,
                              height=300,
                              width=500,
                              title='Number of Titles by Year').transform_window(
    rank='rank(title)',
    sort=[alt.SortField('title', order='descending')]
).transform_filter(
    alt.datum.rank <=10
).mark_bar(color='red').encode(
    y=alt.Y('title:Q',
           axis=alt.Axis(title='Number of Movies',
                         tickCount=4,
                         labelPadding=10,
                        titlePadding=10)),
    x=alt.X('release_year:N',
           sort=alt.EncodingSortField('title',order='descending'),
           title=None,
           axis=alt.Axis(
                        labelAngle=-45,
               labelFontSize=15
           )),
    color=alt.condition(
        alt.datum.rank == 1,
        alt.value('red'),
        alt.value('black')
    ),
    tooltip=['release_year:N', 'title']
)

In [None]:
# Coding the graphs done in hidden cell. Data used in the graphs are visible in the cells above.
# Remake the year chart into something pertaining to actors, then continue with the genre scripting aboved
alt.vconcat((titles_per_country | titles_by_director),(titles_by_year))