### Install may be required 

In [None]:
!pip install numpy pandas scipy matplotlib

In [2]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

### Run shell command `head` on filename

In [None]:
!head movies.csv

### Read in a CSV file and parse the `Release Date` field as dates

In [4]:
# Load the movie data; 'Release Date' is parsed as dates at read time.
movies = pd.read_csv('movies.csv', sep=',', parse_dates=['Release Date'])

### Show dimensions of the `movies` DataFrame

In [5]:
# (rows, columns) of the loaded frame
movies.shape

(4000, 5)

In [None]:
# Preview the first rows to sanity-check the load
movies.head()

### List columns and their data types

In [None]:
# Column names, non-null counts and dtypes
movies.info()

### Get statistical analysis on number value columns

In [None]:
# Summary statistics for the frame
movies.describe()

### Get statistics for a specific numerical column

In [None]:
# Summary statistics for the 'Domestic Gross' column only
movies.loc[:, 'Domestic Gross'].describe()

### Box plot specific column

In [None]:
# Horizontal box plot of domestic gross
movies['Domestic Gross'].plot.box(vert=False, figsize=(14, 6))

In [None]:
# Line plot of domestic gross against the row index
movies['Domestic Gross'].plot.line(figsize=(14, 6))

### Add additional data to plot  
Use `axvline`

In [None]:
domestic_gross = movies['Domestic Gross']
ax = domestic_gross.plot.density(figsize=(14, 6))
# Overlay vertical markers: mean in red-orange, median in green.
mean_gross, median_gross = domestic_gross.mean(), domestic_gross.median()
ax.axvline(mean_gross, color="#ff4422")
ax.axvline(median_gross, color='green')

In [None]:
# Histogram of domestic gross. The x axis is labelled with the quantity that
# is actually plotted (it previously read 'Length').
ax = movies['Domestic Gross'].plot(kind='hist', figsize=(14,6))
ax.set_ylabel('Number of Movies')
ax.set_xlabel('Domestic Gross')

In [None]:
# Re-display the first rows of the frame
movies.head()

### Get counts of each value for a column and sort

In [None]:
# Count how many movies share each release date, most frequent first.
release_counts = movies['Release Date'].value_counts()
release_counts.sort_values(ascending=False)

### Correlation of number value columns

In [None]:
# Correlate only the numeric columns. Since pandas 2.0, DataFrame.corr() no
# longer drops non-numeric columns silently and raises on text columns such as
# the movie title; select_dtypes works on old and new pandas versions alike.
corr = movies.select_dtypes(include='number').corr()
corr

### Plot correlation in Red/Blue box format

In [None]:
# Draw the correlation matrix as a red/blue heat map with the column names
# as tick labels on both axes.
fig = plt.figure(figsize=(8, 8))
labels = corr.columns
positions = range(len(labels))
plt.matshow(corr, cmap='RdBu', fignum=fig.number)
plt.xticks(positions, labels, rotation='vertical');
plt.yticks(positions, labels);

### Add a column in millions, rounded to two decimal places

In [None]:
# Domestic gross expressed in millions, rounded to two decimals
movies['Domestic Millions'] = movies['Domestic Gross'].div(1_000_000).round(2)
movies.head()

In [None]:
# Worldwide gross expressed in millions, rounded to two decimals
movies['Worldwide Millions'] = movies['Worldwide Gross'].div(1_000_000).round(2)
movies.head()

In [None]:
# Production budget expressed in millions, rounded to two decimals
movies['Budget Millions'] = movies['Production Budget'].div(1_000_000).round(2)
movies.head()


In [None]:
# Keep only the date, the title and the three "Millions" columns.
summary_columns = [
    'Release Date',
    'Movie Title',
    'Domestic Millions',
    'Worldwide Millions',
    'Budget Millions',
]
df = movies[summary_columns]
df.head()

### Find all the rows with no 'Release Date' and drop them - assign to new df

In [None]:
# Show the non-null counts before dropping rows without a release date ...
movies.info()
# NOTE: `df` is rebound here from the full `movies` frame (all columns),
# keeping only rows whose 'Release Date' is present.
df = movies.dropna(subset=['Release Date'])
# ... and after, for comparison.
df.info()


### Create a `df` with all values that have dates less than 2022  
### Use `.loc[mask]` to filter the values

In [None]:
# Keep only releases dated before 2022-01-01.
mask = (df['Release Date'] < "2022-01-01")
# Take an explicit copy: later cells add columns to `df`, and assigning into a
# filtered slice would otherwise raise SettingWithCopyWarning.
df = df.loc[mask].copy()
df.head()

#### Create new columns 'Year' and 'Month' using the 'Release Date' ('Release Date' must be a datetime)

In [23]:
# Derive calendar year and month number from the datetime column.
release_date = df['Release Date']
df['Year'] = release_date.dt.year
df['Month'] = release_date.dt.month


In [None]:
# Confirm the new columns and their dtypes
df.info()

### Plot budget and domestic box office dollars grouped by 'Year'

In [None]:
# Sum budget and domestic gross (both in millions) per release year, then
# draw both series as dashed lines on a shared axis.
yearly_totals = df.groupby('Year')[['Budget Millions', 'Domestic Millions']].sum()
yearly_totals.plot(linestyle='dashed', legend=True, grid=True, figsize=(14, 7))


### Plot Count of releases by Month of the year

In [None]:
# Number of releases per month, in value_counts order (most frequent first)
releases_per_month = df['Month'].value_counts()
releases_per_month.plot.bar(figsize=(14, 7), xlabel='Month', ylabel='Number of Releases')

### Plot total releases by month (sorted)  
also uses colormap for variety

In [None]:
# Same counts, ordered by month number (January first)
monthly_counts = df['Month'].value_counts().sort_index()
monthly_counts.plot.bar(figsize=(14, 7), colormap='Set2')

### Plot total release by month using `.groupby` - Second Method  
also use colormap for variety

In [None]:
# Count releases per month via groupby instead of value_counts
monthly_release_counts = df.groupby('Month')['Release Date'].count()
monthly_release_counts.plot.bar(figsize=(14, 7), colormap='Set2')

### Plot which months have the highest Domestic Totals

In [None]:
# Total domestic gross (millions) per release month
monthly_domestic = df.groupby('Month')['Domestic Millions'].sum()
monthly_domestic.plot.bar(figsize=(14, 7))

### More `.groupby` syntax

In [None]:
# Keep the GroupBy object so that individual years can be pulled out by key.
year_group = df.groupby('Year')
films_1997 = year_group.get_group(1997)
films_1997.set_index('Year')


### Same group lookup without the intermediate variable (index left unchanged)

In [None]:
# One-liner: group by year and fetch the 1997 rows directly
df.groupby(by='Year').get_group(1997)

### Plot biggest movies in specific year  
Use `.set_index` to index against the X axis

In [None]:
# Bar chart of domestic gross for every 1990 release, one bar per title.
# The column is expressed in millions, so the y label says so (it previously
# read just 'USD', which misstated the units).
year_group.get_group(1990).set_index('Movie Title')['Domestic Millions'] \
    .plot(kind='bar', figsize=(20,10), colormap='Set2', ylabel='Millions USD')

### Plot highest grossing Star Trek Films

In [None]:
# Boolean mask of titles containing 'Star Trek'. na=False treats missing
# titles as non-matches; without it the mask holds NaN and .loc raises
# "Cannot mask with non-boolean array containing NA / NaN values".
fltr = df['Movie Title'].str.contains('Star Trek', na=False)
df.loc[fltr].set_index('Movie Title')['Domestic Millions'].plot(kind='bar', figsize=(20,7))

### Create a column for Day of Week of Release

In [None]:
# Weekday name of each release date
weekday_names = df['Release Date'].dt.day_name()
df['Day of Release'] = weekday_names
df.head()

### Plot movie domestic totals by release day of week

In [None]:
# Total domestic gross (millions) per release weekday. groupby sorts its keys
# alphabetically (Friday, Monday, Saturday, ...), so reindex to calendar order
# to make the chart readable; weekdays with no releases show as 0.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_totals = df.groupby('Day of Release')['Domestic Millions'].sum()
weekday_totals.reindex(weekday_order, fill_value=0).plot(kind='bar', figsize=(20,7), ylabel='Millions USD')