In [None]:
# Ignore this cell: it only injects CSS so that emphasized (<em>) text renders in red and
# the notebook container uses the full page width. You don't need to understand it.
# `display` and `HTML` come from the public IPython.display module; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML('<style>em { color: red; }</style> <style>.container { width:100% !important; }</style>'))

In [None]:
import pandas as pd
from pandas import DataFrame, Series
import matplotlib

In [None]:
import random

In [None]:
%matplotlib inline

In [None]:
# Set matplotlib's default font size to 16 for every figure drawn after this cell.
matplotlib.rcParams.update({"font.size": 16})

# Stock Market Example

In [None]:
# Load the stock market data; later cells read its "year" and "return" columns.
df = pd.read_csv("sp500.csv")
df.head()  # preview the first five rows

### How do we compute *total gain* for *every year*?

In [None]:
# cumprod() is a running product: row i of "tot" is the product of every "return" value
# from the first row through row i.
# NOTE(review): this assumes "return" holds growth multipliers (e.g. 1.05 for +5%),
# not percentages — confirm against sp500.csv.
df["tot"] = df["return"].cumprod()
df.head()

### Assuming you initially invested *1000 dollars in 1970*, what is your *total wealth* in every future year?

In [None]:
starting = 1000  # initial investment, in dollars
# Scale the cumulative growth factor by the initial investment to get the wealth per row.
df["wealth"] = df["tot"] * starting
df.set_index("year").tail()  # show the last five rows, labelled by year

### How do we create a *line plot* to plot *wealth gain* over the years?

In [None]:
# Index by year so the x-axis shows the years. plot.line() returns the matplotlib Axes,
# which is kept in `ax` so the y-axis can be labelled afterwards.
ax = df.set_index("year")["wealth"].plot.line()
ax.set_ylabel("Wealth ($)")

### Random simulation

In [None]:
# random.choice returns one element of the list, picked at random.
random.choice([3, 4, 9])

In [None]:
# random.choices draws k elements; the same element may be picked more than once.
random.choices([3, 4, 9], k = 10) # Sampling with replacement

### How risky is investing in the S&P 500 index going forward?
### Approach:
1. Run simulation, randomly replaying past years.
2. Run hundreds of simulations, explore range of outcomes.

In [None]:
# Step 1 in above approach
def run_sim(years=10, starting=1000, returns=None):
    """Simulate one possible future by randomly replaying past yearly returns.

    Parameters
    ----------
    years : int
        Number of years to simulate (one random draw per year).
    starting : number
        Initial amount invested.
    returns : sequence of numbers, optional
        Historical yearly return multipliers to sample from, with replacement.
        When omitted, the "return" column of the notebook-level ``df`` is used,
        looked up at call time.

    Returns
    -------
    Series
        One value per simulated year: ``starting`` multiplied by the running
        product of the sampled returns.
    """
    if returns is None:
        # NOTE: `df` is rebound to the fire hydrant data later in this notebook, so the
        # default only works while `df` still holds the stock market frame.
        returns = df["return"]
    # random.choices looks elements up with plain integers 0..len-1. Sampling from a list
    # makes that lookup positional; on a Series the integers would be treated as index
    # labels, which only works when the index happens to be 0..len-1.
    history = list(returns)
    rand_returns = Series(random.choices(history, k=years))
    return rand_returns.cumprod() * starting

# Step 2 in above approach
def run_n_sims(n, years=10, starting=1000):
    """Run ``n`` independent simulations and collect them side by side.

    Parameters
    ----------
    n : int
        Number of simulations to run.
    years : int
        Number of years per simulation, forwarded to ``run_sim``.
    starting : number
        Initial amount invested, forwarded to ``run_sim``.

    Returns
    -------
    DataFrame
        One column per simulation, named "sim0", "sim1", ..., and one row per
        simulated year.
    """
    # Build every column first and construct the frame once. Inserting columns one at a
    # time into an existing DataFrame fragments it, and pandas emits a PerformanceWarning
    # once many columns have been added that way.
    columns = {
        "sim" + str(i): run_sim(years=years, starting=starting)
        for i in range(n)
    }
    return pd.DataFrame(columns)

### For *10 years*, run *250 simulations*

In [None]:
# Relies on the defaults of run_n_sims (years=10, starting=1000); the result has one
# column per simulation.
sims = run_n_sims(250)
sims.tail()  # show the last five rows

## How do you become a successful Data Scientist?
- By learning how to play both the detective and the lawyer

### How do we create a *line plot* to visualize all of the simulations?
- Be the detective

In [None]:
# One line per simulation; the legend is hidden because one entry per simulation would
# crowd the plot.
ax = sims.plot.line(legend=False)
# Label the axes so the figure can be read on its own. Row 0 holds the wealth after the
# first simulated year.
ax.set_xlabel("Simulated year (0 = after first year)")
ax.set_ylabel("Wealth ($)")

In [None]:
# Display the simulation table: one row per simulated year, one column per simulation.
sims

### How do we create a *line plot* to present *statistics* of all of the simulations?
- Be the lawyer

In [None]:
# .quantile function gives percentiles: quantile(q, axis=1) is computed across the
# simulations (columns) separately for each simulated year (row), so q=0.05 yields the
# 5th percentile of wealth in that year.
ax = DataFrame({
    "5th percentile": sims.quantile(0.05, axis=1),
    "median": sims.quantile(0.5, axis=1),
    "95th percentile": sims.quantile(0.95, axis=1),
}).plot.line()
# Label the axes so the summary figure can be read on its own.
ax.set_xlabel("Simulated year (0 = after first year)")
ax.set_ylabel("Wealth ($)")

# Bar Plot Example w/ Fire Hydrants

In [None]:
# NOTE: this rebinds `df`, replacing the stock market frame loaded earlier. run_sim looks
# up the global `df` when it is called, so re-running the simulation cells after this
# cell will no longer see the stock data.
df = pd.read_csv("Fire_Hydrants.csv")
df.columns  # list the available column names

In [None]:
df.head()  # preview the first five rows of the hydrant data

In [None]:
df.columns  # repeats the column listing shown right after loading the file

### Let's create a *bar plot* to visualize *colors* of fire hydrants.

In [None]:
# Data cleaning: upper-case every nozzle color so that spellings differing only in letter
# case end up as the same value.
orig_colors = df["nozzle_color"].str.upper()
orig_colors.head(2)

In [None]:
# value_counts() tallies how often each distinct color occurs, most frequent first.
color_counts = orig_colors.value_counts() # in SQL, it's like GROUP BY with COUNT(*)
color_counts

In [None]:
# NOTE(review): the colors are applied to the bars by position, so this list presumably
# mirrors the order of color_counts (most frequent color first) — confirm against the
# data, since a change in the counts would silently mismatch bars and colors.
ax = color_counts.plot.bar(color=["b", "g", "darkorange", "r", "c", "0.5"])
ax.set_ylabel("Hydrant Count")

### Let's create a *bar plot* to visualize *style* of fire hydrants.

In [None]:
# Count hydrants per style, ignoring letter case, and draw one bar per style.
counts = df["Style"].str.upper().value_counts()
ax = counts.plot.bar()
# Label the count axis, as the nozzle-color bar plot does.
ax.set_ylabel("Hydrant Count")

In [None]:
# Same tally as in SQL: GROUP BY style, COUNT(*), ORDER BY count (largest first).
counts = df["Style"].str.upper().value_counts()
# A Series can be sliced like a list; head(12) is the same slice as iloc[:12] and keeps
# the 12 most common styles.
top12 = counts.head(12)
# A Series also accepts new keys like a dict: every remaining style is summed into a
# single "other" entry.
top12["other"] = counts.iloc[12:].sum()
ax = top12.plot(kind="barh", color="k")
ax.set_xlabel("Hydrant Count")

### NEXT LECTURE: In what decade were pacers manufactured? .... The last lecture on plotting is coming!