In [None]:
# ignore this cell (it's just to make certain text red later, but you don't need to understand it).
# display/HTML are imported from the public IPython.display module; the IPython.core.display
# path for display() is deprecated.
from IPython.display import display, HTML
display(HTML('<style>em { color: red; }</style> <style>.container { width:100% !important; }</style>'))

In [None]:
import pandas as pd
from pandas import DataFrame, Series
import matplotlib

In [None]:
import random

In [None]:
# Render matplotlib figures directly in the notebook output.
%matplotlib inline

In [None]:
# Use a larger default font size for the figures drawn below.
matplotlib.rcParams.update({"font.size": 16})

# Stock Market Example

In [None]:
# Load the stock-market data; later cells read its "year" and "return" columns.
df = pd.read_csv(filepath_or_buffer="sp500.csv")
df.head(n=5)

### How do we compute *total gain* for *every year*?

In [None]:
# Running product of the "return" column = cumulative growth factor up to each row.
df["total"] = df.loc[:, "return"].cumprod()
df.tail(n=5)

### Assuming you initially invested *1000 dollars in 1970*, what is your *total wealth* in every future year?

In [None]:
# Scale the cumulative growth factor by the initial investment.
starting = 1000
df["wealth"] = df["total"].mul(starting)
df.tail(n=5)

### How do we create a *line plot* to plot *wealth gain* over the years?

In [None]:
# Index by year so the x-axis shows the year, then draw the wealth column as a line.
ax = df.set_index(keys="year").loc[:, "wealth"].plot.line()
ax.set_ylabel(ylabel="Total wealth ($)")

### Random simulation

In [None]:
random.choice([3, 5, 9]) # one uniformly random pick; calling it repeatedly amounts to sampling with replacement

In [None]:
# k independent draws from the list; the same element may come up more than once
# (sampling with replacement).
random.choices([3, 5, 9], k = 10)

### How risky is investing in the S&P 500 index going forward?
### Approach:
1. Run simulation, randomly replaying past years.
2. Run hundreds of simulations, explore range of outcomes.

In [None]:
# Step 1
def run_sim(years = 10, starting = 1000, returns = None):
    """Simulate one possible future by replaying randomly chosen returns.

    Parameters
    ----------
    years : int
        How many returns to draw (with replacement); also the length of the result.
    starting : number
        Initial amount; every value of the result is scaled by it.
    returns : sequence of numbers, optional
        Pool of return multipliers to sample from. When omitted, the "return"
        column of the module-level ``df`` is looked up at call time.

    Returns
    -------
    Series
        The running product of the sampled returns multiplied by ``starting``.
    """
    if returns is None:
        returns = df["return"]
    # Sample from a plain list: random.choices picks items by integer position, and
    # indexing a Series with an integer is a label lookup that only happens to work
    # when the Series carries the default 0..n-1 index.
    pool = list(returns)
    rand_returns = Series(random.choices(pool, k = years))
    return rand_returns.cumprod() * starting

# Step 2
def run_n_sims(n, years = 10, starting = 1000):
    """Run ``n`` independent simulations and collect them side by side.

    Parameters
    ----------
    n : int
        Number of simulations to run.
    years : int
        Length of each simulation, forwarded to ``run_sim``.
    starting : number
        Initial amount, forwarded to ``run_sim``.

    Returns
    -------
    DataFrame
        One column per simulation, named "sim0", "sim1", ..., "sim<n-1>".
        An empty DataFrame when ``n`` is 0.
    """
    # Forward years/starting so the caller's settings actually reach every simulation.
    columns = {
        "sim" + str(i): run_sim(years = years, starting = starting)
        for i in range(n)
    }
    # Build the frame in one go instead of inserting the columns one at a time.
    return pd.DataFrame(columns)

### For *10 years*, run *250 simulations*

In [None]:
# 250 simulated futures, one per column, using the default length and starting amount.
sims = run_n_sims(n=250)
sims.tail(n=5)

## How do you become a successful Data Scientist?
- By learning how to play both the detective and the lawyer

### How do we create a *line plot* to visualize all of the simulations?
- Be the detective

In [None]:
# One line per simulation column; the legend is switched off.
sims.plot(kind="line", legend=False)

### How do we create a *line plot* to present *statistics* of all of the simulations?
- Be the lawyer

In [None]:
# .quantile with axis = 1 works across the simulations (columns) within each row,
# so every line below summarises all simulations at the same step.
DataFrame({
    label: sims.quantile(q, axis = 1)
    for label, q in [("5th percentile", 0.05), ("median", 0.5), ("95th percentile", 0.95)]
}).plot.line()

# Bar Plot Example w/ Fire Hydrants

In [None]:
# NOTE: this rebinds `df`, replacing the stock-market frame loaded earlier. run_sim() looks up
# df["return"] when it is called, so re-running the simulations after this cell reads this frame.
df = pd.read_csv(filepath_or_buffer="Fire_Hydrants.csv")
df.head(n=5)

In [None]:
# List the column names of the hydrant data.
df.columns

### Let's create a *bar plot* to visualize *colors* of fire hydrants.

In [None]:
# Clean-up step: upper-case the colour names so values that differ only in case are counted together.
clean_df = df.loc[:, "nozzle_color"].str.upper()
# Bar colours are applied by position, i.e. in descending-count order
# (presumably chosen to match the colour each bar stands for -- verify against the data).
ax = clean_df.value_counts().plot(kind="bar", color=["b", "g", "darkorange", "r", "c", "0.5"])
ax.set_ylabel(ylabel="Fire hydrant count")

### Let's create a *bar plot* to visualize *style* of fire hydrants.

In [None]:
# Count hydrants per style, treating style names that differ only in case as the same style.
style_counts = df.loc[:, "Style"].str.upper().value_counts()
style_counts.plot(kind="bar")

In [None]:
# Keep the 12 most common styles (value_counts sorts by descending count) and fold the
# remaining ones into a single "other" bar.
# Copy the slice first so that adding the extra entry can never write through to style_counts.
top12 = style_counts.iloc[:12].copy()
top12["other"] = style_counts.iloc[12:].sum()
ax = top12.plot.bar(color="r")
ax.set_ylabel("Hydrant count")
# The x-axis lists the styles, not counts.
ax.set_xlabel("Hydrant style")

### In what *decade* were *pacers manufactured*?
### Take a peek at the *Style* column data

In [None]:
# Show the first and the last few entries of the Style column.
print(df["Style"].head(), df["Style"].tail(), sep="\n")

### Which *column* gives *year* information?

In [None]:
# Look through the column names for the one that holds a year.
df.columns

### How to get the *decade* for *pacers* and *others*?

In [None]:
# Style next to the manufacturing year, first few rows only.
df.loc[:, ["Style", "year_manufactured"]].head(n=5)

In [None]:
# Split the manufacturing years into hydrants whose Style is exactly "Pacer" and all the rest.
# NOTE(review): this comparison is case-sensitive, whereas the style counts earlier in the
# notebook upper-case Style first -- confirm that no other spelling of "Pacer" occurs in the data.
pacer_years = df.loc[df["Style"] == "Pacer", "year_manufactured"]
other_years = df.loc[df["Style"] != "Pacer", "year_manufactured"]
print(pacer_years.head(), other_years.head(), sep="\n")

In [None]:
# Floor every year to the first year of its decade (e.g. 1987 -> 1980).
pacer_decades = pacer_years.floordiv(10).mul(10)
pacer_decades.head(n=5)

### How to *count the decades* for pacers and others?

In [None]:
# Number of pacers per decade; value_counts puts the most frequent decade first.
pacer_decades = pacer_years.floordiv(10).mul(10).value_counts()
pacer_decades.head(n=5)

### How to convert the *decades* back to *int*?

In [None]:
# Doesn't work because of NaN values: a missing year cannot be converted to int.
# (Kept commented out on purpose so the notebook still runs top to bottom.)
#pacer_decades = (pacer_years // 10 * 10).astype(int).value_counts()
#pacer_decades.head()

In [None]:
# Drop the missing years first; what is left can be floored to decades and converted to int.
pacer_decades = pacer_years.dropna().floordiv(10).mul(10).astype(int).value_counts()
pacer_decades

In [None]:
# Same steps for the non-pacer hydrants: drop missing years, floor to decades, count.
other_decades = other_years.dropna().floordiv(10).mul(10).astype(int).value_counts()
other_decades

### How to put both the pacers and other decade counts Series together?

In [None]:
# Line the two count Series up on their shared decade index, one column per group.
plot_df = DataFrame(dict(pacer=pacer_decades, other=other_decades))
plot_df

### Create a *bar plot* for visualization

In [None]:
# Grouped bars: one pair (pacer / other) per decade.
ax = plot_df.plot(kind="bar")
ax.set_xlabel(xlabel="Decade")
ax.set_ylabel(ylabel="Hydrant Count")

In [None]:
# Same plot, restricted to decades from 1950 onwards.
ax = plot_df.loc[plot_df.index >= 1950].plot(kind="bar")
ax.set_xlabel(xlabel="Decade")
ax.set_ylabel(ylabel="Hydrant Count")

In [None]:
# Decades from 1950 onwards again, with pacer and other stacked into a single bar per decade.
ax = plot_df.loc[plot_df.index >= 1950].plot(kind="bar", stacked=True)
ax.set_xlabel(xlabel="Decade")
ax.set_ylabel(ylabel="Hydrant Count")
# A trailing None keeps the cell from echoing the label object returned above.
None