In [1]:
# Warmup 1: what module allows us to work with random numbers?  Import it.


In [2]:
import pandas as pd
from pandas import DataFrame, Series
import matplotlib
from matplotlib import pyplot as plt
matplotlib.rcParams["font.size"] = 15 


### Lecture 39:  Plotting Applications

**Learning Objectives**

39.1 Use data from a dataframe to generate random simulations

39.2 Apply features of line plots and bar plots to visualize results of data investigations

39.3 Clean Series data with dropNa() and str.upper()

39.4 Make a stacked bar plot

### Using Stock Market Data to Generate Random Simulations

In [5]:
# this file should be in your directory
df = pd.read_csv("sp500.csv") 
df.tail()

Unnamed: 0,year,return
47,2017,1.2183
48,2018,0.9557
49,2019,1.3149
50,2020,1.184
51,2021,1.1059


### Add a column to your dataframe

In [8]:
# add an extra column called "total" to the dataframe
# the value for the "total column will be :  df["return"].cumprod()
# cumprod stands for 'cumulative product'

# ???? =   df["return"].cumprod()

df.head()

Unnamed: 0,year,return,total
0,1970,1.0401,1.0401
1,1971,1.1431,1.188938
2,1972,1.1898,1.414599
3,1973,0.8534,1.207219
4,1974,0.7353,0.887668


### Add another column to your dataframe, and set the indices of each row

In [9]:
starting = 1000
 
df["wealth"] = df["total"] * starting

# set the index to be 'year'....but just run this cell once

df.tail()

Unnamed: 0,year,return,total,wealth
47,2017,1.2183,122.296742,122296.741896
48,2018,0.9557,116.878996,116878.99623
49,2019,1.3149,153.684192,153684.192143
50,2020,1.184,181.962083,181962.083497
51,2021,1.1059,201.231868,201231.86814


### Make a line plot for the wealth column

In [1]:
# make a line plot for just the wealth column
# store the result in the variable ax
ax = None


# set the y=label to be "Total wealth ($)"


## Random.choices can randomly select a number of values from a list

In [3]:
import random
random.choice([3, 5, 9]) # selects one

3

In [12]:
fruits = ['apple', 'banana', 'cherry']

# random.choices( list, k=howmany)
# change the value of k
random.choices(fruits, None ) # make a list of 10 random choices from this list, with replacement

['cherry']

### Simulating stock market returns using past results
### Approach:
1. First, Run one simulation by randomly choosing previous returns, store the result in a Series
2. Then, Run many simulations, storing all the Series in a dataframe

In [None]:
# Step 1  Run one simulation by randomly choosing previous returns
def run_sim(years = 10, starting = 1000):
    # choose random choices from the 'return' column of our dataframe
    rand_returns = random.choices(df["return"], k = None)       # TODO: WHAT IS K? 
    rand_returns = Series(rand_returns).cumprod() * starting
    return rand_returns

run_sim(5)

In [None]:


# Step 2 run many simulations and collect all the results into a dataframe
# the columns will have the names 'sim0', 'sim1', 'sim2', ....
def run_n_sims(n, years = 10, starting = 1000):
    s = pd.DataFrame()
    for i in range(n):
        # this builds a dataframe one column (Series) at a time
        s["sim" + str(i)] = run_sim(???, ???) 
    return s

# small simulation to understand how it works
run_n_sims(3, 5)

In [None]:
# run 20 simulations, each lasting 30 years 
sims = None
sims.tail()

In [None]:
# line plot on a dataframe
# what happens when you set legend to True?
# sims.plot.line(legend = False)

### Use the Quantile function to get a particular percentile
- defaults to 50th percentile
- defaults to calculating on all rows down each column and returning a Series

In [None]:
sims.quantile()


In [None]:
# axis is the keyword argument that tells quantile how to make one calculation
# the default is axis='index'....this df matches the one above
sims.quantile(axis='index')

### axis='columns' computes any calculation *across* the columns
- to change to iterating over each column of a single row, use axis = 'columns'
- axis works for many functions like mean, std, etc.,

In [None]:
run_n_sims(3, 5)
sims

In [None]:
sims.quantile(0.5, axis = 'columns')

In [None]:
# run a sim with 250 simulations, each lasting 10 years
# this is what financial planners do for their clients
sims = run_n_sims(250, 10)

In [None]:
df = DataFrame({   # TODO: CHANGE THE QUANTILE VALUES
    "10th percentile": sims.quantile(0.00, axis = 'columns'),
    "median": sims.quantile(0.00, axis = 'columns'),
    "90th percentile": sims.quantile(0.00, axis = 'columns')
})
df

In [None]:

df.plot.line()

# Bar Plot Example w/ Fire Hydrants

In [None]:
hdf = pd.read_csv("Fire_Hydrants.csv")
hdf.tail()

In [None]:
# grab just the column names
hdf.columns

### Let's create a *bar plot* to visualize *colors* of fire hydrants.

In [None]:
# make a series called counts_series which stores the value_counts() of the "nozzle_color"
color_counts = hdf["nozzle_color"] # TODO:  FIND THE VALUE COUNTS
color_counts # what type is this? 

In [None]:
# TODO:  clean the data ......use str.upper()
# NOTE:  str.upper() is called on a Series....not on a string

color_counts= hdf["nozzle_color"] 
color_counts

In [None]:
# make a horizontal bar plot of counts of colors and have the colors match
# use color list: ["b", "g", "darkorange", "r", "c", "0.5"]
ax = color_counts.plot.barh()
ax.set_ylabel("Fire hydrant count")

### Let's create a *bar plot* to visualize *style* of fire hydrants.

In [None]:
# Do the same thing as we did for the colors but this time for the "Style"
style_counts = None
style_counts.plot.bar()

In [None]:
# let's just grab the top 12 
top12 = style_counts.iloc[:12]

In [None]:


# and them add an index to our Series for the sum of all the "other"  
top12["other"] = style_counts     # TODO: add onto this
ax = top12.plot.bar(color="r")
ax.set_ylabel("Hydrant count")
ax.set_xlabel("Hydrant count")

### Finally:  Let's find out how many Pacers were installed per decade compared to other Styles

In [None]:
# let's look at the "year_manufactured" for just the Pacer "Style" and put that in a series

pacer_years = None # TODO

pacer_years

In [None]:
# then do the same for all the other data
other_years = None  # TODO
other_years

In [None]:
# let's do some arithmetic on this series to round each year down to the previous 10
# 1987 --> 1980,   2003 --> 2000
pacer_decades = (pacer_years // 10 * 10)
pacer_decades

In [None]:
# now let's drop the NaN values, convert to int, and do value counts
pacer_decade_counts = (pacer_years // 10 * 10).dropna().astype(int).value_counts()
pacer_decade_counts

In [None]:
# TODO:  do the same for other_years
other_decade_counts = None
other_decade_counts

In [None]:
# Build a DataFrame from a dictionary of key, Series
plot_df = DataFrame({   # change the lists for each
    "pacer": [],
    "other": []
})
plot_df

In [None]:
# make a bar plot

ax = plot_df.plot.bar()
ax.set_xlabel("Decade")
ax.set_ylabel("Hydrant Count")

In [None]:
# change the x axis for data that we don't care about

ax = plot_df[plot_df.index >= 1950].plot.bar()
ax.set_xlabel("Decade")
ax.set_ylabel("Hydrant Count")

In [None]:
# make a Stacked Bar Chart!  
ax = plot_df[plot_df.index >= 1950].plot.bar(stacked=True)
ax.set_xlabel("Decade")
ax.set_ylabel("Hydrant Count")
None

In [9]:
### BE sure you know how to make a stacked bar chart .... quiz/exam question


**Learning Objectives: Can you do this?**

39.1 Use data from a dataframe to generate random simulations

39.2 Apply features of line plots and bar plots to visualize results of data investigations

39.3 Clean Series data by dropping NaN valuues, and converting to upper

39.4 Make a stacked bar plot