In [None]:
#Import Stuff
import thinkplot
import thinkstats2
import pandas as pd
import numpy as np
import scipy

The relationship between discrete and continuous distributions is important because we sometimes need or want to transform our data between the two.

One example from real life is your GPA - when you do assignments/exams, you end up with a raw percentage grade, which is continuous. When this is converted to a letter scale (A, B, etc...), that letter scale is discrete - there's only a selection of possible values (B-, B, B+, etc...). This is binning: we take a continuous variable and create a discrete variable from it.
The other example is when your GPA is calculated - those discrete values are assigned numbers on a 1-4 scale, then averaged together, creating a new continuous value - your GPA.

One place where this is commonly used is lending and credit scores. Having a credit score of 752 vs 764 makes no difference, you're placed in a category of "excellent", "very good", etc...


In [None]:
#Read the loan data set and preview its first rows
DATA_FILE = "loan_data.csv"
df = pd.read_csv(DATA_FILE)
df.head()

In [None]:
#Create credit-score bucket.
#Vectorized binning: every score gets the label of the first threshold it falls below.
#np.select checks the conditions in order, so the cut-offs must stay in ascending order.
fico = df["fico"]
df["grade"] = np.select(
    [fico < 580, fico < 670, fico < 740, fico < 800],
    ["subprime", "fair", "good", "very good"],
    default="excellent",  #800 and above, plus anything that fails every comparison (e.g. NaN)
)
df.head(25)

In [None]:
#In generic cases, we can automate this: NumPy assigns each score a bin number,
#then we group the rows on that number.
bins = np.arange(580, 860, 60) #evenly spaced edges, or...
bins = np.array([580, 670, 740, 800]) #...hand-picked edges (this second assignment is the one used)
bin_number = np.digitize(df["fico"], bins)
for label, members in df.groupby(bin_number):
    lowest_score = members["fico"].min()
    mean_income = np.exp(members["log.annual.inc"]).mean() #undo the log before averaging
    print(label, lowest_score, len(members), mean_income)

In [None]:
#Build the three empirical summaries of the FICO scores that the plots below use.
#(Passing round(df["fico"], -1) instead would round the scores to the nearest 10 first.)
fico_scores = df["fico"]
hist2, pmf2, cdf2 = (
    summarize(fico_scores)
    for summarize in (thinkstats2.Hist, thinkstats2.Pmf, thinkstats2.Cdf)
)

In [None]:
#Create graphs: a 2x3 grid with one (plot function, data) pair per panel, drawn in order.
pdf = thinkstats2.EstimatedPdf(df["fico"]) #See more below
panels = [
    (thinkplot.Hist, hist2),
    (thinkplot.Pmf, pmf2),
    (thinkplot.Cdf, cdf2),
    (thinkstats2.NormalProbabilityPlot, df["fico"]),
    (thinkstats2.NormalProbabilityPlot, np.log(df["fico"])),
    (thinkplot.Pdf, pdf),
]
thinkplot.PrePlot(len(panels), rows=2, cols=3)
for position, (draw, data) in enumerate(panels, start=1):
    #the first panel is drawn straight after PrePlot; later panels are selected explicitly
    if position > 1:
        thinkplot.SubPlot(position)
    draw(data)
thinkplot.Config()

In [None]:
#We can use fancy graphs (more next chapter) to make it pretty
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,5) #makes the default size larger.
#sns.distplot is deprecated, so the histogram and the KDE curve are drawn separately
#with histplot + kdeplot (requires seaborn 0.11 or newer).
#Only the data is required; every other keyword argument is optional styling.
fig, ax = plt.subplots(1,2)
#Left panel: seaborn picks the bin edges. Right panel: reuse the credit-grade edges in `bins`.
for panel, bin_edges in zip(ax, ("auto", bins)):
    #stat="density" scales the bars to total area 1 so they share a y-axis with the KDE
    sns.histplot(x=df["fico"], bins=bin_edges, stat="density", alpha=0.4, label="Data", ax=panel)
    sns.kdeplot(x=df["fico"], color="red", label="KDE", ax=panel)
    panel.legend() #without this the "Data"/"KDE" labels are never displayed
plt.show()

As above - the KDE produces a smoothed function, and approximates the distribution of the histogram.

The smaller those bins get, the closer the approximation. The smoothing factor accounts for 'noise' - e.g. around 750ish.


Skewness. We can visually see the skew - this one is right skewed a bit - the right side is "stretched" out a bit more. We can verify with calculations...

In [None]:
#Skew: print the mean, the median, and Pearson's median skewness, one per line
fico = df["fico"]
skw = thinkstats2.PearsonMedianSkewness(fico)
for statistic in (fico.mean(), fico.median(), skw):
    print(statistic)

We can show it a little more clearly on the graph by adding some reference lines for mean and median.

In [None]:
#Redraw the estimated PDF and mark the mean and median with vertical reference lines
thinkplot.PrePlot(1)
thinkplot.Pdf(pdf)
reference_lines = (
    ("Mean", df["fico"].mean(), "Red"),
    ("Median", df["fico"].median(), "Green"),
)
for label, position, color in reference_lines:
    thinkplot.axvline(position, color=color, label=label)
thinkplot.Config()

We can explore a different variable similarly - income.

We are given the income in log format. Why might that be? Can you investigate a little, and add normal income to the dataframe?

In [None]:
#create a new column - income. This should show the regular income, not log transformed. 

In [None]:
#Challenge - try to create a function that makes the suite of 6 graphs above.

In [None]:
#Graph - If function wasn't created. 

In [None]:
#Create graphs

In [None]:
#Try to use the data - break into groups of marginal tax rates:
#15% on the first $49,020 of taxable income, plus
#20.5% on the next $49,020 of taxable income (on the portion of taxable income over 49,020 up to $98,040), plus
#26% on the next $53,939 of taxable income (on the portion of taxable income over $98,040 up to $151,978), plus
#29% on the next $64,533 of taxable income (on the portion of taxable income over 151,978 up to $216,511), plus
#33% of taxable income over $216,511

In [None]:
#Create the 6 graph set of graphs for original log income

In [None]:
#Use the data to estimate the number of people in each tax bracket

In [None]:
#Use the cdf to estimate the number of people who earn Teacher Money - lowest: 59,357, highest: 101,162

In [None]:
#Create a KDE showing the distribution of income.
#Try both log income, and raw income. 

In [None]:
#Challenge - Create a function that takes an income and returns a tax bill, and marginal tax rate:
def muhTaxes(income, brackets=None):
    """Compute the tax owed on an income under a marginal (progressive) bracket schedule.

    Each bracket's rate applies only to the slice of income that falls inside
    that bracket, so the bill is the sum of the per-bracket slices.

    Args:
        income: Taxable income in dollars. Values below zero are treated as zero.
        brackets: Optional sequence of (lower_threshold, rate) pairs sorted by
            ascending threshold. Defaults to the schedule used in this notebook:
            15% up to $49,020, 20.5% up to $98,040, 26% up to $151,978,
            29% up to $216,511 and 33% above that.

    Returns:
        A tuple (taxbill, margRate): the total tax owed and the rate of the
        highest bracket the income reaches. An income sitting exactly on a
        threshold belongs to the lower bracket.
    """
    if brackets is None:
        brackets = (
            (0, 0.15),
            (49020, 0.205),
            (98040, 0.26),
            (151978, 0.29),
            (216511, 0.33),
        )
    taxable = max(income, 0)
    taxbill = 0.0
    margRate = brackets[0][1]
    for position, (lower, rate) in enumerate(brackets):
        #stop once the income no longer reaches into this bracket
        if position > 0 and taxable <= lower:
            break
        #the top bracket has no upper limit
        upper = brackets[position + 1][0] if position + 1 < len(brackets) else float("inf")
        taxbill += max(min(taxable, upper) - lower, 0) * rate
        margRate = rate
    return taxbill, margRate