In [None]:
#Import Stuff
import thinkplot
import thinkstats2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss

One very important concept, especially for machine learning, is examining the relationship between two variables.
Take height and weight - as you look at taller people, they probably get heavier, but the relationship isn't exact. Danny DeVito is 4'8" and 194lb, NBA player Lonzo Ball is listed as 6'6" and 190lb! Clearly there's some variation in there.

We can analyze and measure the strength of the relationship between one variable and another, and use that to better understand our data and do things like select variables for machine learning.

A primary tool for examining these relationships is a scatter plot - one variable on the X axis, the other on the Y axis, and a point for each observation. The shape that all these points create illustrates the relationship.

In [None]:
#Load height/weight data. The file uses ";" as the field separator, so it is passed explicitly.
df = pd.read_csv(
    "cardio_train.csv",
    sep=";",
)
#Summary statistics, shown as the cell output.
df.describe()

In [None]:
#Preview the first rows of the loaded data to see what the columns look like.
df.head()

In [None]:
#Age is divided by 365 (days -> years) and rounded to a whole number, so it is more understandable.
#NOTE: this overwrites the "age" column in place, so running the cell a second time divides again.
df["age"] = (df["age"] / 365).round(0)
df.head()

In [None]:
#Create a scatterplot. Unlike a hist, a scatterplot does not need an object built first;
#the two columns are handed straight to the plotting call and mapped as points.
thinkplot.Scatter(df["height"], df["weight"])
#The axis list limits the visible region (presumably [xmin, xmax, ymin, ymax] - confirm against thinkplot).
thinkplot.Show(
    xlabel="Height",
    ylabel="Weight",
    axis=[120, 220, 20, 160],
)

As people get taller, they get heavier. It isn't an extremely clear relationship though. 

In [None]:
#Add jitter: both columns go through thinkstats2.Jitter before plotting, and alpha makes the points translucent.
thinkplot.Scatter(
    thinkstats2.Jitter(df["height"]),
    thinkstats2.Jitter(df["weight"]),
    alpha=0.3,
)
thinkplot.Show(
    xlabel="Height",
    ylabel="Weight",
    axis=[120, 220, 20, 160],
)

In [None]:
#Hexbin of the same jittered height/weight data, shown with the same labels and axis limits as the scatterplots.
thinkplot.HexBin(
    thinkstats2.Jitter(df["height"]),
    thinkstats2.Jitter(df["weight"]),
)
thinkplot.Show(
    xlabel="Height",
    ylabel="Weight",
    axis=[120, 220, 20, 160],
)

All of these visualizations show the same basic thing - the relationship between height and weight. 

We can measure the strength of that relationship; that measure is called correlation.

In [None]:
#Calculate both correlation metrics on height vs. weight.
#The columns are turned into plain Python lists before being handed to thinkstats2.
a = df["height"].tolist()
b = df["weight"].tolist()
#The cell output is the pair (Corr, SpearmanCorr).
(thinkstats2.Corr(a, b), thinkstats2.SpearmanCorr(a, b))
#a, b  <- uncomment to inspect the raw lists

In [None]:
#Cut down data to only numeric values, and filter out some outliers.
#Rows are kept only when height is strictly between 110 and 200; the column list picks the fields used below.
df2 = df.loc[
    (df["height"] > 110) & (df["height"] < 200),
    ["age", "gender", "height", "weight", "ap_lo", "ap_hi"],
]

In [None]:
#Generate pairplot of the trimmed data, colored by the "gender" column.
sns.pairplot(
    data=df2,
    hue="gender",
    dropna=True,
)
plt.show()

In [None]:
#Generate regression plot (pretty scatterplot with regression) of weight against height.
#Columns are referenced by name and looked up in df2 through data=.
sns.regplot(
    data=df2,
    x="height",
    y="weight",
    robust=True,
    n_boot=1000,
    x_jitter=0.2,
    y_jitter=0.2,
    ci=85,
)

We can now load some different data, do some data cleanup, and look at correlations a bit more. 

In [None]:
#Load drug dataset and drop some columns, so the total volume of data is manageable.
drg = pd.read_csv("drug-use-by-age.csv")
#NOTE(review): the 'releiver' spelling presumably matches the CSV header - check the file before "correcting" it.
drg = drg.drop(
    columns=[
        "n",
        'alcohol-frequency',
        'marijuana-frequency',
        'cocaine-frequency',
        'crack-frequency',
        'heroin-frequency',
        'hallucinogen-frequency',
        'inhalant-frequency',
        'pain-releiver-frequency',
        'oxycontin-frequency',
        'tranquilizer-frequency',
        'stimulant-frequency',
        'meth-frequency',
        'sedative-frequency',
        'pain-releiver-use',
        'stimulant-use',
    ]
)
drg.head()

In [None]:
#Generate pairplot of the drug data, using the regression kind for the pairwise panels.
sns.pairplot(
    data=drg,
    kind="reg",
    dropna=True,
)
plt.show()

In [None]:
#Take the data in our dataframe, and change it into a set of correlations:
#coerce every column to numeric (unparseable values are coerced rather than raising), drop "age", then correlate.
drg2 = (
    drg.apply(pd.to_numeric, errors='coerce')
    .drop(columns=["age"])
    .corr()
)
drg2.head()

In [None]:
#Show correlations in a heatmap.
#The figure size is set through rcParams, so it also applies to plots drawn later.
plt.rcParams["figure.figsize"] = (20, 10)
#Boolean upper-triangle mask (np.triu) passed to the heatmap as mask=.
mask = np.triu(np.ones_like(drg2, dtype=bool))
sns.heatmap(
    drg2,
    mask=mask,
    center=0,
    cmap="YlGnBu",
    annot=True,
    linewidths=0.5,
    yticklabels=True,
)
plt.show()

We can use our newfound correlation skills to look at something near and dear to my heart - diabetes!

In this data we have a bunch of common values that are tracked for diabetic people; the Outcome variable indicates whether that person has diabetes. 

Part 1: Load some data and take a preview. Look for correlations visually and calculate some that appear to be correlated numerically. 

In [None]:
#Read Diabetic Data and change to mmol/L, ditch 0 glucose (errors)
#d["Glucose"] = d["Glucose"]/18.018
#My initial dataframe name is just d, filename is diabetes.csv

In [None]:
#Describe data 


In [None]:
#Look for correlations visually

In [None]:
#Calculate some correlations. Use both regular correlation and spearman. 


One thing jumped out there - there are several things with 0 or near 0 values that don't make sense, e.g. Blood Pressure, skin thickness, and BMI

We should clean up our data and then look again. 

NOTE: Do not filter out 0 Insulin people, that is a valid measure. 
Part 2: Refine our examination. This time, add: hue="Outcome" as an argument in the pairplot. 

In [None]:
#Filter out 0s. Rerun graphs. 

In [None]:
#Look for correlations visually with updated data. 
#sns.pairplot(d, dropna=True, kind="reg", hue="Outcome") #The hue colors the diabetic people differently
#plt.show()

That cleans up our data a little. Since this data is about a glucose test, we can detour to examine glucose more closely...

Part 3: Examine the distribution of glucose:

In [None]:
#Look at glucose overall first. 


In [None]:
#Split by if you're diabetic or not. and look at glucose distribution for each group

#NOTE THIS ONE USES A NEW SNS FUNCTION. IT MAY NOT WORK FOR YOU. USE DISTPLOT LIKE THE PREVIOUS EXAMPLE AND ADAPT IT
#OR USE THINKPLOT LIKE ABOVE. THIS DEPENDS ON THE VERSION OF SEABORN THAT IS INSTALLED ON YOUR COMPUTER, NOT ANYTHING YOU'VE DONE.
#sns.displot(data=d, x="Glucose", kde=True, col="Outcome")

Whoa! Way different. Diabetic people are in shambles and normal people are... well... normal. 

We can take our normal people and treat them as normal - quick detour into analytical distributions....

Part 4: Take non diabetic people and create analytical distribution for them. 

In [None]:
#Non-Diabetic people are normal!?! Look at a Normal Probability Plot to assess. 

The normal probability plot looks pretty good. Why don't we make an analytical distribution out of our data?

In [None]:
#Create a normal distribution. Because we have stated "it is normal", mean and std are all that is needed to define it.
#Both are taken from the Glucose column of d_neg.
norm = ss.norm(
    loc=d_neg["Glucose"].mean(),
    scale=d_neg["Glucose"].std(),
)
#Print some stats... Note mean and median.
(norm.cdf(6), norm.median(), norm.mean())

In [None]:
#Create a histogram - draw 10000 random values from our new distribution (blue)
#and overlay the observed Glucose values from d_neg (red) for comparison.
sns.distplot(
    norm.rvs(size=10000),
    color="blue",
)
sns.distplot(
    d_neg["Glucose"],
    color="red",
)
plt.show()

By doing what we just did there, we effectively said - this is the distribution of glucose for non-diabetics, it is defined by this formula. The "ss.norm..." generates that formula if you provide mean and std. We can use that new analytical model to do stuff, like graph....

What may be concerning about that...?

Our distribution is more or less normal, but not perfect. Maybe we should consider the skew, and try to make it more accurate?

Part 5: Try to make more accurate analytic models

In [None]:
#What about muh skew?
#Skewnorm is another distribution - a normal one with some skew. We calculate the skew, and pass it to our distribution with a=VARIABLE_NAME
#Other than that it is the same as above
#norms = ss.skewnorm() <- Fill in the params
#NOTE: the line below needs both `norms` and `a` to exist. `norms` is only assigned in the commented-out template above,
#and an earlier cell in this notebook already uses the name `a` for the height list, so assign the skew value to `a` here first.

norms.cdf(6), norms.median(), norms.mean(), a

We can also just tell the system to figure it out for us....

In [None]:
#Create a model that is fitted to the data, automagically.
#fit() hands back the three fitted values, which are then used to build the frozen distribution.
ae, loce, scalee = ss.skewnorm.fit(d_neg["Glucose"])
snormFit = ss.skewnorm(a=ae, loc=loce, scale=scalee)

In [None]:
#Plot all the distributions together. 

Once we have an analytical distribution that mirrors our data closely enough, we can use it. Some caveats:

- The more data we have, the more likely we are able to make an accurate model. Generalizing from a small amount of data is more risky. 

- The closer the distribution of your data is to the analytical distribution you pick, the better. Many datasets are normal, but not all. 

- Does our sample realistically match the population we are modelling? For us, this data comes from a glucose tolerance test, which is one way that doctors diagnose diabetes. You're given a glass of orange juice, then your blood sugar is measured 2 hours later. If it is really high, you're probably diabetic. Most people who have no medical issues wouldn't do this, so we probably should expect our data to be different than the total population. 

In [None]:
#How many people have a blood glucose in an optimal range (3.9-5.4)?
#We can use all of our distributions to make a cdf based calculation, like the examples before.
#Bounds of the optimal range, lower then upper.
low, high = 3.9, 5.4

Enough with the normies, we can go back to looking at diabetics.

For people with diabetes, inspect for correlations. 

Part 6: Dead pancreas society

In [None]:
#Take diabetic people and look for correlations


In [None]:
#Calculate some correlations

In [None]:
#How many diabetic people fall into that healthy range?

Part 7: Correlation Matrix for diabetics. 

In [None]:
#Create correlation dataframe for d_pos.
#The "Outcome" column is removed before the correlations are computed.
tmp = d_pos.drop("Outcome", axis="columns")
posCor = tmp.corr()
posCor.head()

In [None]:
#Plot correlation matrix as a heatmap.
#The figure size is set through rcParams, so it also applies to plots drawn later.
plt.rcParams["figure.figsize"] = (20, 10)
#Boolean upper-triangle mask (np.triu) passed to the heatmap as mask=.
mask = np.triu(np.ones_like(posCor, dtype=bool))
sns.heatmap(
    posCor,
    mask=mask,
    center=0,
    cmap="YlGnBu",
    annot=True,
    linewidths=0.5,
    yticklabels=True,
)
plt.show()

One last thing that jumps out - the insulin column. If you are (type 1 fully) diabetic, you may have no insulin in your system. If you're type 2, you'd expect insulin to skyrocket. What if we split the data along that line?

Part 8: Segregate Insulin Havers 

In [None]:
#Split into two groups. 


In [None]:
#Look at glucose distribution visually for both

In [None]:
#Plot people for correlations, use hue to separate insulin havers vs not. 