In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from data_cleaner import clean_df

# Hive mind activation
![](https://media.giphy.com/media/26gR0BZV9XBavwbyE/giphy.gif)

Documenting best practices:

In groups: [article 1](https://www.jackhagley.com/What-s-the-difference-between-an-Infographic-and-a-Data-Visualisation), [article 2](https://thoughtbot.com/blog/analyzing-minards-visualization-of-napoleons-1812-march), [article 3](http://dataremixed.com/2016/04/the-design-of-everyday-visualizations/), [article 4](https://visme.co/blog/data-storytelling-tips/), [article 5](https://www.tableau.com/learn/articles/best-beautiful-data-visualization-examples)

To fill in: [Best practices doc](https://docs.google.com/document/d/1Jg_Nwa8K9uCMppSPtmzpHvmJ-dLkRLOfyXJMNxNSoJo/edit#) 

In [None]:
# Anscombe Quartet

In [None]:
# Switch seaborn to its "ticks" theme
sns.set(style="ticks")

# Anscombe's quartet is fetched through seaborn's example-dataset loader
df = sns.load_dataset("anscombe")

# One panel per dataset (two per row), each with its own fitted regression line
sns.lmplot(
    data=df,
    x="x",
    y="y",
    col="dataset",
    hue="dataset",
    col_wrap=2,
    height=4,
    palette="muted",
    ci=None,  # no confidence band around the fit
    scatter_kws={"s": 50, "alpha": 1},
)

In [None]:
# Let's reimport the animal and heart data we have been working with
# along with our cleaner.

In [None]:
# Source data: https://data.austintexas.gov/api/views/9t4d-g238/rows.csv?accessType=DOWNLOAD
# Read animals.csv (presumably a local download of the URL above) and pass it
# through the project's clean_df helper.
# NOTE(review): later cells use the 'DateTime', 'Animal Type' and 'days_at_outcome'
# columns — confirm clean_df provides them.
animal_df = clean_df(pd.read_csv('animals.csv'))
animal_df.head()

In [None]:
# Source data: https://www.kaggle.com/ronitf/heart-disease-uci
# Read heart.csv as-is (no cleaning helper is applied to this frame).
heart_df = pd.read_csv('heart.csv')
heart_df.head()

In [None]:
# Briefly back to our task of plotting our data by month of outcome.

In [None]:
# Pull the month number out of each DateTime entry
animal_df['months'] = animal_df.DateTime.apply(lambda x: x.month)

# Sorted distinct months and the number of rows in each
# (values/counts are reused by the seaborn barplot cell at the end of the notebook)
values, counts = np.unique(animal_df.months, return_counts=True)

# Keep the returned bar container so individual bars can be restyled
animal_bar = plt.bar(x=values, height=counts)
# animal_bar[6].set_color('r')  # Let's mess around with the color
plt.xlabel('Month')
plt.ylabel('Outcomes')
plt.title('Outcomes by month in Austin animal shelter')
plt.show()

### Histogram Revisited
Histograms are like barplots in the sense that they describe __one-dimensional__ data. A histogram divides the variable into bins, counts the number of observations in each bin, and shows the bins on the x-axis and the frequency on the y-axis. It is used for visualizing __continuous__ variables. <br>

From the documentation: compute and draw the histogram of x. The return value is a tuple (__n, bins, patches__) or ([n0, n1, ...], bins, [patches0, patches1,...]) if the input contains multiple data.

Comparison of barplots and histogram
<img src='img/hist.png'>

In [None]:
# Knowledge check? 
# How can I make an approximately equivalent plot of the months plot
# using one of our histogram plots: df.hist(), plt.hist(), sns.distplot()


In [None]:
# A histogram of uniformly drawn random integers comes out flat — not very interesting!
numbers = np.random.randint(low=1, high=1000, size=1000)
plt.hist(numbers, bins=10)
plt.show()

In [None]:
# 1000 draws from a normal distribution with mean 0 and standard deviation 1
numbers = np.random.normal(loc=0, scale=1, size=1000)
plt.hist(numbers, bins=10)  # if we change the bin numbers, it will look smoother
plt.show()

In [None]:
# Seaborn take on the histogram of `numbers`, using 100 bins.
# NOTE(review): distplot is deprecated in newer seaborn releases in favour of
# histplot/displot — confirm which seaborn version this notebook targets.
sns.distplot(numbers, bins = 100)

In [None]:
# Extra: if I take the mean of random subsets of random numbers
# what would the distribution look like?
# let's try!

# Starts empty; presumably to be filled with the subset means during the exercise
mean_list = []



In [None]:
# Split the shelter data by animal type so each can be drawn as its own histogram
dog_df = animal_df[animal_df['Animal Type'] == 'Dog']
cat_df = animal_df[animal_df['Animal Type'] == 'Cat']
bird_df = animal_df[animal_df['Animal Type'] == 'Bird']

# plt.hist returns (n, bins, patches); keep the cats' result so a single bin
# can be recoloured with the commented-out setp line below
n, bins, patches = plt.hist(cat_df['days_at_outcome'], bins=20, alpha=.5, label='Cat')
plt.hist(bird_df['days_at_outcome'], bins=30, label='Bird')
# plt.setp(patches[1], 'facecolor', 'red') # set a patchcolor to show a specific bin
plt.hist(dog_df['days_at_outcome'], bins=20, label='Dog') # alpha can be toggled to show overlapping data
plt.title("Distributions Ages of Dogs, Cats, and Birds\n at the Austin Animal Shelter")
plt.xlabel('Days old at shelter at time of outcome')
plt.ylabel('Count')
plt.legend()  # three series share one axes, so say which colour is which
plt.show()

In [None]:
## let's look at the Seaborn equivalent

In [None]:
# Seaborn version of the bird histogram: 30 bins, no KDE curve, with a rug
sns.distplot(bird_df['days_at_outcome'], bins = 30, 
             kde = False,  # toggle KDE to include kernel density line 
             rug = True) # toggle rug to show ticks of elements in bins
plt.title("Days to outcome of birds\nin the Austin shelter")
plt.show()

In [None]:
## Another way with Pandas
# DataFrame.hist() draws a separate histogram for each numeric column of the frame
animal_df.hist()

In [None]:
# Per-column histograms of the heart data on a 10x10 figure
heart_df.hist(figsize=(10,10))

plt.show()

### 2. Boxplots
Boxplots are a very informative type of visualization tool because they allow us to see the distribution of the data. 
<img src='img/boxplot.png'>

In [None]:
# days_at_outcome per animal type, with outlier points hidden
sns.boxplot(
    data=animal_df,
    x='Animal Type',
    y='days_at_outcome',
    showfliers=False,
)

In [None]:
# What does this say about our distributions?

In [None]:
# One panel per animal type, side by side
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, sharey=False)  # toggling sharey effectively changes the scale
dog_df = animal_df[animal_df['Animal Type'] == 'Dog']
cat_df = animal_df[animal_df['Animal Type'] == 'Cat']
bird_df = animal_df[animal_df['Animal Type'] == 'Bird']

# x and y are passed by keyword: recent seaborn releases no longer accept them positionally
sns.boxplot(x=dog_df['Animal Type'], y=dog_df['days_at_outcome'], ax=ax1, showfliers=True)
sns.boxplot(x=cat_df['Animal Type'], y=cat_df['days_at_outcome'],
            ax=ax2, showfliers=False, color='red')
sns.boxplot(x=bird_df['Animal Type'], y=bird_df['days_at_outcome'],
            ax=ax3, showfliers=False, color='green')  # toggle showfliers to not show outliers
plt.tight_layout()  # One way to fix the axis label problem

In [None]:
## Another way with Pandas
# DataFrame.boxplot() draws a box for each numeric column on one shared axes
animal_df.boxplot()

In [None]:
# Box plot of the heart_df columns on a single axes
heart_df.boxplot()
plt.xticks(rotation=90)
# Fit the layout only after rotating the tick labels so the rotated text is accounted for
plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler # we will get to these libraries

# Scale the data to better show the relationships, keeping the original column names
ss = StandardScaler()
ss_heart = pd.DataFrame(ss.fit_transform(heart_df), columns=heart_df.columns)

ss_heart.boxplot()
plt.xticks(rotation=90)
plt.title('Scaled heart_df boxplots')
plt.show()

# Scatter plots!

![cats scattering](http://giphygifs.s3.amazonaws.com/media/H73nLSMPixSXC/giphy.gif)
### 3. Scatterplot - visualizing two dimensional data
Scatterplots are usually used for visualizing two-dimensional data (observations with two variables). They allow us to examine the relationship between two variables, which is why a scatterplot is sometimes called a correlation plot. 

In [None]:
# Sample sin(x) at evenly spaced points between 0 and 10
x = np.linspace(start=0, stop=10)
y = np.sin(x)

plt.scatter(x=x, y=y, label="Function: sin(x)")
plt.title('Scatter Plot in Matplotlib')
plt.legend()

In [None]:
# Paired height/weight observations for eyeballing their correlation
height = [
    63, 62, 60, 63, 64, 65, 68,
    67, 64, 71, 72, 70, 73,
]
weight = [
    120, 115, 114, 119, 125, 130, 135,
    140, 128, 140, 150, 165, 180,
]
plt.scatter(x=height, y=weight, color='r', marker='o')  # toggle marker to change shape (+,^, o)

In [None]:
# Same height/weight data drawn with seaborn. x and y go by keyword because
# recent seaborn releases no longer accept positional data arguments.
sns.scatterplot(x=height, y=weight, marker='+')  # similar notation to toggle markers in Seaborn
plt.title('Seaborn Scatter: Height vs Weight')
plt.xlabel('Height')
plt.ylabel('Weight')
plt.show()

In [None]:
# Scatter plus fitted regression line; x/y are passed by keyword because
# recent seaborn releases no longer accept them positionally.
sns.regplot(x=heart_df['chol'], y=heart_df['age'], ci=None)  # toggle ci to get rid of the 95% confidence interval

In [None]:
# lmplot is very similar, but try running type on the object.
# Column names and the frame are passed by keyword; recent seaborn releases
# no longer accept them positionally.
sns.lmplot(x='chol', y='age', data=heart_df)

# Pairplot to the rescue!
![](https://media.giphy.com/media/FQVZk2elXU14Q/giphy.gif)

In [None]:
# Pairplot is a great way to quickly get a glimpse of the relationships
# between variables.
# Limit to the first five columns to keep the grid small
# (first_five is reused by the correlation cells below).
first_five = list(heart_df)[0:5]
sns.pairplot(heart_df[first_five])

# Correlation and Heatmaps!
![](https://media.giphy.com/media/3oKIPxua1XGPonJm3C/giphy.gif)

In [None]:
# Whereas pairplots can be somewhat subjective, correlations can give 
# us a numerical representation.
# Pairwise correlation matrix of the first_five columns selected in the pairplot cell
heart_df[first_five].corr()

In [None]:
# We can then use the heatmap plot to give a different type of correlation
# visualization
# The same first_five correlation matrix, rendered as a coloured grid
sns.heatmap(heart_df[first_five].corr())

In [None]:
# Quick reminder of the heart_df columns before the layering examples
heart_df.head()

# Layering

![cake layers](https://media.giphy.com/media/XMgCFjsCSARxK/giphy.gif)

In [None]:
# Layer one scatter per 'sex' value on the same axes so each group gets its own colour.
# Iterate the groupby directly for (key, sub-frame) pairs rather than going through
# .indices + .loc, which mixes positional indices with label-based lookup and is only
# correct when the frame has the default 0..n-1 index.
for key, group in heart_df.groupby('sex'):
    plt.scatter(group['age'], group['trestbps'], label=key)

plt.legend()

In [None]:
# Seaborn version of the layered scatter: one call per 'sex' group on the same axes.
# Iterating the groupby yields (key, sub-frame) pairs directly, avoiding the
# positional-index/.loc mismatch; x and y go by keyword because recent seaborn
# releases no longer accept them positionally.
for key, group in heart_df.groupby('sex'):
    sns.scatterplot(x=group['age'], y=group['trestbps'], label=key)

plt.legend()
plt.title('Trestbps vs Age\n in Male and Female Participants')
plt.show()

In [None]:
# Bucket age into three equal-width bins and layer one scatter per bucket
heart_df['age_bins'] = pd.cut(heart_df['age'], bins=3)

# Group by the column *name* (a bare age_bins identifier is undefined here);
# observed=True skips any bin that ends up with no rows.
for key, group in heart_df.groupby('age_bins', observed=True):
    plt.scatter(group['age'], group['trestbps'], label=str(key))

plt.legend()

# Seaborn and fancy color warning

In [None]:
# values/counts come from the outcomes-by-month cell earlier in the notebook.
# x and y are passed by keyword because recent seaborn releases no longer
# accept them positionally.
sns.barplot(x=values, y=counts)  # this looks very nice, but, we want to be intentional with our colors.


In [None]:
# Finally, use the help function if you want a description 
# of a method's capabilities
# (here: print the documentation for matplotlib's plt.boxplot)
help(plt.boxplot)