# Lesson 3: Demos


## Normality Demo
http://hamelg.blogspot.com/2015/11/python-for-data-analysis-part-21.html

In [None]:
#General imports
import numpy as np
import pandas as pd
import seaborn as sns  # Plots!
import matplotlib.pyplot as plt

# Set the figure size for plotting
sns.mpl.rc("figure", figsize=(10,4))

%matplotlib inline

In [None]:
# Read mtcars.csv from the working directory and preview the first five rows
mtcars = pd.read_csv(filepath_or_buffer="mtcars.csv")
mtcars.head(n=5)

In [1]:
# Summary statistics
# mtcars.mean()
# mtcars.median()
# mtcars.quantile(0.5)
# mtcars.mode()
# mtcars.std()
# mtcars.median()
# mtcars.quantile(0.25)

In [None]:
# Open a new 10x10 figure. Nothing in this cell draws on it yet; the heatmap
# call below is still only a commented-out hint.
plt.figure(figsize=(10, 10))
# Last expression of the cell, so the notebook displays the correlation table.
mtcars.corr()

# sns.heatmap

# Import aliases used in this notebook:
# Pandas -> pd
# Seaborn -> sns

# IPython help lookup for the heatmap function:
# sns.heatmap?

In [None]:
# The ten rows with the lowest mpg, showing only the cyl and mpg columns
(
    mtcars.loc[:, ["cyl", "mpg"]]
    .sort_values("mpg")
    .head(n=10)
)

In [None]:
# To change size:

sns.swarmplot(x="cyl", y="mpg", data=mtcars);
sns.swarmplot?

In [None]:
# Box plot of mpg per cylinder count. The Axes returned by seaborn is kept in
# `ax`; binding it to `plt` would hide the matplotlib.pyplot module, so a later
# call such as plt.figure(...) would fail when a cell is run again.
ax = sns.boxplot(x="cyl", y="mpg", data=mtcars)

# Annotate the parts of the leftmost box at hand-picked data coordinates
ax.text(x=0.25, y=31.25, s="3rd Quartile")
ax.text(x=-.2, y=26.75, s="Median")
ax.text(x=0.25, y=21.5, s="1st Quartile")
ax.text(x=-0.1, y=20, s="Min")
ax.text(x=-0.1, y=34.5, s="Max")
ax.text(x=0.45, y=27.5, s="IQR", rotation=90, size=25);

# Summary values to compare against the plot (the pandas method is `quantile`):
# mtcars[['mpg']].quantile(0.25)
# mtcars[['mpg']].quantile(0.5)
# mtcars[['mpg']].quantile(0.75)
# mtcars[['mpg']].max()
# mtcars[['mpg']].min()

## Analyzing Distributions

Although the mean and median both give us some sense of the center of a distribution, they aren't always the same. The *median* gives us a value that **splits the data into two halves** while the *mean* is a **numeric average,** so extreme values can have a significant impact on the mean. 

In a symmetric distribution, the mean and median will be the same. Let's investigate with a density plot:

In [None]:
# Skewness of the mpg column
mtcars.loc[:, "mpg"].skew()

In [None]:
# Kurtosis of the mpg column
mtcars.loc[:, "mpg"].kurt()

To explore these two measures further, let's create some dummy data and inspect it:

In [None]:
# Four synthetic samples of 100,000 points each, collected into one DataFrame.

# Standard normal draws
norm_data = np.random.normal(size=100000)

# 35,000 normal draws shifted up by 2, followed by 65,000 exponential draws
skewed_data = np.concatenate([
    np.random.normal(size=35000) + 2,
    np.random.exponential(size=65000),
])

# Uniform draws between 0 and 2
uniform_data = np.random.uniform(low=0, high=2, size=100000)

# 50,000 exponential draws followed by 50,000 sign-flipped exponential draws
peaked_data = np.concatenate([
    np.random.exponential(size=50000),
    -np.random.exponential(size=50000),
])

data_df = pd.DataFrame({
    "norm": norm_data,
    "skewed": skewed_data,
    "uniform": uniform_data,
    "peaked": peaked_data,
})

## Types of distributions

In [None]:

# Density plot of the normal sample with its mean, median, kurtosis and
# skewness marked on the same axes.
dist = data_df["norm"]

# Keep the Axes returned by seaborn in `ax`; binding it to `plt` would hide
# the matplotlib.pyplot module from the cells that run afterwards.
ax = sns.kdeplot(dist)

kurtosis = dist.kurt()
skewness = dist.skew()

# Plot black line at mean (colour set explicitly rather than left to the default)
ax.vlines(dist.mean(), ymin=0, ymax=.5, linewidth=2.0, color="black")

# Plot red line at median
ax.vlines(dist.median(), ymin=0, ymax=.5, linewidth=1.0, color="red")

ax.text(x=3, y=0.35, s="Kurtosis " + str(kurtosis))
ax.text(x=3, y=0.3, s="Skewness " + str(skewness));

In [None]:
# Density plot of the peaked sample with its mean, median, kurtosis and
# skewness marked on the same axes.
dist = data_df["peaked"]

# Keep the Axes returned by seaborn in `ax`; binding it to `plt` would hide
# the matplotlib.pyplot module from the cells that run afterwards.
ax = sns.kdeplot(dist)

kurtosis = dist.kurt()
skewness = dist.skew()

# Plot black line at mean (colour set explicitly rather than left to the default)
ax.vlines(dist.mean(), ymin=0, ymax=.5, linewidth=2.0, color="black")

# Plot red line at median
ax.vlines(dist.median(), ymin=0, ymax=.5, linewidth=1.0, color="red")

ax.text(x=3, y=0.35, s="Kurtosis " + str(kurtosis))
ax.text(x=3, y=0.3, s="Skewness " + str(skewness));


In [None]:
# Density plot of the uniform sample with its mean, median, kurtosis and
# skewness marked on the same axes.
dist = data_df["uniform"]

# Keep the Axes returned by seaborn in `ax`; binding it to `plt` would hide
# the matplotlib.pyplot module from the cells that run afterwards.
ax = sns.kdeplot(dist)

kurtosis = dist.kurt()
skewness = dist.skew()

# Plot black line at mean (colour set explicitly rather than left to the default)
ax.vlines(dist.mean(), ymin=0, ymax=.6, linewidth=2.0, color="black")

# Plot red line at median
ax.vlines(dist.median(), ymin=0, ymax=.6, linewidth=1.0, color="red")

ax.text(x=2, y=0.35, s="Kurtosis " + str(kurtosis))
ax.text(x=2, y=0.3, s="Skewness " + str(skewness));

In [None]:
# Density plot of the skewed sample with its mean, median, kurtosis and
# skewness marked on the same axes.
dist = data_df["skewed"]

# Keep the Axes returned by seaborn in `ax`; binding it to `plt` would hide
# the matplotlib.pyplot module from the cells that run afterwards.
ax = sns.kdeplot(dist)

kurtosis = dist.kurt()
skewness = dist.skew()

# Plot black line at mean (colour set explicitly rather than left to the default)
ax.vlines(dist.mean(), ymin=0, ymax=.6, linewidth=2.0, color="black")

# Plot red line at median
ax.vlines(dist.median(), ymin=0, ymax=.6, linewidth=1.0, color="red")

ax.text(x=2, y=0.35, s="Kurtosis " + str(kurtosis))
ax.text(x=2, y=0.3, s="Skewness " + str(skewness));

In [None]:
# Median of the skewed sample
data_df.loc[:, "skewed"].median()

In [None]:
# A small normal sample (50 points) plus 3 outliers drawn around 15, to show
# how a few extreme values move the mean relative to the median.
# Note: this rebinds `norm_data`, replacing the larger sample created earlier.
norm_data = np.random.normal(size=50)
outliers = np.random.normal(15, size=3)
combined_data = pd.DataFrame({"combined": np.concatenate((norm_data, outliers), axis=0)})
comb = combined_data["combined"]

# Annotate the Axes this plot was actually drawn on. The original cell used
# whatever `plt` happened to be bound to, which after the earlier cells is the
# Axes of a previous figure rather than this one.
ax = sns.kdeplot(comb)

kurtosis = comb.kurt()
skewness = comb.skew()

# Plot black line at mean
ax.vlines(comb.mean(), ymin=0, ymax=.45, linewidth=2.0, color="black")

# Plot red line at median
ax.vlines(comb.median(), ymin=0, ymax=.45, linewidth=1.0, color="red")

ax.text(x=2, y=0.2, s="Kurtosis " + str(kurtosis))
ax.text(x=2, y=0.15, s="Skewness " + str(skewness));

Since the median tends to resist the effects of skewness and outliers, it is known as a "robust" statistic. 

The median generally gives a better sense of the typical value in a distribution with significant skew or outliers.

### All together

In [None]:
plt = sns.kdeplot(data_df["uniform"])
sns.kdeplot(data_df["peaked"])
sns.kdeplot(data_df["skewed"])
sns.kdeplot?

In [None]:
# Histogram/density of an exponential sample with its mean and median marked,
# followed by its summary statistics.
exponential_dist = np.random.exponential(size=100000)

exp_data = pd.DataFrame(exponential_dist)
exp_col = exp_data[0]  # the single column, as a Series

# Plot the exponential sample itself, so the marked mean/median and the printed
# statistics describe the data on screen (the original cell plotted
# `skewed_data` here). The Axes goes in `ax` so `plt` stays the pyplot module.
ax = sns.distplot(exponential_dist)

# Plot black line at mean
ax.vlines(exp_col.mean(), ymin=0, ymax=1, linewidth=2.0, color="black")

# Plot red line at median
ax.vlines(exp_col.median(), ymin=0, ymax=1, linewidth=1.0, color="red");

# Single-argument print(...) behaves the same on Python 2 and Python 3; the
# spacing reproduces what the original print statements emitted.
print("MEAN:  %s" % exp_col.mean())
print("STD:  %s" % exp_col.std())
print("Kurt: %s" % exp_col.kurt())
print("Skew:  %s" % exp_col.skew())

### Skewness and Kurtosis
*Skewness* measures the **skew or asymmetry of a distribution** while *Kurtosis* measures the **"peakedness" of a distribution**. 

We won't go into the exact calculations behind these, but they are essentially just statistics that take the idea of variance a step further: while variance involves squaring deviations from the mean, skewness involves cubing deviations from the mean, and kurtosis involves raising deviations from the mean to the 4th power.

Pandas has built-in functions for checking skewness and kurtosis: `df.skew()` and `df.kurt()`, respectively.

### Skewness

Now let's check the skewness of each of these distributions. 

Since skewness measures asymmetry, we'd expect to see low skewness for all of the distributions except the skewed one, because all the others are roughly symmetric:

In [None]:
# Skewness of every column (computed down the rows)
data_df.skew(axis=0)

### Kurtosis

Now let's check kurtosis. Since kurtosis measures peakedness, we'd expect the flat (uniform) distribution to have low kurtosis while the distributions with sharper peaks should have higher kurtosis.

In [None]:
# Kurtosis of every column (computed down the rows)
data_df.kurt(axis=0)

As we can see from the output, the normally distributed data has a kurtosis near zero, the flat distribution has negative kurtosis, and the two pointier distributions have positive kurtosis.

### Post-class exercises

- http://chrisalbon.com/python/pandas_with_seaborn.html
- https://stanford.edu/~mwaskom/software/seaborn/tutorial/distributions.html
- https://stanford.edu/~mwaskom/software/seaborn/tutorial/categorical.html

## Class Variable Demo

### Class/Dummy Variables
We want to represent categorical variables numerically, but we can't simply code them as 0=rural, 1=suburban, 2=urban because that would imply an **ordered relationship** between suburban and urban (suggesting that urban is somehow "twice" the suburban category, which doesn't make sense).

Why do we only need **two dummy variables, not three?** Because two dummies capture all of the information about the Area feature and implicitly define rural as the reference level.

In general, if you have a categorical feature with k levels, you create k-1 dummy variables.


```python
# Create three dummy variables using get_dummies, then exclude the first dummy column
my_categorical_var_dummies = pd.get_dummies(my_categorical_var, prefix='Area').iloc[:, 1:]
```

In [None]:
# Download the Advertising CSV into a DataFrame, using the file's first column
# as the row index, then preview the first five rows
data = pd.read_csv(
    'http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv',
    index_col=0,
)
data.head(n=5)

### Handling Categorical Predictors with Two Categories

Up to now, all of our predictors have been numeric. What if one of our predictors was categorical?

Let's create a new feature called "Size," and randomly assign observations to be small or large:

In [None]:
# Fix the seed so the random assignment is repeatable
np.random.seed(12345)

# One uniform draw per row; draws above 0.5 (roughly half) mark a row as large
nums = np.random.rand(len(data))
mask_large = nums > 0.5

# Label each row in a single pass: 'large' where the mask is set, else 'small'
data['Size'] = np.where(mask_large, 'large', 'small')
data.head()

For scikit-learn, we need to represent all data numerically. 

If the feature only has two categories, we can simply create a dummy variable that represents the categories as a binary value.

In [None]:
# Encode Size as a binary IsLarge column: small -> 0, large -> 1
size_codes = {'small': 0, 'large': 1}
data['IsLarge'] = data['Size'].map(size_codes)
data.head()

### Handling Categorical Predictors with More than Two Categories

Let's create a new feature called Area, and randomly assign observations to be rural, suburban, or urban:

In [None]:
# Fix the seed so the random assignment is repeatable
np.random.seed(123456)

# One uniform draw per row picks the area: strictly between 0.33 and 0.66 is
# suburban, above 0.66 is urban, and everything else stays rural
nums = np.random.rand(len(data))
mask_suburban = (nums > 0.33) & (nums < 0.66)
mask_urban = nums > 0.66
data['Area'] = np.select(
    [mask_suburban, mask_urban],
    ['suburban', 'urban'],
    default='rural',
)
data.head()

We have to represent Area numerically, but we can't simply code it as 0=rural, 1=suburban, 2=urban because that would imply an ordered relationship between suburban and urban (and thus urban is somehow "twice" the suburban category).

Instead, we create another dummy variable:

In [None]:
# Build one indicator column per Area level, then drop the first column so
# that level becomes the implicit reference
area_dummies = pd.get_dummies(data['Area'], prefix='Area')
area_dummies = area_dummies.iloc[:, 1:]

# Attach the indicator columns side by side with the original columns
# (axis=1 joins along columns; axis=0 would stack rows)
data = pd.concat([data, area_dummies], axis=1)
data.head()