In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline

In [None]:
# Read the dataset (the Height, Weight and Gender columns are used below)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data/hwg.csv')
df.head(n=5)

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
# The first rows were already previewed when the data was loaded, so a
# second `df.head()` here would only repeat that output.
df.describe()

In [None]:
# Boolean flag (True for rows whose Gender is 'Male'), used as the colour value.
df['genc'] = df['Gender'].eq('Male')

# Height vs. weight, one point per row, coloured by the gender flag.
fig, ax = plt.subplots(figsize=(8, 6))
df.plot.scatter(x='Height', y='Weight', c=df['genc'], ax=ax)

## Variances of single variables

In [None]:
# Overlay the two single-variable distributions on shared axes so their
# spreads can be compared directly.
fig, ax = plt.subplots(figsize=(8, 6))
for column in ('Height', 'Weight'):
    df[column].hist(bins=50, ax=ax, alpha=0.5, label=column)
ax.legend()

### The Covariance Matrix

In [None]:
# 2x2 covariance matrix of (Height, Weight): variances on the diagonal,
# the height/weight covariance off the diagonal.
# rowvar=False: each column is a variable, each row an observation.
np.cov(df[['Height', 'Weight']].to_numpy(), rowvar=False)

## Example of Low Covariance - Rotate the Male Data by 90°

### The rotation matrix:
$$ R_{\theta} = \begin{bmatrix}\cos\theta & -\sin\theta \\ \sin\theta & \cos\theta\end{bmatrix}$$

In [None]:
def get_rotation_matrix(theta):
    """Return the 2x2 rotation matrix for an angle ``theta``.

    Parameters
    ----------
    theta : float
        Rotation angle in RADIANS (not degrees).

    Returns
    -------
    numpy.ndarray
        ``[[cos(theta), -sin(theta)], [sin(theta), cos(theta)]]``. Multiplying
        it on the left of a column vector rotates that vector
        counter-clockwise by ``theta``.
    """
    cos_t = np.cos(theta)
    sin_t = np.sin(theta)
    return np.array([[cos_t, -sin_t],
                     [sin_t, cos_t]])

In [None]:
# Standardize Height and Weight: subtract the column means so the cloud is
# centred on the origin (the rotation below is about the origin), then divide
# by the column standard deviations.
X = df[['Height', 'Weight']].to_numpy()
xCent = X - X.mean(axis=0)
xNorm = xCent / xCent.std(axis=0)

# Store the standardized columns back on the dataframe
# (column 0 -> hnorm, column 1 -> wnorm).
df['hnorm'], df['wnorm'] = xNorm.T

In [None]:
# Same scatter as before, but on the standardized height/weight columns.
fig, ax = plt.subplots(figsize=(8, 6))
df.plot.scatter(x='hnorm', y='wnorm', c=df['genc'], ax=ax)

### Question: What's different about this scatterplot?

In [None]:
# Split the standardized data by gender; X holds the male points and Y the
# female points, each as an (n_rows, 2) array of [hnorm, wnorm].
norm_cols = ['hnorm', 'wnorm']

males = df.loc[df['Gender'].eq('Male')]
females = df.loc[df['Gender'].eq('Female')]

X = males[norm_cols].to_numpy()
Y = females[norm_cols].to_numpy()

In [None]:
# Rotate the male points by pi/2 radians (90 degrees). X is (n, 2), so it is
# transposed to (2, n) for the matrix product; xrot is therefore (2, n).
rotator_90 = get_rotation_matrix(np.pi / 2)
xrot = rotator_90 @ X.T

In [None]:
# Rotated male points next to the untouched female points. Equal aspect
# ratio keeps the 90-degree rotation visually undistorted.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(xrot[0], xrot[1], label='Male')
ax.scatter(Y[:, 0], Y[:, 1], label='Female')
ax.set_aspect('equal')
ax.legend()

In [None]:
# Stack the rotated male rows on top of the female rows into one (n, 2) array.
newData = np.vstack((xrot.T, Y))

In [None]:
# Covariance matrix of the combined data.
# rowvar=False: each column is a variable, each row an observation.
np.cov(newData, rowvar=False)

In [None]:
# Note the negative sign on the off-diagonal (covariance) entries above!