In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Apply the seaborn theme first: sns.set() rewrites matplotlib rcParams
# (lines.linewidth among them), so the notebook-specific overrides have to
# come afterwards or they are silently discarded.
sns.set()
plt.rcParams['figure.figsize'] = (4, 4)
plt.rcParams['figure.dpi'] = 150
plt.rcParams['lines.linewidth'] = 3

### Manhattan Taxis

In [None]:
def taxi_scatter(t, prefix='pickup'):
    """Scatter-plot one kind of trip location on the current matplotlib axes.

    t: table indexable by column name, holding `<prefix>_lon` and
       `<prefix>_lat` columns.
    prefix: which pair of columns to draw, e.g. 'pickup' or 'dropoff'.

    The axes ticks, labels and title are (re)set on every call, so when
    several calls share one figure the title reflects the last call.
    """
    # label=prefix lets a later plt.legend() distinguish overlaid layers
    plt.scatter(t[prefix + '_lon'], t[prefix + '_lat'], s=2, alpha=0.2,
                label=prefix)
    # Fixed ticks keep every map in the notebook on the same grid
    plt.xticks(np.arange(-74.02, -73.921, 0.04))
    plt.yticks(np.arange(40.7, 40.9, 0.04))
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title(prefix + ' locations')
    
# Load the trip table from a CSV in the working directory.
taxi = pd.read_csv('manhattan_taxi.csv')
# Tall, narrow figure for the map; taxi_scatter draws the pickup_lon / pickup_lat
# columns by default.
plt.figure(figsize=(3, 8))
taxi_scatter(taxi)

In [None]:
# Overlay drop-offs and pick-ups on a single figure so they can be compared.
plt.figure(figsize=(3, 8))
taxi_scatter(taxi, 'dropoff')
taxi_scatter(taxi, 'pickup')
# taxi_scatter titles the axes after whichever layer was drawn last; give the
# combined plot a title that covers both layers.
plt.title('dropoff and pickup locations')
# Name the two scatter layers explicitly (in draw order) so the legend is
# never empty.
plt.legend(['dropoff', 'pickup']);

Which are reasonable explanations of the data based on the plot above?
- (a) pick-up and drop-off locations are like random samples from the **same** distribution
- (b) pick-up and drop-off locations are like random samples from **different** distributions
- (c) there are **fewer** pick-ups than drop-offs
- (d) there are **more** pick-ups than drop-offs
- (e) the total variance of drop-off location is **higher** than pick-up location
- (f) the total variance of drop-off location is **lower** than pick-up location

In [None]:
# Flag every trip whose pickup latitude is below 40.75 and store the flag as a
# boolean column on the table.
taxi.loc[:, 'south_of_40.75'] = taxi['pickup_lat'] < 40.75
plt.figure(figsize=(3, 8))
# Draw the flagged trips and then the remaining ones as two separate layers.
taxi_scatter(taxi[taxi['south_of_40.75']])
taxi_scatter(taxi[~taxi['south_of_40.75']])

### Qcut

In [None]:
# Draw 100 standard-normal samples from a privately seeded generator so the
# quantile bins, histogram and box plot below come out the same on every
# Restart & Run All (and the global NumPy random state is left alone).
r = np.random.RandomState(42).normal(0, 1, 100)
r

In [None]:
# Cut r into 5 quantile-based bins; each sample is labelled with the interval it falls in
pd.qcut(r, 5)

In [None]:
# Same 5-way quantile cut, but with the bins labelled 1 (lowest) through 5 (highest)
bins = pd.qcut(r, 5, labels=[1, 2, 3, 4, 5])
bins

In [None]:
# Histogram of the raw samples using 30 equal-width bins (unlike the quantile bins above)
plt.hist(r, bins=30);

In [None]:
# Number of samples that landed in each quantile bin
bins.value_counts()

Discussion question: what will the following plot look like?

In [None]:
# One box per quantile bin (x) showing the spread of the samples inside it (y).
# x and y are passed by keyword: seaborn 0.11 deprecated passing them
# positionally, and later releases accept only `data` as a positional argument.
sns.boxplot(x=bins, y=r);

In [None]:
# Split the trips at the median pickup latitude: the lower half is labelled
# 'south' and the upper half 'north'.
taxi.loc[:, 'ns'] = pd.qcut(taxi['pickup_lat'], q=2, labels=['south', 'north'])
plt.figure(figsize=(3, 8))
# North is drawn first, then south, each as its own scatter layer.
for half in ['north', 'south']:
    taxi_scatter(taxi[taxi['ns'] == half])

In [None]:
# Cut the pickups into `groups` latitude bands holding (roughly) equally many
# trips each, labelled 0 .. groups-1 from south to north.
groups = 5
# Plain column assignment replaces any existing 'group' column outright.
# `.loc[:, 'group'] = ...` would instead try to write into the stored
# categorical in place, which breaks when this cell is re-run after group_by
# has stored a categorical with a different number of categories.
taxi['group'] = pd.qcut(taxi['pickup_lat'], groups, labels=np.arange(groups))
plt.figure(figsize=(3, 8))
# One scatter layer per band
for group in np.arange(groups):
    taxi_scatter(taxi[taxi['group'] == group])

In [None]:
def group_by(values, num_regions):
    """Split the trips into `num_regions` equal-count groups and map each one.

    values: one number per row of the global `taxi` table (array or Series);
            rows are grouped by the quantile bin their value falls in.
    num_regions: how many quantile bins to cut `values` into.

    Side effects: overwrites the 'group' column of the global `taxi` table
    with labels 0 .. num_regions-1 and opens a new figure containing one
    scatter layer per group.
    """
    # Plain column assignment replaces the column outright. `.loc[:, 'group']`
    # would try to write into the existing categorical column in place, which
    # is not possible when an earlier call used a different num_regions (the
    # category sets differ).
    taxi['group'] = pd.qcut(values, num_regions,
                            labels=np.arange(num_regions))

    plt.figure(figsize=(3, 8))
    for group in np.arange(num_regions):
        taxi_scatter(taxi[taxi['group'] == group])
        
# Five pickup-latitude bands with roughly equal numbers of trips in each
group_by(taxi['pickup_lat'], 5)

### PCA for region splitting

In [None]:
# Pickup coordinates as a 2-column array: column 0 is longitude, column 1 latitude.
D = taxi[['pickup_lon', 'pickup_lat']].values
# Centre both columns on zero by subtracting the per-column mean.
X = D - D.mean(axis=0)

plt.figure(figsize=(3, 8))
# Unpack the two columns of X as the horizontal and vertical coordinates.
plt.scatter(*X.T, s=2, alpha=0.2);

In [None]:
from ipywidgets import interact

def group_direction(slope):
    """Group the trips by their projection onto the direction (1, slope).

    Projects every row of the global `X` onto the vector [1, slope], asks
    group_by for a three-region split of those projections, and then draws a
    histogram of the projected values in a separate small figure.
    """
    weights = np.array([1, slope])
    direction = X @ weights
    group_by(direction, 3)
    plt.figure(figsize=(3, 2))
    plt.hist(direction)
    
# Slider for slope from 1 to 3 in steps of 0.2; group_direction is re-run on every change
interact(group_direction, slope=(1, 3, 0.2));

In [None]:
# Find the first principal component.
# X (the mean-centred pickups) is divided by sqrt(number of rows) before the
# SVD; the rows of vt are the principal directions, so column 0 of vt.T is the
# first one.
u, s, vt = np.linalg.svd(X / np.sqrt(D.shape[0]), 
                         full_matrices=False)
vt.T[:, 0]

In [None]:
# Coordinate of every pickup along the first principal direction, and its distribution
first_pc = X @ vt.T[:, 0]
plt.hist(first_pc);

In [None]:
# Three equal-count regions along the first principal component
group_by(first_pc, 3)

In [None]:
# Same split, but with seven equal-count regions
group_by(first_pc, 7)

Discussion question: what will the following outputs look like?

In [None]:
# Not too difficult
# Three equal-count regions along the second principal direction (column 1 of vt.T)
group_by(X @ vt.T[:, 1], 3)

In [None]:
# More difficult!
# v encodes, for every pickup, the sign pattern of its two principal-component
# scores as a number 0..3: add 1 when the first score is positive and 2 when
# the second score is positive.
v = 2 * (X @ vt.T[:, 1] > 0) + (X @ vt.T[:, 0] > 0)
plt.figure(figsize=(3, 8))
# One scatter layer per sign pattern
for group in range(4):
    taxi_scatter(taxi[v == group])

Discussion question: Why aren't the two lines perpendicular?

In [None]:
# Recompute the four sign-pattern groups (1 for a positive first-PC score plus
# 2 for a positive second-PC score) and draw them on a wider 5 x 8 figure.
v = 2 * (X @ vt.T[:, 1] > 0) + (X @ vt.T[:, 0] > 0)
plt.figure(figsize=(5, 8))
for group in range(4):
    taxi_scatter(taxi[v == group])

### Feature engineering

In [None]:
# Toy data as column vectors: four training values and two test values.
train = np.array([[4., 2., 6., 4.]]).T
test = np.array([[8., 6.]]).T

# The centring constant is computed from the training values only and is then
# subtracted from both sets.
train_mean = train.mean()
print('train centered:', train - train_mean, sep='\n')
print('test centered:', test - train_mean, sep='\n')

In [None]:
from sklearn import preprocessing

# Centre without rescaling (with_std=False): the scaler is fitted on train and
# the same fitted scaler then transforms both train and test.
scaler = preprocessing.StandardScaler(with_std=False).fit(train)
print('train centered:', scaler.transform(train), sep='\n')
print('test centered:', scaler.transform(test), sep='\n')

Discussion question: is there anything wrong with the way train & test are used in this feature engineering?

In [None]:
import sklearn.model_selection

# Hold out 20% of the trips (fixed seed for a reproducible split).
train, test = sklearn.model_selection.train_test_split(taxi, test_size=0.2,
                                                       random_state=42)
# Work on independent copies so that adding columns below does not trigger
# SettingWithCopyWarning on frames derived from `taxi`.
train, test = train.copy(), test.copy()

D_train = train[['pickup_lon', 'pickup_lat']].values
D_test  =  test[['pickup_lon', 'pickup_lat']].values
# Centre both sets with the mean of the training set only.
train_means = np.mean(D_train, axis=0)
X_train = D_train - train_means
X_test  =  D_test - train_means
groups = [0, 1, 2]

# Project onto the direction (1, 2.6) and cut each set into three quantile bins.
# Plain column assignment replaces the 'group' column inherited from `taxi`
# regardless of which categories it currently holds.
# NOTE(review): test is binned with its own quantiles rather than the training
# cut points; this looks deliberate (the next two cells compare the resulting
# intervals) -- confirm before changing.
train['group'] = pd.qcut(X_train @ np.array([1, 2.6]), 3, labels=groups)
test['group']  = pd.qcut(X_test  @ np.array([1, 2.6]), 3, labels=groups)

plt.figure(figsize=(3, 8))
# For every group draw the training trips and then the test trips as separate layers.
for group in groups:
    taxi_scatter(train[train['group'] == group])
    taxi_scatter(test[test['group'] == group])

In [None]:
# Interval boundaries and counts of the three quantile bins computed on the training projections
pd.qcut(X_train @ np.array([1, 2.6]), 3).value_counts()

In [None]:
# The same for the test projections, whose quantile boundaries are computed from the test set itself
pd.qcut(X_test @ np.array([1, 2.6]), 3).value_counts()

### Why bin quantitative values?

In [None]:
# Feature: position of each training pickup along the direction (1, 2.6).
x = X_train @ np.array([1, 2.6])
# Response: distance per unit duration scaled by 3600
# (presumably a speed per hour with duration in seconds -- TODO confirm units).
y = train['distance'] / train['duration'] * 60 * 60
# x and y are passed by keyword: seaborn 0.11 deprecated passing them
# positionally, and later releases accept only `data` as a positional argument.
sns.regplot(x=x, y=y);

In [None]:
# Replace the quantitative x by n quantile bins labelled 0 .. n-1 and show the
# distribution of y inside each bin.
n = 4
x_cut = pd.qcut(x, n, labels=np.arange(n))
# x and y are passed by keyword: seaborn 0.11 deprecated passing them
# positionally, and later releases accept only `data` as a positional argument.
sns.boxplot(x=x_cut, y=y);

In [None]:
from sklearn.linear_model import LinearRegression

# Least-squares fit of y on the single feature x; reshape(-1, 1) turns the 1-D
# array into the one-column matrix that fit/predict are given here.
model = LinearRegression().fit(x.reshape(-1, 1), y)
y_hat = model.predict(x.reshape(-1, 1))
# Root-mean-squared error of the fit on the training data
np.average((y_hat - y) ** 2) ** 0.5

In [None]:
# Mean of y within each bin of x_cut (one row per bin, indexed by bin label).
means = (
    pd.DataFrame({'x_cut': x_cut, 'y': y})
    .groupby('x_cut')
    .mean()
)
means

In [None]:
# Predict, for every trip, the mean y of the bin it falls in, then report the
# root-mean-squared error of those predictions
y_hat = means.loc[x_cut, 'y'].values
np.average((y_hat - y)**2) ** 0.5

In [None]:
# Two-column feature matrix: x in the first column and x squared in the second
x_and_x2 = np.vstack([x, x*x]).T
x_and_x2

In [None]:
from sklearn.linear_model import LinearRegression

# Least-squares fit of y on the two features [x, x^2], followed by the
# root-mean-squared error on the training data. (The model was previously
# constructed twice in a row; one construction is enough.)
model = LinearRegression()
model.fit(x_and_x2, y)
y_hat = model.predict(x_and_x2)
np.average((y_hat - y)**2) ** 0.5