# Using embeddings to represent weekdays
(actually mostly Python tricks I've learned)



### Dataset
* A count of bicycle rides across the East River bridges
* https://www.kaggle.com/new-york-city/nyc-east-river-bicycle-crossings 

        

### Task
* Represent weekdays by using categorical embeddings
* Effectively a Python version of: http://flovv.github.io/Embeddings_with_keras/

In [41]:
%matplotlib notebook
import keras
from keras import Sequential
from keras.layers import Embedding
from keras.layers import Dense, Dropout, Activation, Flatten

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

import pandas as pd
import numpy as np

# Star import brings ggplot, aes, geom_* etc. into the notebook namespace.
from plotnine import *
from plotnine import options
# Default figure size applied to every plotnine plot below, as (width, height).
options.set_option('figure_size' , (10,6))

    
import matplotlib.pyplot as plt

import seaborn as sns

from pandas.api.types import CategoricalDtype
import warnings
# Suppress ALL warnings for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation and chained-assignment warnings
# raised by the libraries used below, so genuine problems may go unnoticed.
warnings.filterwarnings('ignore')

In [42]:
# Load the daily bicycle counts and derive weekday features from the date.
df = pd.read_csv("~/.kaggle/datasets/new-york-city/nyc-east-river-bicycle-crossings/nyc-east-river-bicycle-counts.csv")
df['date'] = pd.to_datetime(df['Date'])
# Integer weekday code from pandas (Monday=0 ... Sunday=6); used later as the
# index into the embedding layer.
df['weekday'] = df['date'].dt.weekday
# `.dt.weekday_name` was deprecated in pandas 0.23 and removed in 1.0;
# `.dt.day_name()` returns the same English day names.
df['weekday_name'] = df['date'].dt.day_name()

# Column whose counts are aggregated per weekday for the bar charts.
bridge = 'Total'
by_day = df.groupby('weekday_name')[[bridge]].sum()

## ggplot2 in python!

In [43]:
from plotnine import *

# Bar chart of summed crossings per weekday. At this point `weekday_name` is a
# plain string column, so the bars are presumably drawn in alphabetical order.
(
    ggplot(by_day.reset_index(), aes(x='weekday_name', y=bridge))
    + geom_bar(stat='identity')
)

<IPython.core.display.Javascript object>

<ggplot: (-9223363250860512403)>

### Ordered categorical variable in pandas

In [45]:
from pandas.api.types import CategoricalDtype

# Day names in calendar order; this list defines the category order and is
# reused later as column labels.
weekdays = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
weekday_type = CategoricalDtype(weekdays, ordered=True)

# Convert the string column to the ordered categorical, then rebuild the
# per-weekday totals from the converted column.
df['weekday_name'] = df['weekday_name'].astype(weekday_type)
by_day = df.groupby('weekday_name')[[bridge]].sum()

In [46]:
# Same bar chart as before, now built from the ordered categorical weekday.
(
    ggplot(by_day.reset_index(), aes(x='weekday_name', y=bridge))
    + geom_bar(stat='identity')
)

<IPython.core.display.Javascript object>

<ggplot: (-9223363250860575656)>

### Pre-processing

In [47]:
# Training target for the embedding model: rides over the Brooklyn Bridge.
df['users'] = df['Brooklyn Bridge']

# Keep only rows with a positive count. The explicit .copy() makes the result
# an independent frame, so the column assignment below is not a chained
# assignment on a slice of the original (SettingWithCopyWarning).
df = df[df['users'] > 0].copy()
# Standardize the target: zero mean, unit (population, ddof=0) std.
df['scaled_users'] = (df['users'] - np.mean(df['users']))/np.std(df['users'])

# Width of the weekday embedding and the column names D1..D<emb_size> used for
# the embedding dimensions further down.
emb_size = 3
embedding_names = [f'D{x+1}' for x in np.arange(emb_size)]

In [48]:
# Small regression network: weekday code -> embedding vector -> two hidden
# ReLU layers -> single linear output. input_dim=7 gives one embedding row per
# weekday code (0..6); the layer is named so its weights can be fetched later.
model = keras.Sequential([
    Embedding(input_dim=7, output_dim=emb_size, input_length=1, name="embedding"),
    Flatten(),
    Dense(units=40, activation='relu'),
    Dense(units=10, activation='relu'),
    Dense(units=1),
])
model.compile(loss='mse', optimizer='sgd')

In [49]:
# Train on the integer weekday code to predict the standardized Brooklyn Bridge
# count: 50 epochs, mini-batches of 2 rows. `hh` holds the value returned by
# fit(); it is not used again in the cells shown here.
hh = model.fit(x=df[['weekday']], y=df[['scaled_users']], epochs=50,batch_size=2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [50]:
# Fetch the layer named "embedding" and take its first weight array, which is
# then indexed by weekday code: one row per code, one column per dimension.
mm = model.get_layer('embedding')
emb_matrix = mm.get_weights()[0]
# Last expression of the cell: displays the raw weight matrix.
emb_matrix

array([[-0.05626222,  0.04770591,  0.20970954],
       [ 0.20515893, -0.17663999, -0.10393152],
       [-0.4766545 ,  0.39117068,  0.14561543],
       [-0.11553285,  0.07674664,  0.1321006 ],
       [-0.05854096,  0.02401886, -0.01307055],
       [ 0.4257923 , -0.3092957 , -0.31917801],
       [ 0.22764297, -0.15471804, -0.11441667]], dtype=float32)

In [51]:
# Embedding lookup table: one row per weekday code (0..6), columns D1..D<emb_size>.
emp_df = pd.DataFrame(emb_matrix, columns = embedding_names)
emp_df['weekday'] = np.arange(0,7)

# Attach the learned embedding to every observation via its weekday code.
# Embedding columns left over from a previous run of this cell are dropped
# first; otherwise a re-run would produce suffixed duplicates (D1_x, D1_y, ...)
# and the column selection below would fail with a KeyError.
df = pd.merge(df.drop(columns=embedding_names, errors='ignore'), emp_df, on = 'weekday')
# One-hot (dummy) encoding of the weekday as the baseline representation.
dummyw = pd.get_dummies(df['weekday_name'])
df_X = pd.concat([df, dummyw], ignore_index= False, axis = 1)

# Regression target: standardized Williamsburg Bridge count (population std).
# Note that this is a different bridge from the one the embedding was trained on.
y = (df_X['Williamsburg Bridge'] - np.mean(df_X['Williamsburg Bridge']))/np.std(df_X['Williamsburg Bridge'])

# Keep only the two competing feature sets.
all_x = embedding_names + weekdays
df_X = df_X[all_x]

# NOTE: `model` is rebound here from the Keras network to a linear regression;
# cells that need the network must be run before this one.
model = LinearRegression()
dummy_x = df_X[weekdays]
emb_x = df_X[embedding_names]

In [52]:
# 100 random 80/20 train/test splits with a fixed seed; the same splitter is
# used for both feature sets.
bootstrap = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

dummy_scores = cross_val_score(
    model, dummy_x, y, scoring="neg_mean_squared_error", cv=bootstrap
)
emb_scores = cross_val_score(
    model, emb_x, y, scoring="neg_mean_squared_error", cv=bootstrap
)

# The scorer returns negated MSE, so flip the sign to get MSE per split.
scores = -pd.DataFrame({'dummy': dummy_scores, 'embedded': emb_scores})
scores.mean()

dummy       0.936436
embedded    0.908810
dtype: float64

In [59]:
# Density of per-split MSE: dummy encoding in the default colour, embedding in red.
(
    ggplot(scores, aes('dummy'))
    + geom_density()
    + geom_density(aes('embedded'), color='red')
)

<IPython.core.display.Javascript object>

<ggplot: (8785994180030)>

In [61]:
# Pairwise correlation between weekdays, computed across the embedding
# dimensions: after transposing, each column holds one weekday's vector.
corr_mat = emp_df.drop(columns='weekday').T.corr()

# Label both axes with the day names; the row index is named 'weekdays_name'.
corr_mat.columns = weekdays
corr_mat.index = pd.Index(weekdays, name='weekdays_name')

corr_mat



Unnamed: 0_level_0,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
weekdays_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Monday,1.0,-0.675453,0.600275,0.907152,0.441099,-0.803674,-0.735102
Tuesday,-0.675453,1.0,-0.995228,-0.92304,-0.95973,0.98165,0.996454
Wednesday,0.600275,-0.995228,1.0,0.881096,0.982562,-0.958359,-0.983489
Thursday,0.907152,-0.92304,0.881096,1.0,0.777797,-0.979462,-0.952135
Friday,0.441099,-0.95973,0.982562,0.777797,1.0,-0.88855,-0.93269
Saturday,-0.803674,0.98165,-0.958359,-0.979462,-0.88855,1.0,0.994214
Sunday,-0.735102,0.996454,-0.983489,-0.952135,-0.93269,0.994214,1.0


In [63]:
# Axes3D is never referenced by name below; presumably imported for its side
# effect of registering the '3d' projection on older matplotlib — TODO confirm.
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of the weekday embedding: one point per weekday, coloured by its
# integer weekday code.
fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')
ax.scatter(xs = emp_df['D1'], ys = emp_df['D2'], zs = emp_df['D3'], c = emp_df['weekday'])

# Label each point with its day name. iterrows() yields (index, row); the index
# is used directly as the position in `weekdays`, which relies on emp_df having
# its default 0..6 integer index.
for row_num, day in emp_df.iterrows():
    ax.text(x = day['D1'] , y = day['D2'], z = day['D3'], s = weekdays[row_num])

plt.show()

<IPython.core.display.Javascript object>

# Questions?

