## Notebook for making hour2vec and dow2vec

In [1]:
import pandas as pd
import numpy as np
import re
import _pickle as pickle
from haversine import haversine
import matplotlib.pyplot as plt
%matplotlib inline
from common import load_clean_train, load_clean_test, load_midpoint_id, extract_dates, add_midpoint_loc_id, tt_join_city_stats2clean, load_clean_parking

In [7]:
# SVD
from sklearn.decomposition import NMF

def NMF_factorize(X, n=5):
    msk = X.isna().values
    X = X.fillna(0)
    X_imputed = X.copy()
   # Initializing model
    nmf_model = NMF(n_components=n)
    nmf_model.fit(X_imputed.values)
    W = nmf_model.fit_transform(X_imputed.values)
    
    # iterate model
    while nmf_model.reconstruction_err_**2 > 10:
        W = nmf_model.fit_transform(X_imputed.values)
        X_imputed.values[~msk] = W.dot(nmf_model.components_)[~msk]
        print(nmf_model.reconstruction_err_)
    
    
    return W, nmf_model.components_


def fill_matrix_w_df(mtx, df):
    placeholder = mtx.copy()
    rows, cols = df.shape
    for i in range(rows):
        hour = df.loc[i,'hour']
        #print('-========', hour)
        for j in range(1,cols):
            day = df.columns[j]
            #print(hour, day, j)
            if not pd.isnull(df.iloc[i,j]):
                placeholder.iloc[hour-1,day] = df.iloc[i,j]
    return placeholder


def make_vectors(train_df, parking_df, snr_df, specific_loc=60, street_name='MISSION ST'):
    placeholder = np.empty((24,7))
    placeholder[:] = np.nan
    placeholder_df = pd.DataFrame(placeholder)
        
    # training
    spec_trn = train_df[train_df['loc_id']==specific_loc].copy()
    s = spec_trn.groupby(['dow','hour','day', 'mon'])['Real.Spots'].mean().reset_index()
    s2 = s.groupby(['dow','hour'])['Real.Spots'].mean().reset_index()
    s4 = s2.pivot(index='hour', columns='dow', values='Real.Spots').reset_index()
    
    # parking
    spec_park = parking_df[parking_df['loc_id']==specific_loc]
    s = spec_park.groupby(['dow','hour','day','month']).size().reset_index()
    max_spots = np.max(s[0])
    s[0] = max_spots - s[0] 
    s2 = s.groupby(['dow','hour'])[0].mean().reset_index()
    s3 = s2.pivot(index='hour', columns='dow', values =0).reset_index()
    s3.fillna(max_spots,inplace=True)
    
    #SENSOR
    if street_name == 'GEARY ST':
         street_name = 'MONTGOMERY ST'
    s5 = snsr_df[snsr_df['STREET_NAME']==street_name].groupby(['dow','hour'])['vac_spaces'].mean().reset_index()
    s5 = s5.pivot(index='hour', columns = 'dow', values = 'vac_spaces').reset_index()
    
    pl0 = fill_matrix_w_df(placeholder_df, s5)
    pl1 = fill_matrix_w_df(pl0, s3)
    pl2 = fill_matrix_w_df(pl1, s4)

    pl2 = pl2[(pl2.T != 0).any()]
    pl2.dropna(axis=0,how='all',inplace=True)
    pl2.dropna(axis=1,how='all',inplace=True)
    hour_row_labels = pl2.index
    dow_col_labels = pl2.columns.values
    hour_vec, dow_vec = NMF_factorize(pl2, n = 12)
    return hour_vec, dow_vec, hour_row_labels, dow_col_labels

def make_all_vectors(train_df, parking_df, snr_df):
    locid_hour2vec = None
    locid_dow2vec = None
    
    print('gathering ids and streets....')
    loc_ids = train_df.groupby(['loc_id','Street']).size().reset_index()
    loc_ids = loc_ids[['loc_id','Street']].values
    loc_ids = {k : v.upper().replace('STREET','ST').replace('Avenue', 'AVE') for k,v in loc_ids}
    all_vecs = {}
    
    for loc_id, street_name in loc_ids.items():
        print('processing ... %d %s' % (loc_id, street_name))
        hour_vec, dow_vec, hours, dow = make_vectors(train_df, parking_df, snr_df, specific_loc=loc_id, street_name=street_name)
        
        block = pd.DataFrame(hour_vec)
        block['hour'] = hours
        block['loc_id'] = loc_id
        if locid_hour2vec is not None:
            locid_hour2vec = pd.concat([locid_hour2vec, block])
        else:
            locid_hour2vec = block


        block = pd.DataFrame(dow_vec.T)
        block['dow'] = dow
        block['loc_id'] = loc_id
        if locid_dow2vec is not None:
            locid_dow2vec = pd.concat([locid_dow2vec, block])
        else:
            locid_dow2vec = block
    
    print('reordering for output')
    locid_hour2vec = locid_hour2vec[['loc_id','hour',0,1,2,3,4,5,6,7,8,9,10,11]].reset_index()
    locid_hour2vec.drop(columns='index', inplace=True)
    locid_hour2vec.columns = ['loc_id','hour'] + ['hourvec%d' % i for i in range(12)] 

    locid_dow2vec = locid_dow2vec[['loc_id','dow',0,1,2,3,4,5,6,7,8,9,10,11]].reset_index()
    locid_dow2vec.drop(columns='index', inplace=True)
    locid_dow2vec.columns = ['loc_id','dow'] + ['dowvec%d' % i for i in range(12)]
    
    locid_hour2vec.to_feather('ref_data/parking_locid_hour2vec.feather')
    locid_dow2vec.to_feather('ref_data/parking_locid_dow2vec.feather')
    
    print('shapes are %s %s' %(str(locid_dow2vec.shape), str(locid_hour2vec.shape)))
    return locid_hour2vec, locid_dow2vec
    

In [8]:
def add_yelp(input_df):
    tmp = input_df.copy()
    yelp_summary_stats = pd.read_feather('ref_data/yelp_summary_stats_df_by_location.feather')
    tmp = tmp.merge(yelp_summary_stats, how='left', on = 'loc_id')
    return tmp

train_df = load_clean_train()
test_df = load_clean_test()
print(train_df.shape, test_df.shape)

train_df = extract_dates(train_df)
test_df = extract_dates(test_df)
print(train_df.shape, test_df.shape)

train_df = add_midpoint_loc_id(train_df)
test_df = add_midpoint_loc_id(test_df)
print(train_df.shape, test_df.shape)

train_df = tt_join_city_stats2clean(train_df)
test_df = tt_join_city_stats2clean(test_df)
print(train_df.shape, test_df.shape)

train_df = add_yelp(train_df)
test_df = add_yelp(test_df)
print(train_df.shape, test_df.shape)

train_df['STREET'] = train_df['Street'].map(lambda x : x.replace(' ','').lower())

(1100, 18) (726, 16)
(1100, 26) (726, 24)
(1100, 27) (726, 25)
(1100, 31) (726, 29)
(1100, 316) (726, 314)


In [9]:
snsr_df = pd.read_feather('/Users/timlee/data/sf_parking/sensor_small.feather')

In [10]:
parking_df = load_clean_parking()

In [11]:
hour2vec, dow2vec = make_all_vectors(train_df, parking_df, snsr_df)
# with open('ref_data/composite_dow_hour_vecs.pkl', 'wb') as f:
#     pickle.dump(all_vecs, f)

gathering ids and streets....
processing ... 0 MISSION ST
processing ... 1 23RD ST
processing ... 2 23RD ST
processing ... 3 23RD ST
processing ... 4 23RD ST
processing ... 5 BRYANT ST
processing ... 6 MISSION ST
processing ... 7 VAN NESS AVENUE
processing ... 8 VAN NESS AVENUE
processing ... 9 MISSION ST
processing ... 10 POLK ST
processing ... 11 GROVE ST
processing ... 12 GROVE ST
processing ... 13 LARKIN ST
processing ... 14 MISSION ST
processing ... 15 LARKIN ST
processing ... 16 VAN NESS AVENUE
processing ... 17 REDWOOD ST
processing ... 18 HYDE ST
processing ... 19 VAN NESS AVENUE
processing ... 20 POLK ST
processing ... 21 VAN NESS AVENUE
processing ... 22 LARKIN ST
processing ... 23 LEAVENWORTH ST
processing ... 24 POLK ST
processing ... 25 TAYLOR ST
processing ... 26 LEAVENWORTH ST
processing ... 27 POLK ST
processing ... 28 JONES ST
processing ... 29 VAN NESS AVENUE
processing ... 30 JONES ST
processing ... 31 POLK ST
processing ... 32 LARKIN ST
processing ... 33 VAN NESS AV

In [14]:
hour2vec

Unnamed: 0,loc_id,hour,hourvec0,hourvec1,hourvec2,hourvec3,hourvec4,hourvec5,hourvec6,hourvec7,hourvec8,hourvec9,hourvec10,hourvec11
0,0,0,9.119143,2.718641,0.874219,2.012421,2.182592,0.789806,0.852255,0.641476,0.008458,1.029036,0.406243,0.303415
1,0,1,8.480153,5.863092,1.692448,2.021654,4.345150,0.852672,1.107530,0.000000,0.000000,0.021098,0.709856,0.000000
2,0,2,7.938739,6.246981,2.284527,1.846010,4.946104,0.393758,1.032027,0.000000,0.000000,0.071083,1.528047,0.000000
3,0,3,8.207523,6.812406,1.954575,1.811711,4.343911,0.865961,1.074725,0.005643,0.000000,0.106537,1.504019,0.000000
4,0,4,8.188204,6.661636,1.688898,1.704350,3.604483,1.093660,1.022343,0.063359,0.000000,0.265008,1.397764,0.000000
5,0,5,4.543130,2.604558,0.414184,1.415178,0.360974,1.979661,0.773205,1.271179,1.413552,1.175982,1.645599,0.538137
6,0,6,6.096293,3.738880,1.321281,0.888846,0.972857,1.112900,0.756093,0.576472,0.002622,1.001987,1.867502,0.061004
7,0,7,3.069016,5.161645,2.312407,0.506469,0.385952,0.987351,1.371274,0.820134,0.551722,0.940270,0.888449,0.680729
8,0,8,0.534181,0.000000,0.950521,0.000000,0.049142,0.662786,0.220101,2.521513,0.575027,1.089210,0.000000,1.588881
9,0,9,2.575601,3.846349,0.000000,0.215281,2.517342,0.842446,0.066997,0.300513,1.255121,0.992725,1.806122,0.163605


In [76]:
hour_vec.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23])

In [77]:
dow_vec.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6])

In [70]:
hour_vec[0]

array([  6.06144234e+00,   1.62914441e+00,   3.28608797e-03,
         2.17193392e+00,   7.65132819e-01,   3.64996638e-01,
         1.17455367e+00,   2.68091736e+00])

In [71]:
dow_vec[1]

array([ 0.73004767,  0.        ,  1.6044187 ,  1.77178773,  0.66050069,
        0.        ,  0.96394066,  0.06518009])

In [249]:
train_df.groupby('neighborhood').size()

neighborhood
chinatown             15
civiccenter           98
fillmoredistrict      84
financialdistrict    138
lowernobhill          73
missiondistrict      114
nobhill              139
northbeach            10
pacificheights        45
polkgulch             94
russianhill           35
southofmarket         46
tenderloin           159
westernaddition       50
dtype: int64

In [250]:
rez = make_vectors(train_df, parking_df, 84)

9 7
4 5


In [251]:
rez

{84: {'dow': {0: array([ 0.        ,  0.        ,  0.91653008,  0.        ,  0.01845601,
           1.12344613]),
   1: array([ 0.38461266,  0.48616891,  0.        ,  0.60562896,  0.68642017,
           0.05074085]),
   2: array([  8.22141225e-01,   0.00000000e+00,   1.51920527e+00,
            7.93355358e-04,   2.37141253e-01,   0.00000000e+00]),
   3: array([  1.47277574e+00,   1.48684889e-04,   0.00000000e+00,
            0.00000000e+00,   6.40693624e-01,   5.55292763e-01]),
   4: array([  0.00000000e+00,   0.00000000e+00,   3.14445598e-01,
            3.77871977e-04,   1.72654478e+00,   1.07640252e+00]),
   5: array([ 1.61517787,  1.07983578,  0.31658781,  0.59228623,  0.53811442,  0.        ])},
  'hours': {8: array([  2.55319708e+00,   2.93503229e+00,   3.59081861e-01,
            3.38195684e-01,   3.62099455e-01,   1.63573644e-03,
            8.25049656e-01,   2.69402241e-01]),
   9: array([ 1.04907873,  1.90687311,  1.32431972,  1.23915182,  0.        ,
           0.0840998 ,  

In [233]:
lbls2

array([0, 1, 2, 3, 4, 5])

In [232]:
lbls

Int64Index([8, 9, 10, 11, 12, 13, 14, 15, 16], dtype='int64')

In [227]:
pd.DataFrame(np.dot(hour_vec,dow_vec))

Unnamed: 0,0,1,2,3,4,5
0,2.000135,2.500266,3.000063,2.999826,2.999745,4.000006
1,3.998466,2.998174,2.999855,4.002592,2.503266,2.9983
2,2.999773,2.666336,1.999982,3.00047,2.500593,2.999653
3,4.000675,2.000557,3.999981,2.998978,1.998876,2.000681
4,2.0003,0.000521,4.00002,2.99951,0.499476,3.00025
5,4.000357,3.000399,2.000029,2.49949,2.999339,1.000358
6,2.999928,8e-06,3.000013,7.9e-05,3.000047,3.999951
7,3.000749,2.501188,0.001378,2.99855,1.498022,2.001238
8,0.000397,3.000157,2.000063,1.999655,2.999658,4.000171


### Test range

In [192]:
bbb = test_df.groupby(['dow','hour']).size().reset_index()
bbb.pivot(index='hour', columns = 'dow', values = 0)

dow,1,2,3,4,5,6
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7,,,,,,6.0
8,,21.0,,11.0,,16.0
9,6.0,15.0,,19.0,2.0,9.0
10,17.0,9.0,,22.0,15.0,10.0
11,8.0,28.0,1.0,23.0,6.0,10.0
12,5.0,12.0,1.0,5.0,,
13,4.0,,9.0,1.0,7.0,
14,2.0,30.0,4.0,1.0,,
15,3.0,26.0,,,,
16,1.0,32.0,,10.0,,


In [173]:
s5 = train_df[train_df['neighborhood'] == 'tenderloin'].groupby(['dow','hour'])[['Real.Spots']].mean().reset_index()
s5.pivot(index='hour', columns='dow')

Unnamed: 0_level_0,Real.Spots,Real.Spots,Real.Spots,Real.Spots,Real.Spots,Real.Spots,Real.Spots
dow,0,1,2,3,4,5,6
hour,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
7,1.5,,,,,,
8,,,,,,0.5,
9,1.25,0.0,1.0,,,2.5,
10,,,,,,1.0,
11,,0.5,,,,,
12,,,,,,,0.0
13,2.071429,,0.0,0.0,,8.0,
14,,,,0.75,,2.666667,
15,9.666667,,0.0,,,10.4,
16,2.8,,,,,,


In [4]:
train_df.groupby('loc_id').size().sort_values(ascending=False)

loc_id
8     48
16    43
79    25
19    25
61    25
60    23
75    23
4     21
46    20
68    20
0     20
2     19
3     19
7     19
87    19
5     18
1     17
71    17
70    17
14    17
40    16
89    16
21    16
72    15
33    14
49    14
62    14
52    14
29    14
57    14
      ..
38     7
55     7
90     7
22     7
69     6
67     6
83     6
54     6
84     6
91     6
51     6
48     6
44     6
42     6
86     6
85     6
56     6
30     6
45     6
12     5
64     5
58     5
74     5
88     5
77     5
82     5
63     5
73     4
66     4
17     2
Length: 93, dtype: int64

In [5]:
specific_loc=74#60 / 74 
spec_loc = train_df[train_df['loc_id']==specific_loc].copy()
print(spec_loc.head(2))
s = spec_loc.groupby(['dow','hour'])['Real.Spots'].mean().reset_index()
s.pivot(index='hour', columns='dow', values='Real.Spots')

             Street            From                 To       Date   Time  \
376  Battery Street  Halleck Street  California Street 2014-02-10  17:45   
401  Battery Street  Halleck Street  California Street 2014-02-11  12:29   

     Real.Spots  Street.Length  any_spot    Clean_Street      Clean_From  \
376           7       54.14309         1  Battery Street  Halleck Street   
401           0       54.14309         0  Battery Street  Halleck Street   

         ...       ycat_wraps  ycat_yelpevents  10-100  100-250  1000-2000  \
376      ...              1.0              0.0     163      107         23   
401      ...              1.0              0.0     163      107         23   

     2000+  250-500 500-1000  less_10         STREET  
376      5       77       38      106  batterystreet  
401      5       77       38      106  batterystreet  

[2 rows x 317 columns]


dow,0,1,5
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7,0.0,,
8,,,0.0
11,0.0,,
12,,0.0,
17,7.0,,


In [6]:
specific_loc=60
spec_trn = train_df[train_df['loc_id']==specific_loc].copy()
s = spec_trn.groupby(['dow','hour'])['Real.Spots'].mean().reset_index()
s.pivot(index='hour', columns='dow', values='Real.Spots')

dow,0,2,3,4,5,6
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7,,,,,,3.0
9,,,,,,3.0
13,,,,4.0,9.0,
15,,,,,13.0,
16,1.5,1.0,0.0,,,8.0
17,,,,0.0,,
18,,,0.0,4.5,,
19,,,4.0,,,
20,,,,4.0,0.0,
21,,,,0.0,,


### Parking Matrix

In [7]:
parking_df = load_clean_parking()

In [8]:
spec_park = parking_df[parking_df['loc_id']==79]
s = spec_park.groupby(['dow','hour','day','month']).size().reset_index()
max_spots = s[0].max()
s = s.groupby(['dow','hour'])[0].mean().reset_index()        
s2 = s.pivot(index='hour', columns='dow', values =0).reset_index()
s2

dow,hour,0,1,2,3,4,5,6
0,8,1.0,1.333333,1.0,,1.0,1.0,
1,9,2.333333,2.5,2.0,3.0,3.666667,3.666667,
2,10,4.0,4.0,4.0,3.666667,3.333333,4.0,
3,11,3.0,3.5,2.333333,2.5,3.333333,5.333333,
4,12,2.666667,3.75,3.0,1.333333,5.0,6.0,
5,13,2.666667,4.0,3.0,2.0,5.0,5.0,2.0
6,14,1.333333,3.75,2.5,2.0,4.333333,4.333333,1.0
7,15,3.0,2.75,3.333333,3.0,3.666667,3.333333,
8,16,2.333333,1.75,8.0,4.0,2.333333,2.5,
9,17,3.0,2.25,1.5,5.333333,4.0,2.666667,


In [9]:
locid_hour2vec = None
locid_dow2vec = None

for i in range(93):
    spec_id = i
    spec_park = parking_df[parking_df['loc_id']==spec_id]
    if spec_park.shape[0] > 0:
        s = spec_park.groupby(['dow','hour','day','month']).size().reset_index()
        max_spots = s[0].max()
        s = s.groupby(['dow','hour'])[0].mean().reset_index()        
        s2 = s.pivot(index='hour', columns='dow', values =0).reset_index()
        hours = s2['hour'].values

        s2 = max_spots - s2
        s2.drop(columns='hour', inplace=True)
        dow = list(s2.columns.values)
        hour_vec, dow_vec = NMF_factorize(s2, n = 12)

        block = pd.DataFrame(hour_vec)
        block['hour'] = hours
        block['loc_id'] = spec_id
        if locid_hour2vec is not None:
            locid_hour2vec = pd.concat([locid_hour2vec, block])
        else:
            locid_hour2vec = block


        block = pd.DataFrame(dow_vec.T)
        block['dow'] = dow
        block['loc_id'] = spec_id
        if locid_dow2vec is not None:
            locid_dow2vec = pd.concat([locid_dow2vec, block])
        else:
            locid_dow2vec = block

locid_hour2vec = locid_hour2vec[['loc_id','hour',0,1,2,3,4,5,6,7,8,9,10,11]].reset_index()
locid_hour2vec.drop(columns='index', inplace=True)
locid_hour2vec.columns = ['loc_id','hour'] + ['hourvec%d' % i for i in range(12)] 

locid_dow2vec = locid_dow2vec[['loc_id','dow',0,1,2,3,4,5,6,7,8,9,10,11]].reset_index()
locid_dow2vec.drop(columns='index', inplace=True)
locid_dow2vec.columns = ['loc_id','dow'] + ['dowvec%d' % i for i in range(12)]

#locid_hour2vec.to_feather('ref_data/parking_locid_hour2vec.feather')
#locid_dow2vec.to_feather('ref_data/parking_locid_dow2vec.feather')

In [10]:
locid_hour2vec.head()

Unnamed: 0,loc_id,hour,hourvec0,hourvec1,hourvec2,hourvec3,hourvec4,hourvec5,hourvec6,hourvec7,hourvec8,hourvec9,hourvec10,hourvec11
0,0,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.141574,0.0,0.0,0.908912,0.0
1,0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.246689,0.0,0.0,0.363427,0.0
2,0,8,3.709756,5.127013,4.780231,0.0,0.135352,1.622257,1.875739,0.080435,0.38936,0.0,0.0,1.207767
3,0,9,1.702784,1.765342,2.123713,7.3e-05,0.454871,1.111809,3.529719,0.057934,0.0,0.221811,1.17381,0.491005
4,0,10,4.07316,4.703089,3.264858,4e-05,0.385605,0.033502,0.615925,0.0,0.734755,0.322854,1.269693,0.676651


In [11]:
locid_dow2vec

Unnamed: 0,loc_id,dow,dowvec0,dowvec1,dowvec2,dowvec3,dowvec4,dowvec5,dowvec6,dowvec7,dowvec8,dowvec9,dowvec10,dowvec11
0,0,0,0.000000,9.919214e-01,0.423171,0.638141,0.000000e+00,1.686131,1.395086,0.000000,0.531341,0.943333,5.993336e-03,0.000429
1,0,1,0.000000,1.070876e-01,1.962715,0.030888,9.136925e-01,0.300488,0.317859,10.632492,1.944199,0.000000,2.049009e+00,0.000000
2,0,2,1.322147,0.000000e+00,0.579747,0.194002,2.481225e-01,0.560561,0.523157,0.000000,0.514264,1.966833,1.585410e-07,2.372975
3,0,3,0.000000,9.527776e-01,1.335654,0.388553,3.704479e-01,1.006470,0.000000,0.000000,1.398005,0.895071,0.000000e+00,0.000278
4,0,4,0.000000,8.528183e-01,1.402113,0.339874,1.951393e+00,0.565554,0.372788,0.000000,0.118102,3.019374,1.010462e-02,0.000621
5,0,5,0.000000,1.922314e+00,0.188351,0.372770,3.071104e+00,0.000000,1.150970,0.000071,0.000000,0.366804,0.000000e+00,0.000411
6,0,6,0.000000,0.000000e+00,0.000000,3.170156,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000
7,1,0,0.382737,1.955838e-01,0.000000,0.000000,0.000000e+00,1.838416,0.319826,0.955790,1.042020,0.000000,0.000000e+00,0.000000
8,1,1,0.597623,9.914977e-02,0.908290,0.792158,0.000000e+00,0.639915,0.248725,0.070582,0.515162,0.449775,9.396025e-02,0.000000
9,1,2,0.638111,0.000000e+00,0.258109,1.122543,1.394818e+00,0.000000,0.599680,0.003728,0.000000,0.599269,4.080201e-01,0.000000


In [12]:
spec_park = parking_df[parking_df['loc_id']==1]
s = spec_park.groupby(['dow','hour']).size().reset_index()
s.pivot(index='hour', columns='dow', values =0)

dow,0,1,2,3,4,5
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7,,,,,,1.0
8,,,1.0,1.0,,
9,1.0,5.0,2.0,1.0,3.0,
10,2.0,6.0,4.0,3.0,3.0,5.0
11,2.0,4.0,3.0,,5.0,7.0
12,1.0,3.0,2.0,2.0,1.0,4.0
13,4.0,4.0,1.0,1.0,4.0,2.0
14,,4.0,1.0,2.0,4.0,5.0
15,2.0,4.0,1.0,2.0,4.0,5.0
16,2.0,6.0,3.0,2.0,,6.0


### Combining vectors

In [50]:
placeholder = np.empty((24,7))
placeholder[:] = np.nan
placeholder_df = pd.DataFrame(placeholder)

In [51]:
specific_loc=60
spec_park = parking_df[parking_df['loc_id']==specific_loc]
s = spec_park.groupby(['dow','hour','day','month']).size().reset_index()
max_spots = np.max(s[0])
s[0] = max_spots - s[0] 
s2 = s.groupby(['dow','hour'])[0].mean().reset_index()
s3 = s2.pivot(index='hour', columns='dow', values =0).reset_index()
s3.fillna(max_spots,inplace=True)

In [52]:
s3

dow,hour,0,1,2,3,4,5,6
0,8,9.0,8.0,9.0,7.5,7.0,9.0,9.0
1,9,7.5,7.0,6.0,5.333333,7.333333,6.666667,9.0
2,10,6.0,5.75,4.666667,7.0,5.333333,6.0,9.0
3,11,6.333333,5.25,5.5,5.333333,7.0,6.666667,9.0
4,12,5.0,6.25,4.333333,7.0,4.666667,6.0,8.0
5,13,5.0,4.75,6.0,6.666667,6.333333,6.333333,9.0
6,14,5.666667,5.25,6.333333,4.666667,6.333333,3.666667,9.0
7,15,4.666667,3.25,7.5,6.333333,4.666667,5.333333,9.0
8,16,7.666667,5.75,6.333333,5.666667,5.666667,5.0,9.0
9,17,6.333333,6.333333,7.0,6.0,6.0,5.0,8.0


In [53]:
spec_trn = train_df[train_df['loc_id']==specific_loc].copy()
s = spec_trn.groupby(['dow','hour','day', 'mon'])['Real.Spots'].mean().reset_index()
s2 = s.groupby(['dow','hour'])['Real.Spots'].mean().reset_index()
s4 = s2.pivot(index='hour', columns='dow', values='Real.Spots').reset_index()

In [54]:
s4

dow,hour,0,2,3,4,5,6
0,7,,,,,,3.0
1,9,,,,,,3.0
2,13,,,,4.0,9.0,
3,15,,,,,13.0,
4,16,1.5,1.0,0.0,,,8.0
5,17,,,,0.0,,
6,18,,,0.0,4.5,,
7,19,,,4.0,,,
8,20,,,,4.0,0.0,
9,21,,,,0.0,,


In [58]:
def fill_matrix_w_df(mtx, df):
    placeholder = mtx.copy()
    rows, cols = df.shape
    print(rows, cols)
    for i in range(rows):
        hour = df.loc[i,'hour']
        #print('-========', hour)
        for j in range(1,cols):
            day = df.columns[j]
            #print(hour, day, j)
            if not pd.isnull(df.iloc[i,j]):
                placeholder.iloc[hour-1,day] = df.iloc[i,j]
    return placeholder


def make_vectors(train_df, parking_df, specific_loc=60):
    placeholder = np.empty((24,7))
    placeholder[:] = np.nan
    placeholder_df = pd.DataFrame(placeholder)
        
    # training
    spec_trn = train_df[train_df['loc_id']==specific_loc].copy()
    s = spec_trn.groupby(['dow','hour','day', 'mon'])['Real.Spots'].mean().reset_index()
    s2 = s.groupby(['dow','hour'])['Real.Spots'].mean().reset_index()
    s4 = s2.pivot(index='hour', columns='dow', values='Real.Spots').reset_index()
    
    # parking
    spec_park = parking_df[parking_df['loc_id']==specific_loc]
    s = spec_park.groupby(['dow','hour','day','month']).size().reset_index()
    max_spots = np.max(s[0])
    s[0] = max_spots - s[0] 
    s2 = s.groupby(['dow','hour'])[0].mean().reset_index()
    s3 = s2.pivot(index='hour', columns='dow', values =0).reset_index()
    s3.fillna(max_spots,inplace=True)
    
    pl1 = fill_matrix_w_df(placeholder_df, s3)
    pl2 = fill_matrix_w_df(pl1, s4)
    
    hour_vec, dow_vec = NMF_factorize(pl2, n = 6)
    print(hour_vec.shape,dow_vec.shape)
    return hour_vec, dow_vec
    

In [61]:
pl1 = fill_matrix_w_df(placeholder_df, s3)
pl2 = fill_matrix_w_df(pl1, s4)
hour_vec, dow_vec = NMF_factorize(pl2, n = 6)
print(hour_vec.shape,dow_vec.shape)
    

12 8
10 7
(24, 6) (6, 7)


In [57]:
placeholder_df

Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,,,,,,,
6,,,,,,,
7,,,,,,,
8,,,,,,,
9,,,,,,,


In [145]:
pl1

Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,,,,,,,
6,,,,,,,
7,,8.0,,7.5,7.0,,
8,7.5,7.0,6.0,5.333333,7.333333,6.666667,
9,6.0,5.75,4.666667,7.0,5.333333,6.0,


In [146]:
pl2

Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,,,,,,,
6,,,,,,,3.0
7,,8.0,,7.5,7.0,,
8,7.5,7.0,6.0,5.333333,7.333333,6.666667,3.0
9,6.0,5.75,4.666667,7.0,5.333333,6.0,


In [152]:
hour_vec, dow_vec = NMF_factorize(pl2, n = 6)

In [153]:
pd.DataFrame(np.dot(hour_vec,dow_vec))

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.250676,0.104997,0.0,0.01286,0.0,0.597712,2.847953
7,0.243699,8.270034,0.089477,7.533584,6.611909,0.085797,0.0
8,7.623669,7.114822,5.765593,5.350698,7.171315,6.786217,2.959963
9,5.754657,5.518594,5.126117,6.97478,5.658529,5.774571,0.077519
