In [3]:
import pandas as pd
import numpy as np
from helpers import load_all_processed_data

pd.set_option('display.max_columns', None)

In [4]:
lin_model_df = pd.read_csv('Models/Models/model_station_1_rlm_full.csv').set_index('feature').transpose()

In [3]:
df = load_all_processed_data()

In [4]:
# fill in missing values used by linear models with per station and weekhour average
df['bikes_3h_ago'] = df.groupby(['station', 'weekhour']).bikes_3h_ago.transform(lambda x: x.fillna(x.mean()))
df['full_profile_3h_diff_bikes'] = df.groupby(['station', 'weekhour']).full_profile_3h_diff_bikes.transform(lambda x: x.fillna(x.mean()))
df['full_profile_bikes'] = df.groupby(['station', 'weekhour']).full_profile_bikes.transform(lambda x: x.fillna(x.mean()))
df['short_profile_bikes'] = df.groupby(['station', 'weekhour']).short_profile_bikes.transform(lambda x: x.fillna(x.mean()))
df['short_profile_3h_diff_bikes'] = df.groupby(['station', 'weekhour']).short_profile_3h_diff_bikes.transform(lambda x: x.fillna(x.mean()))

In [5]:
# returns linear model in form of a function to call on a dataframe returning a nparray with a value per row in dataframe
def lin_model(model_df):
    coeffs = model_df.to_dict()
    
    def f(X):
        y = 0
        for key, value in coeffs.items():
            if key == '(Intercept)':
                y = y + value['weight']
            else:
                y = y + X[key] * value['weight'] 
        return  y
    
    return f

In [6]:
lin_model(lin_model_df)(df)

0         2.155123
1         1.602163
2         1.974613
3         1.788947
4         1.902293
           ...    
55795    11.134901
55796    10.066212
55797     9.129010
55798    10.883649
55799     7.466786
Length: 55800, dtype: float64

In [7]:
df.iloc[55799]

station                                220
latitude                           39.4584
longitude                        -0.344496
numDocks                                15
timestamp                      1.41479e+09
year                                  2014
month                                   10
day                                     31
hour                                    23
weekday                             Friday
weekhour                               120
isHoliday                                0
windMaxSpeed.m.s                       4.8
windMeanSpeed.m.s                      1.6
windDirection.grades                 157.5
temperature.C                         19.1
relHumidity.HR                          86
airPressure.mb                      1024.1
bikes_3h_ago                            12
full_profile_3h_diff_bikes           -2.75
full_profile_bikes                       6
short_profile_3h_diff_bikes          -2.75
short_profile_bikes                      6
bikes      

In [8]:
lin_model_df.to_dict()

{'(Intercept)': {'weight': 0.240571911655513},
 'bikes_3h_ago': {'weight': 0.519823466637759},
 'full_profile_bikes': {'weight': 0.342836970019425},
 'full_profile_3h_diff_bikes': {'weight': 0.388614128668747}}

In [9]:
0.240571911655513 + 0.519823466637759*12 + 0.342836970019425*6 + 0.388614128668747*-2.75

7.466786477586117

In [6]:
modelled_stations = np.arange(1, 201)

In [7]:
model_types = ['full_temp', 'full', 'short_full_temp', 'short_full', 'short_temp', 'short']

In [8]:
model_names = [f'model_station_{station_id}_rlm_{model_type}' for station_id in modelled_stations for model_type in model_types]

In [13]:
from tqdm import tqdm
for model_name in tqdm(model_names):
    model_df = pd.read_csv(f'Models/Models/{model_name}.csv').set_index('feature').transpose()
    df[model_name] = lin_model(model_df)(df)

100%|██████████| 1200/1200 [00:11<00:00, 104.34it/s]


In [14]:
df['models_min'] = df[model_names].min(axis=1)
df['models_mean'] = df[model_names].mean(axis=1)
df['models_max'] = df[model_names].max(axis=1)
df['models_std'] = df[model_names].std(axis=1)

In [21]:
for station_id in np.arange(201, 276):
    df[df.station == station_id][['station', 'bikes']+model_names].to_csv(f'Processed/phase2/phase_2_station_{station_id}.csv', index=False)
    df[df.station == station_id].to_csv(f'Processed/phase3/phase_3_station_{station_id}.csv', index=False)

### Test Data

In [9]:
tdf = pd.read_csv('test.csv')

In [10]:
from tqdm import tqdm
for model_name in tqdm(model_names):
    model_df = pd.read_csv(f'Models/Models/{model_name}.csv').set_index('feature').transpose()
    tdf[model_name] = lin_model(model_df)(tdf)

100%|██████████| 1200/1200 [00:05<00:00, 224.05it/s]


In [12]:
tdf[['station', 'bikes']+model_names].to_csv(f'Processed/phase2/test2.csv', index=False)
tdf.to_csv(f'Processed/phase3/test3.csv', index=False)

KeyError: "['bikes'] not in index"