In [1]:
import json
import pandas as pd

In [30]:
input = {
    'trip_id': 712382,
    'trip_start_time': "1/1/2017 0:00",
    'trip_stop_time': "1/1/2017 0:03",
    'trip_duration_seconds': 223,
    'from_station_id': 7051,
    "from_station_name": "Wellesley St E / Yonge St Green P",
    'to_station_id': 7089,
    "to_station_name": "Church St  / Wood St",
    "user_type": "Member",
}

In [12]:
input_series = pd.Series(input)
input_series

trip_id                                 1
trip_start_time          2022/1/1 9:40 pm
trip_stop_time           2022/1/1 9:55 pm
trip_duration_seconds                 200
from_station_id                       333
from_station_name                    pape
to_station_id                         444
to_station_name                     bloor
user_type                          Casual
dtype: object

In [7]:
type(input_series) == pd.Series

True

In [14]:
params = {}
params['axis'] = 1
print(type(params))
print(params.get('axis'))

<class 'dict'>
1


In [26]:
params.pop('axis')

1

In [28]:
foo = input
foo.pop(['trip_id', 'user_type'])

TypeError: unhashable type: 'list'

In [17]:
import logging
import sys
from typing import Union
def preprocess(
    df_bikes: Union[pd.DataFrame, pd.Series], 
    logger_level: int = logging.DEBUG):
    """Preprocesses the bikeshare data

    Converts the datetimes from str obj to datetime objects, and extracts
    the time of day to convert into hour floats

    This step is prior to feeding the arrays into the pipeline.
    """
    logging.basicConfig(stream=sys.stdout, level=logger_level)
    df_bikes["dt_start"] = pd.to_datetime(
        df_bikes["trip_start_time"],
        infer_datetime_format=True,
    )
    logging.debug(df_bikes["dt_start"])

    df_bikes["dt_end"] = pd.to_datetime(
        df_bikes["trip_stop_time"],
        infer_datetime_format=True,
    )
    logging.debug(df_bikes["dt_end"])
    # get day of week
    params = {}
    if type(df_bikes) == pd.DataFrame:
        logging.debug("dataframe accepted")
        params['axis'] = 1
        
    df_bikes["day_of_week"] = df_bikes.apply(
        lambda x: x["dt_start"].day_of_week, **params
    )
    # get hours
    df_bikes["start_hour"] = df_bikes.apply(
        lambda x: x["dt_start"].hour + x["dt_start"].minute / 60,
        **params,
    )
    df_bikes["end_hour"] = df_bikes.apply(
        lambda x: x["dt_end"].hour + x["dt_end"].minute / 60,
        **params,
    )
    df_bikes["target"] = df_bikes["user_type"].apply(lambda type: type == "Member")
    drops = [
        "trip_start_time",
        "trip_stop_time",
        "from_station_name",
        "to_station_name",
        "dt_start",
        "dt_end",
        "user_type",
    ]
    df_bikes = df_bikes.drop(drops, **params)
    return df_bikes

`apply()` doesn't really extend to Series; instead of selecting the columns for the transformation, `pd.Series.apply()` will simply apply to every element, since it assumes it's already in column form. Instead of only selecting the `dt_start`, it's trying to *subscript* each element of my `bikes` data array, which would not make sense for int fields like `trip_id` or `duration`.

Pass it as a pd.DataFrame.

Other problem, having issues constructing dataframe from dict, as the dict should have the following structure:

```py
data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
```

Whereas my sample data are not in a list

In [20]:
# This will fail
try:
    prep = preprocess(input_series)
except TypeError as e:
    print(e)

'int' object is not subscriptable


In [24]:
input_df = pd.DataFrame.from_dict(input)
prep = preprocess(input_df)

ValueError: If using all scalar values, you must pass an index

In [31]:
# from datetime import datetime
from datetime import datetime
date_fmt = """%d/%m/%Y %H:%M"""
dt_start = datetime.strptime(input['trip_start_time'], date_fmt)
dt_start

ValueError: '%' is a bad directive in format '%d/%m/% %H:%M'