In [2]:
import operator

from preprocess_data_for_streamlit import *

In [18]:
# Both CSV paths live in the sibling `datasets` directory, one level above the
# notebook's working directory.
# NOTE(review): DESTINATION_FILEPATH is not referenced by any cell shown here —
# presumably the preprocessed frame is meant to be written there; confirm.
DATASETS_DIR = os.path.join(os.pardir, 'datasets')
SOURCE_FILEPATH = os.path.join(DATASETS_DIR, 'train.csv')
DESTINATION_FILEPATH = os.path.join(DATASETS_DIR, 'train_preprocessed_streamlit.csv')

In [19]:
# Load the raw trips; both timestamp columns are parsed to datetimes on read.
DATETIME_COLS = ['pickup_datetime', 'dropoff_datetime']
df = pd.read_csv(SOURCE_FILEPATH, parse_dates=DATETIME_COLS)

In [20]:
# Clean data
print('Cleaning data...')

# Step 1: remove zero-passenger records.
df = drop_zero_records(df, ['passenger_count'])

# Step 2: remove statistical outliers (criteria live inside the helper).
df = drop_statistical_outliers(df)

# Step 3: remove further outliers by min/max against a rough NYC bounding box.
# NOTE(review): drop_minmax presumably discards rows outside [min, max] for the
# given column — confirm against the helper's definition.
NYC_MIN_LON, NYC_MAX_LON = -74.4, -73.4 # approx from google map
NYC_MIN_LAT, NYC_MAX_LAT = 40, 41.6 # approx from google map
coordinate_bounds = [
    ('pickup_latitude', NYC_MIN_LAT, NYC_MAX_LAT),
    ('pickup_longitude', NYC_MIN_LON, NYC_MAX_LON),
    ('dropoff_latitude', NYC_MIN_LAT, NYC_MAX_LAT),
    ('dropoff_longitude', NYC_MIN_LON, NYC_MAX_LON),
]
# Applied in the order listed, one coordinate column at a time.
for coord_col, coord_min, coord_max in coordinate_bounds:
    df = drop_minmax(df, coord_col, coord_min, coord_max)

Cleaning data...


In [21]:
# Inspect the cleaned frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1361116 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   id                  1361116 non-null  object        
 1   vendor_id           1361116 non-null  int64         
 2   pickup_datetime     1361116 non-null  datetime64[ns]
 3   dropoff_datetime    1361116 non-null  datetime64[ns]
 4   passenger_count     1361116 non-null  int64         
 5   pickup_longitude    1361116 non-null  float64       
 6   pickup_latitude     1361116 non-null  float64       
 7   dropoff_longitude   1361116 non-null  float64       
 8   dropoff_latitude    1361116 non-null  float64       
 9   store_and_fwd_flag  1361116 non-null  object        
 10  trip_duration       1361116 non-null  int64         
dtypes: datetime64[ns](2), float64(4), int64(3), object(2)
memory usage: 124.6+ MB


In [22]:
# Derive model features on top of the cleaned frame.
# NOTE(review): feature_eng_df is not defined in this notebook (presumably it
# comes from the star import); a later cell reads `log_trip_duration` from its
# result — confirm the full list of columns it adds.
print('Creating features...')
df = feature_eng_df(df)

Creating features...


In [23]:
# Re-inspect the frame after feature engineering to see the added columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1361116 entries, 0 to 1458643
Data columns (total 17 columns):
 #   Column                       Non-Null Count    Dtype         
---  ------                       --------------    -----         
 0   id                           1361116 non-null  object        
 1   vendor_id                    1361116 non-null  int64         
 2   pickup_datetime              1361116 non-null  datetime64[ns]
 3   dropoff_datetime             1361116 non-null  datetime64[ns]
 4   passenger_count              1361116 non-null  int64         
 5   pickup_longitude             1361116 non-null  float64       
 6   pickup_latitude              1361116 non-null  float64       
 7   dropoff_longitude            1361116 non-null  float64       
 8   dropoff_latitude             1361116 non-null  float64       
 9   store_and_fwd_flag           1361116 non-null  object        
 10  trip_duration                1361116 non-null  int64         
 11  trip_distan

In [24]:
# Peek at the first ten values of the log-duration column.
df['log_trip_duration'].head(10)

0     6.120297
1     6.496775
2     7.661056
3     6.061457
4     6.075346
6     5.831882
7     7.346655
8     5.541264
9     7.110696
10    7.149917
Name: log_trip_duration, dtype: float64

In [25]:
# Progress marker printed before the preprocessing stage.
print('Preprocessing data...')

Preprocessing data...


In [26]:
# Select features for X
# Numeric model inputs; each one is routed through `num_pipeline` below.
X_cols_num = [
    'trip_distance',
    'pickup_datetime_month',
    'pickup_datetime_date',
    'pickup_datetime_day_of_week',
    'pickup_datetime_hour',
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
    ]
# No categorical inputs are selected at the moment.
X_cols_cat = []
X_cols = X_cols_num + X_cols_cat

# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('median_imputer', SimpleImputer(strategy='median')),
        ('standard_scaler', StandardScaler()),
    ])

# Full preprocessing: keep only the feature columns, then transform them.
# The selector uses operator.itemgetter(X_cols), which evaluates frame[X_cols]
# just like the lambda it replaces. Unlike a lambda it can be pickled, so the
# fitted pipeline can be persisted, and it binds the column list when the
# pipeline is built instead of looking up the global on every call.
preprocessing_pl = Pipeline(
    steps=[
        ('selector', FunctionTransformer(operator.itemgetter(X_cols))),
        ('column_transformer', ColumnTransformer([('num', num_pipeline, X_cols_num),])),
    ])

# Fit on the engineered frame and produce the transformed feature matrix.
X = preprocessing_pl.fit_transform(df)

In [27]:
# Wrap the transformed matrix in a frame labelled with the feature names.
# The labels line up because X_cols_cat is empty, so X_cols equals the
# X_cols_num list that the column transformer emits.
df_for_model = pd.DataFrame(data=X, columns=X_cols)

In [28]:
# Inspect the model frame: one float column per selected feature.
df_for_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1361116 entries, 0 to 1361115
Data columns (total 9 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   trip_distance                1361116 non-null  float64
 1   pickup_datetime_month        1361116 non-null  float64
 2   pickup_datetime_date         1361116 non-null  float64
 3   pickup_datetime_day_of_week  1361116 non-null  float64
 4   pickup_datetime_hour         1361116 non-null  float64
 5   pickup_latitude              1361116 non-null  float64
 6   pickup_longitude             1361116 non-null  float64
 7   dropoff_latitude             1361116 non-null  float64
 8   dropoff_longitude            1361116 non-null  float64
dtypes: float64(9)
memory usage: 93.5 MB


In [35]:
# Attach the target by position, not by label: `df` keeps its original row
# labels (with gaps from the dropped rows) while `df_for_model` has a fresh
# RangeIndex, so assigning the Series directly would misalign the rows.
df_for_model['log_trip_duration'] = df['log_trip_duration'].to_numpy()

In [37]:
# Confirm the target column was added alongside the features.
df_for_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1361116 entries, 0 to 1361115
Data columns (total 10 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   trip_distance                1361116 non-null  float64
 1   pickup_datetime_month        1361116 non-null  float64
 2   pickup_datetime_date         1361116 non-null  float64
 3   pickup_datetime_day_of_week  1361116 non-null  float64
 4   pickup_datetime_hour         1361116 non-null  float64
 5   pickup_latitude              1361116 non-null  float64
 6   pickup_longitude             1361116 non-null  float64
 7   dropoff_latitude             1361116 non-null  float64
 8   dropoff_longitude            1361116 non-null  float64
 9   log_trip_duration            1361116 non-null  float64
dtypes: float64(10)
memory usage: 103.8 MB


In [4]:
# Scratch cell: build a one-row, one-column frame and display it.
# NOTE(review): `input_df` is not referenced by the other cells shown here.
input_df = pd.DataFrame({'a': [1]})

input_df.head()

Unnamed: 0,a
0,1


In [14]:
# Scratch arithmetic: floor-divide 150 by 11, then add 1 (evaluates to 14).
150//11 + 1 


14