# Kaggle - Bike Sharing Demands - Baseline
**Author: Chris Shin**

In [2]:
# Standard Library imports
from datetime import datetime
import calendar

# Third-Party Library imports
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# Load the competition files: labelled training rows, unlabelled test rows,
# and the submission template whose 'count' column is filled in later.
train, test, submission = map(
    pd.read_csv,
    ('./data/train.csv', './data/test.csv', './data/sampleSubmission.csv'),
)

# Feature Engineering

It's generally better to fit any data-dependent feature engineering steps on the training set only, and then apply those same fitted steps to the test set, in order to avoid data leakage. Data leakage occurs when information from the test set is used to create features or make decisions about the model. This can lead to overfitting and poor performance on new, unseen data.

When you engineer features on the entire dataset, you risk incorporating information from the test set into your model, which can lead to over-optimistic evaluation metrics. The model may then perform poorly on genuinely new data, because the information it borrowed from the test set is not available for data it has never seen.

Therefore, it's recommended to perform feature engineering on the training set first, and then transform the test set using the same transformations applied to the training set. This ensures that your model hasn't been exposed to information from the test set during training and provides a more accurate evaluation of its performance.

In [4]:
def feature_engineer(data):
    """Return a model-ready copy of a bike-sharing frame with one-hot encoded categoricals.

    Steps, in order:
      * weather category 4 is merged into category 3;
      * 'datetime' is parsed once and expanded into integer 'year', 'month',
        'day', 'hour' columns plus an English 'weekday' name;
      * 'season' codes 1-4 are mapped to 'Spring', 'Summer', 'Fall', 'Winter'
        (any other code becomes NaN);
      * 'season' and 'weekday' are replaced by one-hot indicator columns;
      * 'casual', 'registered', 'datetime' and 'windspeed' are dropped when present.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'datetime', 'season' and 'weather' columns. 'datetime'
        may hold strings or already-parsed timestamps. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the same index as ``data``.
    """
    df = data.copy()
    df['weather'] = df['weather'].replace(4, 3)

    # Parse the timestamp once instead of re-splitting the string for every field.
    timestamps = pd.to_datetime(df['datetime'])
    df['year'] = timestamps.dt.year.astype(int)
    df['month'] = timestamps.dt.month.astype(int)
    df['day'] = timestamps.dt.day.astype(int)
    df['hour'] = timestamps.dt.hour.astype(int)
    df['weekday'] = timestamps.dt.day_name()
    df['season'] = df['season'].map({1: 'Spring',
                                     2: 'Summer',
                                     3: 'Fall',
                                     4: 'Winter'})

    categorical_cols = ['season', 'weekday']
    # Fixing the category lists (alphabetical, i.e. the order a data-driven fit
    # produces) makes every call emit the same indicator columns, so train and
    # test frames line up even if one of them lacks a season or weekday.
    encoder = OneHotEncoder(
        categories=[
            ['Fall', 'Spring', 'Summer', 'Winter'],
            ['Friday', 'Monday', 'Saturday', 'Sunday',
             'Thursday', 'Tuesday', 'Wednesday'],
        ],
        handle_unknown='ignore',
    )

    # The encoder's default output is a SciPy sparse matrix; densify it here
    # rather than passing `sparse=False`, which newer scikit-learn releases
    # no longer accept.
    encoded_values = encoder.fit_transform(df[categorical_cols]).toarray()
    feature_names = encoder.get_feature_names_out(input_features=categorical_cols)
    # Reuse df's index so the column-wise concat below aligns row for row even
    # when the caller's frame does not have a default 0..n-1 index.
    df_encoded = pd.DataFrame(encoded_values, columns=feature_names, index=df.index)

    # Swap the raw categorical columns for their encoded counterparts.
    df = df.drop(columns=categorical_cols)
    df = pd.concat([df, df_encoded], axis=1)

    # 'casual' and 'registered' are absent from the test file, hence errors='ignore'.
    drop_features = ['casual', 'registered', 'datetime', 'windspeed']
    df = df.drop(columns=drop_features, errors='ignore')

    return df

### `A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead` warning was showing even though `.loc` was used to set the value, but it disappeared after using the `.copy()` function. Why?

The reason why the warning disappeared is that by using .copy(), you created a new copy of the original DataFrame, rather than a view of the original data. This means that any modifications made to the new copy will not affect the original data.

When you make a slice of a DataFrame, like df['datetime'] in your example, pandas may return either a view of the original data or a new copy, and it cannot always tell which one it produced. If you then modify the slice, the change may or may not reach the original data, depending on the context. This can lead to unexpected behavior, so Pandas gives a warning to alert you to the potential issue.

By creating a new copy with .copy(), you avoid modifying the original data and therefore avoid the warning.

##### `One-Hot-Encoding vs get_dummies`

Both one hot encoding and get_dummies are techniques used to convert categorical variables into numerical representations that can be used in machine learning models.

The main difference between them is that one hot encoding is a general term for encoding categorical variables with multiple levels, while get_dummies is a specific function in the Pandas library that creates dummy variables for each level of a categorical variable.

In one hot encoding, a binary vector is created for each category of a categorical variable. The vector has a length equal to the number of unique categories in the variable, and each element is either 0 or 1, representing whether or not the corresponding category is present in the observation.

On the other hand, get_dummies function creates a binary column for each category in the input variable. The resulting dataframe has a binary column for each category, where a 1 represents the presence of the category in that observation, and 0 represents the absence.

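Both techniques have their advantages and disadvantages, and the choice between them depends on the specific problem at hand. scikit-learn's `OneHotEncoder` can return a sparse matrix, which is more memory-efficient and suitable for large datasets with many unique categories, although many categories still create a large number of features that may lead to overfitting. It also remembers the categories it saw during `fit`, so the same columns are produced when it is later applied to new data. On the other hand, `get_dummies` returns a labelled DataFrame, which is more interpretable and may be more suitable for smaller datasets, but it is stateless: applied separately to the training and test sets, it can produce different columns if a category is missing from one of them.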

In [5]:
# Engineer features for both splits; the target column only exists in train
# and must not be part of the predictors.
X_train = feature_engineer(train).drop(columns='count')
X_test = feature_engineer(test)
y_train = train['count']

# feature_engineer runs on each frame independently, so confirm both ended up
# with identical columns in identical order before fitting and predicting.
assert list(X_train.columns) == list(X_test.columns), (
    "train/test feature columns differ: "
    f"{sorted(set(X_train.columns) ^ set(X_test.columns))}"
)



In [6]:
# Sanity check: list every engineered training row that contains a missing value.
X_train.loc[X_train.isna().any(axis=1)]

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,year,month,day,hour,...,season_Spring,season_Summer,season_Winter,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday


In [7]:
def rmsle(y_true, y_pred, convertExp=True):
    """Root mean squared logarithmic error between two sets of values.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values (lists, NumPy arrays or pandas Series).
    convertExp : bool, default True
        When True both inputs are treated as natural-log values and are
        exponentiated before the error is computed. Pass False when the
        inputs are already on the original scale.

    Returns
    -------
    numpy.float64
        sqrt(mean((log(1 + y_true) - log(1 + y_pred)) ** 2)).
    """
    # Coerce up front so plain Python lists work on both code paths.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    if convertExp:
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)

    # log1p(x) == log(1 + x) but keeps precision for small x. nan_to_num is
    # retained from the original so NaN terms (e.g. from values below -1)
    # contribute 0 instead of turning the whole score into NaN.
    log_true = np.nan_to_num(np.log1p(y_true))
    log_pred = np.nan_to_num(np.log1p(y_pred))

    return np.sqrt(np.mean((log_true - log_pred) ** 2))

In [8]:
# Train the baseline on the natural log of the target rather than on raw counts.
log_y_train = np.log(y_train)
linear_reg = LinearRegression().fit(X_train, log_y_train)

In [9]:
# In-sample predictions on the training features; the model was fitted to
# log_y_train, so these values are on the log scale.
preds = linear_reg.predict(X_train)

In [10]:
# Training-set score; both arguments are log-scale, so rmsle exponentiates them first.
train_rmsle = rmsle(log_y_train, preds, True)
print(f"Linear REgression RMSLE: {train_rmsle:.4f}")

Linear REgression RMSLE: 1.0122


In [11]:
# Predict for the engineered test features; the model was trained on the log
# of the target, so its output is on the log scale.
linearreg_preds = linear_reg.predict(X_test)
# Undo the log transform to get back to counts before filling the template.
submission['count'] = np.exp(linearreg_preds)
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

# Using pipeline

1. First, we create a `Pipeline` object called `feat_eng_pipe` that will encapsulate the entire feature engineering process.

2. The pipeline consists of two steps, each defined as a tuple with two elements: the name of the step and the corresponding estimator/transformer object.

3. The first step is named `feat_eng` and it uses the `FunctionTransformer` object to apply the `feature_engineer` function to the input data. This function takes a dataframe and performs various transformations on its columns, such as splitting the datetime column into year, month, day, and hour columns, replacing the 'weather' column value of 4 with 3, mapping the 'season' column to categorical values, and dropping some columns that are not relevant for our analysis.

4. The second step is named `encoder` and it uses the `ColumnTransformer` object to apply one-hot encoding to the categorical columns 'season' and 'weekday'. One-hot encoding is a technique used to transform categorical data into a numerical format that can be used in machine learning models. The `ColumnTransformer` object takes a list of transformers, each defined as a tuple with three elements: the name of the transformer, the estimator/transformer object, and the list of column names to be transformed. In this case, we only have one transformer, which is a `OneHotEncoder` object that ignores unknown categories and returns a dense matrix. The `remainder` parameter is set to 'passthrough', which means that any remaining columns not specified in the list of transformers should be passed through without any changes.

5. The `feat_eng_pipe` pipeline is now ready to be used for feature engineering on a dataset. We can pass the input data to the pipeline using the `fit_transform` method, which applies the transformations defined in the pipeline sequentially. The output of the pipeline will be a numpy array with the transformed features.

In [19]:
# Define a function to perform some feature engineering on the input data
def feature_engineer(data):
    """Derive calendar features and tidy categorical columns; no encoding is done here.

    NOTE: this redefines the earlier ``feature_engineer``. Unlike that version
    it leaves 'season' and 'weekday' as strings, because one-hot encoding is
    delegated to the ColumnTransformer step of the pipeline.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'datetime', 'season' and 'weather' columns. 'datetime'
        may hold strings or already-parsed timestamps. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with weather 4 merged into 3, integer 'year', 'month',
        'day', 'hour' columns and an English 'weekday' name appended, 'season'
        mapped to its name (unmapped codes become NaN), and 'casual',
        'registered', 'datetime', 'windspeed' removed when present.
    """
    # Make a copy of the input data to avoid modifying the original data
    df = data.copy()
    # Replace any instances of 4 in the 'weather' column with 3
    df['weather'] = df['weather'].replace(4, 3)

    # Parse the timestamp once; the .dt accessors below are vectorised and also
    # work when the column has already been converted to datetimes.
    timestamps = pd.to_datetime(df['datetime'])
    df['year'] = timestamps.dt.year.astype(int)
    df['month'] = timestamps.dt.month.astype(int)
    df['day'] = timestamps.dt.day.astype(int)
    df['hour'] = timestamps.dt.hour.astype(int)
    # day_name() yields English names regardless of the process locale
    df['weekday'] = timestamps.dt.day_name()

    # Map the values in the 'season' column from integers to corresponding season names
    df['season'] = df['season'].map({1: 'Spring',
                                     2: 'Summer',
                                     3: 'Fall',
                                     4: 'Winter'})

    # Drop unnecessary columns; errors='ignore' skips any that are absent
    # (for example when only a subset of these columns was supplied).
    drop_features = ['casual', 'registered', 'datetime', 'windspeed']
    df = df.drop(columns=drop_features, errors='ignore')
    # Return the modified DataFrame
    return df

# Define the categorical columns to be one-hot encoded
categorical_cols = ['season', 'weekday']

# Create a pipeline for feature engineering using FunctionTransformer and ColumnTransformer
feat_eng_pipe = Pipeline([
    ('feat_eng', FunctionTransformer(feature_engineer)), # use feature_engineer function to engineer data
    ('encoder', ColumnTransformer(
        [
            # one-hot encode categorical columns; categories unseen at fit time encode as all zeros
            ('one_hot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ],
        remainder='passthrough', # passthrough any other columns not specified in the ColumnTransformer
        # Always return a dense array. This replaces `sparse=False` on the
        # encoder, a keyword that newer scikit-learn releases no longer accept.
        sparse_threshold=0,
    )),
])

# Create a pipeline for modeling using LinearRegression
model_pipe = Pipeline([
    ('linear_reg', LinearRegression()) # linear regression model
])

# Combine the feature engineering and modeling pipelines into a single pipeline using Pipeline
pipeline = Pipeline([
    ('feat_eng_pipe', feat_eng_pipe), # feature engineering pipeline
    ('model_pipe', model_pipe) # modeling pipeline
])

# Split the input data into the predictor variables (X_train) and target variable (y_train).
# X_train now holds the raw training columns: feature engineering happens inside the pipeline.
X_train = train.drop(columns=['count'])
y_train = train['count']
# Transform the target variable by taking the natural logarithm
log_y_train = np.log(y_train)

# Fit the entire pipeline (feature engineering + model) to the training data
pipeline.fit(X_train, log_y_train)



In [20]:
# Run the feature-engineering steps on their own to inspect what the model receives,
# and fetch the output column names from the 'encoder' step.
X_train_transformed = feat_eng_pipe.transform(X_train)
transformed_col_names = feat_eng_pipe['encoder'].get_feature_names_out()


In [21]:
# Wrap the transformed array in a labelled frame. Reusing X_train's index keeps
# every row traceable to its source row in `train` even if that index is not 0..n-1.
X_train_transformed_df = pd.DataFrame(
    X_train_transformed,
    columns=transformed_col_names,
    index=X_train.index,
)
X_train_transformed_df

Unnamed: 0,one_hot__season_Fall,one_hot__season_Spring,one_hot__season_Summer,one_hot__season_Winter,one_hot__weekday_Friday,one_hot__weekday_Monday,one_hot__weekday_Saturday,one_hot__weekday_Sunday,one_hot__weekday_Thursday,one_hot__weekday_Tuesday,...,remainder__holiday,remainder__workingday,remainder__weather,remainder__temp,remainder__atemp,remainder__humidity,remainder__year,remainder__month,remainder__day,remainder__hour
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,9.84,14.395,81.0,2011.0,1.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,9.02,13.635,80.0,2011.0,1.0,1.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,9.02,13.635,80.0,2011.0,1.0,1.0,2.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,9.84,14.395,75.0,2011.0,1.0,1.0,3.0
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,9.84,14.395,75.0,2011.0,1.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,15.58,19.695,50.0,2012.0,12.0,19.0,19.0
10882,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,14.76,17.425,57.0,2012.0,12.0,19.0,20.0
10883,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,13.94,15.910,61.0,2012.0,12.0,19.0,21.0
10884,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,13.94,17.425,61.0,2012.0,12.0,19.0,22.0


In [22]:
# In-sample check of the full pipeline; predictions and target are both log-scale,
# so rmsle is asked to exponentiate them before scoring.
preds = pipeline.predict(X_train)
pipeline_train_rmsle = rmsle(log_y_train, preds, True)
print(f"Linear Regression RMSLE: {pipeline_train_rmsle:.4f}")

Linear Regression RMSLE: 1.0127


In [14]:
# Wrap rmsle as a scikit-learn scorer. greater_is_better=False makes the scorer
# return the negated metric, because scikit-learn always maximises scores.
scorer = make_scorer(rmsle, greater_is_better=False)

# 5-fold cross-validation of the whole pipeline; flip the sign back so the
# values read as ordinary (positive) RMSLE.
scores = -cross_val_score(
    pipeline,
    X_train,
    log_y_train,
    scoring=scorer,
    cv=5,
)

# Report the per-fold scores together with their mean and spread
print("RMSLE scores: ", scores)
print("Mean RMSLE: ", scores.mean())
print("RMSLE standard deviation: ", scores.std())



RMSLE scores:  [1.05722445 0.94580409 1.09784306 1.03996723 0.99754961]
Mean RMSLE:  1.0276776875285407
RMSLE standard deviation:  0.05207343002492441


