In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
import math
import os
import sys
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from tempfile import NamedTemporaryFile
from zipfile import ZipFile
import tarfile
import shutil

Downloading bids16-machine-learning, 15050 bytes compressed
Downloaded and uncompressed: bids16-machine-learning
Data source import complete.


In [None]:
# Function to download Kaggle data
def download_data(data_source_mapping, kaggle_input_path='/kaggle/input', kaggle_working_path='/kaggle/working', chunk_size=40960):
    """Download each mapped archive and unpack it under ``kaggle_input_path``.

    Args:
        data_source_mapping: Comma-separated ``directory:url`` items; the URL
            part is percent-encoded and is decoded before being opened.
        kaggle_input_path: Root directory; every item is extracted into its
            own ``directory`` sub-folder below it.
        kaggle_working_path: Working directory; it is only created here.
        chunk_size: Number of bytes read from the response per iteration.

    Returns:
        None. Progress and failures are reported on stdout.

    ``HTTPError`` and ``OSError`` are reported and the affected item is
    skipped, so one broken link does not abort the remaining downloads.
    """
    os.makedirs(kaggle_input_path, exist_ok=True)
    os.makedirs(kaggle_working_path, exist_ok=True)

    for data_source_mapping_item in data_source_mapping.split(','):
        # Split on the first colon only so a URL containing ':' stays intact.
        directory, download_url_encoded = data_source_mapping_item.split(':', 1)
        download_url = unquote(download_url_encoded)
        filename = urlparse(download_url).path
        destination_path = os.path.join(kaggle_input_path, directory)
        try:
            with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
                total_length = fileres.headers['content-length']
                print(f'Downloading {directory}, {total_length} bytes compressed')
                # The header may be absent; only draw the bar when the size is known.
                expected_bytes = int(total_length) if total_length else 0
                dl = 0
                data = fileres.read(chunk_size)
                while len(data) > 0:
                    dl += len(data)
                    tfile.write(data)
                    if expected_bytes > 0:
                        done = min(50, int(50 * dl / expected_bytes))
                        sys.stdout.write(f"\r[{'=' * done}{' ' * (50 - done)}] {dl} bytes downloaded")
                    else:
                        sys.stdout.write(f"\r{dl} bytes downloaded")
                    sys.stdout.flush()
                    data = fileres.read(chunk_size)
                # Push buffered bytes to disk and rewind before the archive readers start.
                tfile.flush()
                tfile.seek(0)
                if filename.endswith('.zip'):
                    with ZipFile(tfile) as zfile:
                        zfile.extractall(destination_path)
                else:
                    # The handle must not be called `tarfile`: that would shadow
                    # the module inside this function and break `tarfile.open`.
                    with tarfile.open(fileobj=tfile) as tar_archive:
                        tar_archive.extractall(destination_path)
                print(f'\nDownloaded and uncompressed: {directory}')
        except HTTPError:
            print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
            continue
        except OSError:
            print(f'Failed to load {download_url} to path {destination_path}')
            continue

    print('Data source import complete.')


In [None]:
# Specify the data source mapping
# Format: '<directory>:<percent-encoded URL>' (comma-separated when there are several items),
# which is how download_data splits and decodes it.
# NOTE(review): the query string carries X-Goog-Date=20240228T100145Z and X-Goog-Expires=259200,
# so this looks like a time-limited signed link - confirm it still resolves before relying on it.
DATA_SOURCE_MAPPING = 'bids16-machine-learning:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F68885%2F7645460%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240228%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240228T100145Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D04fc00095c8ee62d15debb1245f8e18371d0e0dd151f0f30054d3cfc155ee407d7a56af1c4cc46bc0d97b838cf00948a70fe2a39dcec3c6feb0462f022c6f61c3178a2d40073e8f3b548a381ce59a93c663cd29bc58f70843818098a3fdce0790452a37518eb1dadafd695c7d42eff3c2d5e5f070643237408355965ab5b03684b1acd8ce99b8ddedcd1393c589ba2e2ea99e08bb05b5ec6789e683c7418a2935178eac93e3bc9cc29a06f84e1ea60ea8768437261406254feba5993076420593c875c740446c3fec619e64b723500a7879353980b81049a39ff6ecf217bf8e8c8205f6c2e32dc4c52fed8f711b26e74b8f7bec7bee2e8e6fa7ec57acfad1b28'
# Paths for Kaggle data input and working directories.
# They are defined before the download so that the same values drive both the
# download target and the later pd.read_csv calls, instead of relying on the
# function's defaults happening to match.
KAGGLE_INPUT_PATH = '/kaggle/input'
KAGGLE_WORKING_PATH = '/kaggle/working'

# Download and unpack the competition archive into KAGGLE_INPUT_PATH
download_data(DATA_SOURCE_MAPPING, KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH)

# Kaggle challenge Yoav Yosef

In [None]:
# Read the competition CSVs (train first, then test) from the extracted dataset folder
df_train, df_test = (
    pd.read_csv(os.path.join(KAGGLE_INPUT_PATH, 'bids16-machine-learning', csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Work on copies so the raw frames stay untouched by the feature engineering below
df_train_copy, df_test_copy = df_train.copy(), df_test.copy()

In [None]:
# The 'id' column is not used as a predictor; remove it from both working copies
df_train_copy = df_train_copy.drop('id', axis='columns')
df_test_copy = df_test_copy.drop('id', axis='columns')

In [None]:
# Show the first rows of the raw training frame (df_train, not the working copy)
df_train.head()

In [None]:
# Column groups handed to explore_data further down.
# Note that 'cnt' is listed among the continuous columns.
categorical_features = [
    'season',
    'mnth',
    'weekday',
    'workingday',
    'weathersit',
    'holiday',
]
continuous_features = [
    'cnt',
    'temp',
    'atemp',
    'hum',
    'windspeed',
]

In [None]:
# Check for missing values
def check_missing_values(df, title):
    """Print the number of null entries per column of ``df`` under a heading that names ``title``."""
    null_counts = df.isnull().sum()
    print(f"Missing values in {title}:\n{null_counts}\n")

In [None]:
# Handle missing values by filling with the median
def handle_missing_values(df, title):
    """Fill missing values in ``df`` in place and report what is left.

    Numeric columns are filled with their median. Non-numeric columns have no
    median (calling it raises), so they are filled with their most frequent
    value instead. A column that holds no usable value at all is left as is.

    Args:
        df: DataFrame to clean; it is modified in place.
        title: Label passed on to ``check_missing_values`` for the report.

    Returns:
        The same DataFrame object, after filling.
    """
    for column in df.columns:
        values = df[column]
        if not values.isnull().any():
            continue
        if pd.api.types.is_numeric_dtype(values):
            fill_value = values.median()
        else:
            most_frequent = values.mode(dropna=True)
            if most_frequent.empty:
                # Nothing but nulls: there is no value to fill with.
                continue
            fill_value = most_frequent.iloc[0]
        df[column] = values.fillna(fill_value)
    check_missing_values(df, title)
    return df


Index(['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit',
       'temp', 'atemp', 'hum', 'windspeed', 'cnt'],
      dtype='object')

In [None]:
# Apply the check and handle missing values for train and test sets.
# Each frame is reported once before filling; handle_missing_values then fills it
# and prints the report again, so every frame produces two reports.
check_missing_values(df_train_copy, 'training set')
df_train_copy = handle_missing_values(df_train_copy, 'training set')
check_missing_values(df_test_copy, 'test set')
df_test_copy = handle_missing_values(df_test_copy, 'test set')

In [None]:
# Explore data descriptively and visually
def explore_data(df, cat_features, num_features):
    """Print summary statistics and draw exploratory plots for ``df``.

    Args:
        df: DataFrame to explore; it must contain a 'cnt' column for the bar plots.
        cat_features: Column names drawn one by one as bar plots against 'cnt'.
        num_features: Column names used for the summary table and the pair plot.
    """
    numeric_view = df[num_features]
    print("Descriptive statistics for continuous features:\n", numeric_view.describe().T)

    # Pairwise scatter plots of the continuous columns
    sns.pairplot(numeric_view)
    plt.show()

    # Heatmap of the correlations between all columns of the frame
    correlations = df.corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlations, annot=True, cmap="coolwarm")
    plt.title("Correlation Matrix")
    plt.show()

    # One bar plot of 'cnt' per categorical column
    for feature_name in cat_features:
        sns.catplot(x=feature_name, y='cnt', data=df, kind='bar')
        plt.title(f'{feature_name} vs cnt')
        plt.show()


# Explore the working copy of the training data using the column groups defined earlier
explore_data(df_train_copy, categorical_features, continuous_features)


In [None]:
# Function to calculate wind chill index (WCI) / wind chill factor (WCF)
def wind_chill_index(temp, windspeed, temp_scale=10, wind_scale=10):
    """Compute a rescaled wind chill value from temperature and wind speed.

    Both inputs are multiplied by their scale factor, combined with the
    polynomial below, and the result is divided by ``temp_scale * wind_scale``.

    NOTE(review): the coefficients look like the US NWS wind chill formula
    (deg F / mph) - confirm that the default scales produce those units.

    Args:
        temp: Temperature value(s); a scalar or an array-like.
        windspeed: Wind speed value(s); a scalar or an array-like.
        temp_scale: Multiplier applied to ``temp`` before the formula.
        wind_scale: Multiplier applied to ``windspeed`` before the formula.

    Returns:
        The wind chill value(s) divided by ``temp_scale * wind_scale``.
    """
    t = temp * temp_scale
    wind_term = (windspeed * wind_scale) ** 0.16
    raw_chill = 35.74 + 0.6215 * t - 35.75 * wind_term + 0.4275 * t * wind_term
    return raw_chill / (temp_scale * wind_scale)

In [None]:
# Function to categorize temperature based on season mean.
def categorize_temperature_by_season_simple(temp, season, temp_mean, cold_factor=0.1, hot_factor=0.9):
    """Label a temperature relative to a reference mean.

    The cut-offs are fractions of ``temp_mean``, not percentiles: with the
    defaults, anything below 10% of the mean is cold and anything above 90%
    of the mean is hot.

    Args:
        temp: Temperature to classify.
        season: Accepted so a (temp, season, mean) row can be unpacked into
            the call; it is not used in the comparison.
        temp_mean: Reference mean temperature.
        cold_factor: Fraction of ``temp_mean`` below which ``temp`` is cold.
        hot_factor: Fraction of ``temp_mean`` above which ``temp`` is hot.

    Returns:
        "0" for cold, "2" for hot, "1" otherwise (as strings).
    """
    if temp < temp_mean * cold_factor:
        return "0"  # Cold
    if temp > temp_mean * hot_factor:
        return "2"  # Hot
    return "1"  # Normal

In [None]:
# Function to add quarter
def add_quarter(df, month_col, quarter_col):
    """Write the quarter (1-4) derived from the month column into ``df[quarter_col]``, in place."""
    zero_based_month = df[month_col].sub(1)
    df[quarter_col] = zero_based_month.floordiv(3).add(1)

In [None]:
# Function to add week within a month
def add_week_within_month(df, month_col, day_col, week_col, year=2024):
    """Add the Monday-based week of the month for each row, in place.

    The week is ``ceil((weekday of the 1st + day of month) / 7)``, so a day
    stays in the same week as the Monday before it. The result is 1 to 6.

    Args:
        df: DataFrame to extend; it is modified in place.
        month_col: Name of the column holding the month number (1-12).
        day_col: Name of the column holding the 1-based day of the month.
        week_col: Name of the column to create.
        year: Calendar year used to look up the weekday of each month's first day.
    """
    months = df[month_col].astype(int)
    days = df[day_col].astype(int)
    # Look the first-day weekday up once per distinct month instead of once per row.
    first_weekday_by_month = {
        int(month): pd.Timestamp(year=year, month=int(month), day=1).dayofweek
        for month in months.unique()
    }
    first_weekday = months.map(first_weekday_by_month)
    # Integer ceiling division: (a + 6) // 7 equals ceil(a / 7) for a >= 1.
    df[week_col] = (first_weekday + days + 6) // 7

In [None]:
# Mean temperature per season and per weather situation, computed on the training data only
season_mean = df_train_copy.groupby('season')['temp'].mean()
weathersit_mean = df_train_copy.groupby('weathersit')['temp'].mean()

# Attach the training-set means to both frames through their season / weathersit values
df_train_copy = df_train_copy.assign(
    temp_mean_by_season=df_train_copy['season'].map(season_mean),
    temp_mean_by_weathersit=df_train_copy['weathersit'].map(weathersit_mean),
)
df_test_copy = df_test_copy.assign(
    temp_mean_by_season=df_test_copy['season'].map(season_mean),
    temp_mean_by_weathersit=df_test_copy['weathersit'].map(weathersit_mean),
)

In [None]:
# Inspect the column names and dtypes of the training copy
print(df_train_copy.columns)
print(df_train_copy.dtypes)

In [None]:
# Derived weather features, built identically for the training and the test frame
def _add_weather_features(frame):
    """Add the engineered temperature, humidity and wind columns to ``frame`` in place."""
    temp, atemp, hum = frame['temp'], frame['atemp'], frame['hum']

    # Interactions between 'temp' and 'atemp'
    frame['feels_like_diff'] = temp - atemp
    frame['feels_like_ratio'] = temp / atemp

    # Interaction and ratio between 'temp' and 'hum'
    frame['temp_hum_interaction'] = temp * hum
    frame['hum_temp_ratio'] = hum / temp

    # Wind chill
    frame['windchill_index'] = wind_chill_index(temp, frame['windspeed'])

    # Log transform of 'temp'
    frame['temp_log'] = np.log(temp)

    # Temperature category relative to the season mean
    frame['hot_cold_for_season'] = frame[['temp', 'season', 'temp_mean_by_season']].apply(
        lambda row: categorize_temperature_by_season_simple(*row), axis=1
    )


_add_weather_features(df_train_copy)
_add_weather_features(df_test_copy)

In [None]:
# Calculate the week within the month and add the quarter.
# Both helpers index the frame with the given column *names*; passing the
# Series themselves makes pandas look the values up as labels and fail.
# NOTE(review): 'weekday' is passed as the day column although
# add_week_within_month describes it as a day within the month - confirm intent.
add_week_within_month(df_train_copy, 'mnth', 'weekday', 'week_within_month')
add_quarter(df_train_copy, 'mnth', 'quarter')
add_week_within_month(df_test_copy, 'mnth', 'weekday', 'week_within_month')
add_quarter(df_test_copy, 'mnth', 'quarter')

In [None]:
# Remove 'holiday' and 'atemp' from both frames
df_train_copy = df_train_copy.drop(['holiday', 'atemp'], axis='columns')
df_test_copy = df_test_copy.drop(['holiday', 'atemp'], axis='columns')


In [None]:
# Preprocess data
def preprocess_data(train_df, test_df):
    """Build the model matrices for the training and the test frame.

    The continuous columns are taken over unchanged and 'season' / 'mnth' are
    one-hot encoded as 0/1 integers. The test dummies are re-indexed to the
    training dummies, so both results share the same encoded columns even when
    a category is missing from (or only present in) the test data.

    Args:
        train_df: Training frame; must contain 'cnt', the continuous columns
            listed below, 'season' and 'mnth'.
        test_df: Test frame with the same columns except 'cnt'.

    Returns:
        ``(train_prepared, test_prepared)``; ``train_prepared`` additionally
        holds the 'cnt' column.
    """
    # Columns to be one-hot encoded
    one_hot_features = ['season', 'mnth']
    continuous_features = ['cnt', 'temp', 'feels_like_diff', 'feels_like_ratio', 'temp_hum_interaction', 'hum_temp_ratio',
                           'windchill_index', 'temp_log']
    # The test frame has no target column
    test_continuous_features = [feature for feature in continuous_features if feature != 'cnt']

    # One-hot encode and convert the boolean dummies to integers
    train_encoded = pd.get_dummies(train_df[one_hot_features], columns=one_hot_features).astype(int)
    test_encoded = pd.get_dummies(test_df[one_hot_features], columns=one_hot_features).astype(int)

    # Categories absent from the test data become all-zero columns; categories
    # unseen during training are dropped because the models never saw them.
    test_encoded = test_encoded.reindex(columns=train_encoded.columns, fill_value=0)

    # Put the continuous and the encoded parts side by side
    train_prepared = pd.concat([train_df[continuous_features], train_encoded], axis=1)
    test_prepared = pd.concat([test_df[test_continuous_features], test_encoded], axis=1)

    return train_prepared, test_prepared

# Build the model matrices from the engineered training and test frames
df_train_prepared, df_test_prepared = preprocess_data(df_train_copy, df_test_copy)


In [None]:
# Label Encoding 'weathersit'
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
# Fit on the training data only and reuse that mapping for the test data.
# Re-fitting on the test set could assign different codes to the same category
# whenever the two sets do not contain exactly the same values.
train_label_encode = encoder.fit_transform(df_train_copy['weathersit'])
test_label_encode = encoder.transform(df_test_copy['weathersit'])
df_train_prepared['label_encode_weathersit'] = train_label_encode
df_test_prepared['label_encode_weathersit'] = test_label_encode

# Split into training features and training target
X_train = df_train_prepared.drop(columns=['cnt'])
y_train = df_train_prepared['cnt']

In [None]:
# RMSLE function
def rmsle(y, y_pred):
    """Root mean squared logarithmic error between ``y`` and ``y_pred``.

    Every value gets 1e-9 added and is then clamped to a minimum of 1 before
    the logarithm is taken, so values below 1 contribute ``log(1) == 0``.
    NOTE(review): this differs from the common ``log(1 + x)`` definition of
    RMSLE - confirm it matches the metric you want to report.

    The two inputs are paired by position, so a pandas Series whose index does
    not start at 0 (for example after a train/test split) works as well.

    Args:
        y: Actual values (list, array or Series).
        y_pred: Predicted values of the same length.

    Returns:
        The error as a float.

    Raises:
        AssertionError: If the inputs differ in length.
    """
    assert len(y) == len(y_pred)
    squared_log_errors = [
        (math.log(max(predicted + 1e-9, 1)) - math.log(max(actual + 1e-9, 1))) ** 2.0
        for actual, predicted in zip(y, y_pred)
    ]
    return (sum(squared_log_errors) * (1.0 / len(y))) ** 0.5

In [None]:
# Function to train OLS model
def train_ols(X_train, y_train, X_test):
    """Fit a statsmodels OLS regression on the training data and return its predictions for ``X_test``."""
    fitted_ols = sm.OLS(y_train, X_train).fit()
    return fitted_ols.predict(X_test)

# Function to train a Decision Tree model
def train_decision_tree(X_train, y_train, X_test):
    """Fit a DecisionTreeRegressor (random_state=42) and return its predictions for ``X_test``."""
    tree = DecisionTreeRegressor(random_state=42)
    tree.fit(X_train, y_train)
    return tree.predict(X_test)

# Function to train a Random Forest model
def train_random_forest(X_train, y_train, X_test):
    """Fit a RandomForestRegressor (random_state=42) and return its predictions for ``X_test``."""
    forest = RandomForestRegressor(random_state=42)
    forest.fit(X_train, y_train)
    return forest.predict(X_test)

# Function to train an AdaBoost model
def train_adaboost(X_train, y_train, X_test):
    """Fit an AdaBoostRegressor over depth-5 decision trees and return its predictions for ``X_test``."""
    booster = AdaBoostRegressor(
        DecisionTreeRegressor(max_depth=5),
        random_state=42,
        n_estimators=100,
        learning_rate=0.1,
    )
    booster.fit(X_train, y_train)
    return booster.predict(X_test)


# Function to train a XGBoost model
def train_xgboost(X_train, y_train, X_test):
    """Fit an XGBRegressor with fixed hyper-parameters.

    Returns:
        ``(model, predictions)``: the fitted regressor and its predictions for ``X_test``.
    """
    best_params = {
        'colsample_bytree': 1,
        'gamma': 0,
        'learning_rate': 0.1,
        'max_depth': 3,
        'n_estimators': 100,
        'subsample': 0.7,
    }
    regressor = XGBRegressor(**best_params, random_state=42)
    regressor.fit(X_train, y_train)
    return regressor, regressor.predict(X_test)

In [None]:
# Train and evaluate different models
# Each helper fits on (X_train, y_train) and returns predictions for the prepared test frame;
# no score is computed in this cell.
pred_ols_test = train_ols(X_train, y_train, df_test_prepared)
pred_dt_test = train_decision_tree(X_train, y_train, df_test_prepared)
pred_rf_test = train_random_forest(X_train, y_train, df_test_prepared)
pred_ab_test = train_adaboost(X_train, y_train, df_test_prepared)
# train_xgboost also returns the fitted model, which is kept for the feature-importance plot
xgboost_model_1, pred_xgb_test_1 = train_xgboost(X_train, y_train, df_test_prepared)

In [None]:
# Stacked model
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Base learners for the stacked ensemble, configured like the stand-alone models above
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    gamma=0,
    subsample=0.7,
    colsample_bytree=1,
    random_state=42,
)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
ada_model = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=5),
    random_state=42,
    n_estimators=100,
    learning_rate=0.1,
)

In [None]:
# Meta-learner that combines the predictions of the base models
final_estimator = LinearRegression()
# (name, estimator) pairs fed into the stacking ensemble
stacked_estimators = [('xgb', xgb_model), ('rf', rf_model), ('ada', ada_model)]
stacked_model = StackingRegressor(
    estimators=stacked_estimators,
    final_estimator=final_estimator,
)
# Fit the ensemble on the training data, then predict the prepared test frame
stacked_model.fit(X_train, y_train)
predictions_stacked = stacked_model.predict(df_test_prepared)


In [None]:
# The stacked model is already defined, fitted and used for prediction in the
# previous cell. This cell used to repeat that work verbatim, which only
# re-trained the whole ensemble a second time. It now checks the result instead.
assert len(predictions_stacked) == len(df_test_prepared), \
    "stacked predictions must contain exactly one value per test row"


In [None]:
# Submission frame: the original test ids next to the stacked predictions
submission_df = df_test[['id']].assign(cnt=predictions_stacked)

In [None]:
# Show feature importances
# NOTE(review): no definition of plot_feature_importance is visible in this
# notebook, so it is defined here; drop this definition if a shared one exists.
def plot_feature_importance(model, feature_names):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Names of the columns the model was trained on, in training order.
    """
    importances = pd.Series(model.feature_importances_, index=list(feature_names)).sort_values()
    fig, ax = plt.subplots(figsize=(8, max(4, 0.3 * len(importances))))
    importances.plot.barh(ax=ax)
    ax.set(title='Feature importances', xlabel='Importance', ylabel='Feature')
    plt.show()


print("XGBoost feature importances")
# Use the columns the model was trained on: df_train_prepared still contains the
# target 'cnt', which would not line up with the importances.
plot_feature_importance(xgboost_model_1, X_train.columns)

In [None]:
# Save the submission file
#submission_df.to_csv('/kaggle/working/submission_yoav_stacked.csv', index=False)

# Show result
#submission_df