# Short notebook 2

**[group 94] Gosling slayers**  
Andrea Grimsdatter Stallvik. Student number: 528429  
Taheera Ahmed. Student number: 491658

This notebook generated the submission with the filename `11_nov_1055.csv`, which received a public score of 146.844926 on the leaderboard.

In this attempt we use the hourly mean of every feature, plus hourly min/max values for selected features, during data processing. We also add some interaction features, add 24-hour rolling averages for some features, and drop some columns.

*Note:* The packages and version numbers used for this notebook are:  
pandas==1.5.3  
numpy==1.26.0  
scikit-learn==1.2.2  
xgboost==1.7.6  
catboost==1.1.1  
seaborn==0.12.2  
matplotlib==3.8.0  
notebook==7.0.4  

With python version 3.10.7

## Function for data loading / data processing 

In [None]:
"""
Helper functions for:
- loading data
- parsing submission
- ...

NOTE: all functions in this file should be pasted into the long notebook before submission.
"""

import pandas as pd
import numpy as np
import os

def check_file_exists(file_path):
    """Raise ``FileNotFoundError`` when nothing exists at ``file_path``.

    Returns ``None`` when the path exists.
    """
    if os.path.exists(file_path):
        return
    raise FileNotFoundError(f"File {file_path} does not exist.")
    

def load_data(mean=False, mean_stats = False, roll_avg=False, remove_out=False):
    """Load the training data for locations A, B and C.

    For each location the observed and the estimated weather features are
    read, brought to hourly resolution, tagged with ``estimated_flag``
    (0 = observed, 1 = estimated), inner-joined with the targets on ``time``
    and stacked. Rows without a ``pv_measurement`` are dropped.

    Args:
        mean: bring the features to hourly resolution with ``get_hourly_mean``.
        mean_stats: use ``get_hourly_stats`` instead; only consulted when
            ``mean`` is false. When both are false ``get_hourly`` is used.
        roll_avg: when true, run ``rolling_average`` on every location.
        remove_out: when true, run ``remove_ouliers`` on every location
            (location B additionally with ``remove_b_outliers=True``).

    Returns:
        The tuple ``(data_a, data_b, data_c)``.

    Raises:
        FileNotFoundError: via ``check_file_exists`` if an input file is missing.
    """
    locations = ('A', 'B', 'C')

    # --- fail early if any input file is missing ---
    for file_name in ('train_targets', 'X_train_estimated', 'X_train_observed'):
        for loc in locations:
            check_file_exists(f'data/{loc}/{file_name}.parquet')

    # --- choose how the features are brought to one row per hour ---
    if mean:
        to_hourly = get_hourly_mean
    elif mean_stats:
        to_hourly = get_hourly_stats
    else:
        to_hourly = get_hourly

    datasets = []
    for loc in locations:
        targets = pd.read_parquet(f'data/{loc}/train_targets.parquet')

        # observed features first, estimated second; the flag records the origin
        merged_parts = []
        for source, flag in (('observed', 0), ('estimated', 1)):
            features = pd.read_parquet(f'data/{loc}/X_train_{source}.parquet').rename(columns={'date_forecast': 'time'})
            features = to_hourly(features)
            features.rename(columns={"time_hour": "time"}, inplace=True)
            features["estimated_flag"] = flag

            # line the features up with the targets on the hourly time stamp
            merged_parts.append(pd.merge(targets, features, on='time', how='inner'))

        data = pd.concat(merged_parts, axis=0, ignore_index=True)

        # Rows without a target are useless for training.
        # NOTE (from the original authors): the observed part has 4 such rows
        # for location B and 6059 for location C.
        data = data.dropna(subset=['pv_measurement'])

        # add columns for rolling average
        if roll_avg:
            data = rolling_average(data)

        if remove_out:
            data = remove_ouliers(data, remove_b_outliers=(loc == 'B'))

        datasets.append(data)

    data_a, data_b, data_c = datasets
    return data_a, data_b, data_c


def load_data_interpolate(roll_avg=False, remove_out=False):
    """Load the training data at the features' own time resolution.

    Unlike ``load_data`` the features are not aggregated per hour. Targets are
    right-joined onto the feature rows, so feature rows without a target get a
    NaN ``pv_measurement``, which ``fill_pv_values`` then fills per hour.

    Args:
        roll_avg: when true, run ``rolling_average`` on every location.
        remove_out: when true, run ``remove_ouliers`` on every location
            (location B additionally with ``remove_b_outliers=True``).

    Returns:
        The tuple ``(data_a, data_b, data_c)``.
    """
    datasets = []
    for loc in ('A', 'B', 'C'):
        targets = pd.read_parquet(f'data/{loc}/train_targets.parquet')

        merged_parts = []
        for source, flag in (('observed', 0), ('estimated', 1)):
            features = pd.read_parquet(f'data/{loc}/X_train_{source}.parquet').rename(columns={'date_forecast': 'time'})
            # has no effect unless the file itself contains a "time_hour" column
            features.rename(columns={"time_hour": "time"}, inplace=True)
            features["estimated_flag"] = flag

            # keep every feature row; rows without a target get NaN for pv_measurement
            merged_parts.append(pd.merge(targets, features, on='time', how='right'))

        data = pd.concat(merged_parts, axis=0, ignore_index=True)
        data = fill_pv_values(data)

        # add columns for rolling average
        if roll_avg:
            data = rolling_average(data)

        if remove_out:
            data = remove_ouliers(data, remove_b_outliers=(loc == 'B'))

        datasets.append(data)

    data_a, data_b, data_c = datasets
    return data_a, data_b, data_c


def remove_ouliers(data, remove_b_outliers = False):
    """Drop rows whose target has been (almost) constant for a long stretch.

    A row is dropped when the standard deviation of ``pv_measurement`` over the
    trailing window of up to 24 rows ending at that row is below 0.01; the
    original authors attribute such stretches to sensor errors.

    Args:
        data: frame with a ``pv_measurement`` column (and ``diffuse_rad:W``
            when ``remove_b_outliers`` is true).
        remove_b_outliers: additionally drop rows that combine a high
            ``pv_measurement`` with a low ``diffuse_rad:W``.

    Returns:
        The filtered frame; ``data`` itself is not modified.
    """
    threshold = 0.01
    window_size = 24

    # Rolling spread of the target; the first row has no spread (NaN) and is kept.
    rolling_std = data['pv_measurement'].rolling(window=window_size, min_periods=1).std()
    is_static = rolling_std < threshold
    filtered_data = data[~is_static]

    if not remove_b_outliers:
        return filtered_data

    # removing some extra outliers: (pv_measurement above, diffuse_rad:W below)
    for pv_limit, rad_limit in ((100, 30), (200, 40)):
        too_much_power = filtered_data["pv_measurement"] > pv_limit
        too_little_light = filtered_data["diffuse_rad:W"] < rad_limit
        filtered_data = filtered_data[~(too_much_power & too_little_light)]

    return filtered_data


def get_hourly(df):
    """Pivot sub-hourly rows into one wide row per hour.

    The rows are split by the minute of their ``time`` value. Every column of
    a split gets the suffix ``_<minute>`` (``time`` becomes ``time_0``,
    ``time_15``, ...), and the splits are inner-joined on ``time_hour``, the
    time stamp floored to the hour. Hours that lack one of the minute values
    therefore do not appear in the result.

    Args:
        df: frame with a datetime ``time`` column. It is not modified.

    Returns:
        A frame with a ``time_hour`` column and one suffixed copy of every
        input column per distinct minute value.

    Raises:
        ValueError: if ``df`` has no rows.
    """
    minutes = df["time"].dt.minute

    per_minute = []
    for value in minutes.unique():
        part = df[minutes == value].copy()
        part.columns = [f"{col}_{value}" for col in part.columns]
        # "h" is the non-deprecated spelling of the hourly frequency alias
        part["time_hour"] = part[f"time_{value}"].dt.floor("h")
        per_minute.append(part)

    if not per_minute:
        raise ValueError("get_hourly needs at least one row.")

    # Join one split after the other; a single split is returned unchanged.
    merged_df = per_minute[0]
    for part in per_minute[1:]:
        merged_df = pd.merge(merged_df, part, on="time_hour")

    return merged_df

def get_hourly_mean(df):
    """Return the hourly mean of every numeric column of ``df``.

    Rows are grouped by their ``time`` value floored to the hour. Only numeric
    columns are averaged, as in ``get_hourly_stats``: an averaged ``time``
    column would otherwise survive and collide with ``time_hour`` once callers
    rename that column to ``time``.

    Args:
        df: frame with a datetime ``time`` column. It is not modified.

    Returns:
        A frame with a ``time_hour`` column followed by the hourly means.
    """
    # "h" is the non-deprecated spelling of the hourly frequency alias
    hour = df["time"].dt.floor("h").rename("time_hour")

    return df.groupby(hour).mean(numeric_only=True).reset_index()

def get_hourly_stats(df, important_features=None):
    """Return hourly means of all numeric columns plus min/max of selected ones.

    Rows are grouped by their ``time`` value floored to the hour.

    Args:
        df: frame with a datetime ``time`` column. It is not modified.
        important_features: columns that additionally get ``<name>_min`` and
            ``<name>_max`` columns. ``None`` selects the default radiation,
            cloud-cover and sun-elevation columns listed below.

    Returns:
        A frame with ``time_hour``, the hourly mean of every numeric column
        (under the original column name) and the min/max columns.
    """
    if important_features is None:
        important_features = [
            'clear_sky_energy_1h:J', 'clear_sky_rad:W', 'direct_rad:W',
            'direct_rad_1h:J', 'diffuse_rad:W', 'diffuse_rad_1h:J',
            'total_cloud_cover:p', 'sun_elevation:d',
        ]

    # "h" is the non-deprecated spelling of the hourly frequency alias
    hour = df["time"].dt.floor("h").rename("time_hour")
    grouped = df.groupby(hour)

    # mean of every numeric feature over the hour
    mean_df = grouped.mean(numeric_only=True).reset_index()

    # min and max of the selected features; flatten ("feature", "min") -> "feature_min"
    min_max_df = grouped[list(important_features)].agg(['min', 'max'])
    min_max_df.columns = [f"{feature}_{stat}" for feature, stat in min_max_df.columns]
    min_max_df = min_max_df.reset_index()

    # combine both on the hour
    return pd.merge(mean_df, min_max_df, on='time_hour')


def fill_pv_values(df):
    """Fill missing ``pv_measurement`` values within each hour.

    Rows are grouped by their ``time`` value floored to the hour and the
    target is interpolated inside every group with ``Series.interpolate()``
    and its default settings. Values are never carried across an hour
    boundary.

    Args:
        df: frame with a datetime ``time`` column and a ``pv_measurement``
            column. It is not modified.

    Returns:
        A copy of ``df`` with the filled ``pv_measurement``; the result never
        contains a ``time_hour`` column.
    """
    # "h" is the non-deprecated spelling of the hourly frequency alias
    hour = df["time"].dt.floor("h")

    filled = df.copy()
    filled["pv_measurement"] = (
        df.groupby(hour)["pv_measurement"].transform(lambda x: x.interpolate())
    )

    # The output contract has no "time_hour" helper column.
    return filled.drop(columns=["time_hour"], errors="ignore")


def rolling_average(df, window_size=24, features=None):
    """Add trailing rolling-mean columns for selected features.

    The frame is indexed and sorted by ``time`` (the ``time`` column is kept),
    then every feature gets a ``<feature>_rolling_avg_<window_size>`` column
    holding the mean over the last ``window_size`` rows. The window counts
    rows, not elapsed time, so it only spans ``window_size`` hours when the
    rows are hourly and gap-free.

    Finally the whole frame is back-filled. This fills the leading NaNs of the
    rolling columns, but also any other missing value that has a later valid
    value in the same column.

    Args:
        df: frame with a ``time`` column and the feature columns. It is
            modified in place and also returned.
        window_size: number of rows in the rolling window.
        features: columns to average. ``None`` selects the precipitation,
            rime, temperature and visibility columns listed below.

    Returns:
        ``df`` itself, with the new columns.
    """
    if features is None:
        features = ['precip_5min:mm', 'rain_water:kgm2', 'prob_rime:p', 't_1000hPa:K', 'visibility:m']

    # Ensure the 'time' column is datetime and set as index
    df['time'] = pd.to_datetime(df['time'])
    df.set_index('time', inplace=True, drop=False)
    df.sort_index(inplace=True)

    # Calculate rolling averages for the specified features
    for feature in features:
        rolling_feature_name = f"{feature}_rolling_avg_{window_size}"
        df[rolling_feature_name] = df[feature].rolling(window=window_size).mean()

    # Back-fill: the first window_size - 1 rolling values are NaN by construction.
    df.bfill(inplace=True)

    return df


def get_train_targets(data):
    """Split training data into features and target.

    Returns ``(X_train, targets)``: ``data`` without the ``pv_measurement``
    column, and that column on its own.
    """
    target_column = "pv_measurement"
    targets = data[target_column]
    return data.drop(columns=[target_column]), targets


def get_test_data(mean=False, mean_stats=False, roll_avg=False):
    """Build the test features for every row of the Kaggle submission file.

    Args:
        mean: aggregate the features per hour with ``get_hourly_mean``.
        mean_stats: aggregate with ``get_hourly_stats`` instead; only consulted
            when ``mean`` is false. When both are false the features keep
            their original time resolution.
        roll_avg: when true, run ``rolling_average`` on every location.

    Returns:
        The tuple ``(X_test_a, X_test_b, X_test_c)``. Each frame has one row
        per submission row of that location (right join on ``time``) and also
        carries the columns of ``data/test.csv``.

    Raises:
        FileNotFoundError: via ``check_file_exists`` if an input file is missing.
    """
    locations = ('A', 'B', 'C')

    # --- Check if files exist ---
    for loc in locations:
        check_file_exists(f'data/{loc}/X_test_estimated.parquet')
    check_file_exists('data/test.csv')

    # --- choose the hourly aggregation (None leaves the data untouched) ---
    if mean:
        aggregate = get_hourly_mean
    elif mean_stats:
        aggregate = get_hourly_stats
    else:
        aggregate = None

    # --- load, aggregate and flag the features per location ---
    features_by_location = {}
    for loc in locations:
        features = pd.read_parquet(f'data/{loc}/X_test_estimated.parquet').rename(columns={'date_forecast': 'time'})
        if aggregate is not None:
            features = aggregate(features)
        features.rename(columns={"time_hour": "time"}, inplace=True)
        features["estimated_flag"] = 1
        features_by_location[loc] = features

    # --- load kaggle submission data ---
    test = pd.read_csv('data/test.csv')
    test["time"] = pd.to_datetime(test["time"]) # datetime is needed for the merge on "time"

    # --- keep exactly the rows that have a kaggle submission id ---
    test_frames = []
    for loc in locations:
        kaggle_rows = test[test["location"]==loc]
        merged = pd.merge(features_by_location[loc], kaggle_rows, on="time", how="right")
        if roll_avg:
            merged = rolling_average(merged)
        test_frames.append(merged)

    X_test_a, X_test_b, X_test_c = test_frames
    return X_test_a, X_test_b, X_test_c


def prepare_submission(X_test_a, X_test_b, X_test_c, pred_a, pred_b, pred_c):
    """Combine test frames and predictions into the Kaggle submission format.

    Each prediction is attached as a ``prediction`` column to a copy of its
    test frame (the inputs stay untouched); the three copies are stacked in
    the order A, B, C and reduced to the columns ``id`` and ``prediction``.
    """
    labelled = []
    for features, predictions in ((X_test_a, pred_a), (X_test_b, pred_b), (X_test_c, pred_c)):
        with_prediction = features.copy()
        with_prediction["prediction"] = predictions
        labelled.append(with_prediction)

    stacked = pd.concat(labelled)
    return stacked[["id", "prediction"]]

## Featureadder class for the sklearn pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

class FeatureAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds time encodings and interaction features.

    Args:
        drop_cols: stored on the instance but not used by this class.
    """

    def __init__(self, drop_cols=None):
        self.drop_cols = drop_cols

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``."""
        return self

    def cyclic_encoding(self, df):
        """Add sine/cosine encodings of time of day and month to ``df``.

        ``df`` is modified in place (its ``time`` column is converted to
        datetime) and returned.
        """
        df['time'] = pd.to_datetime(df['time'])
        timestamps = df['time'].dt

        # fraction of the day in [0, 1)
        day_fraction = (timestamps.hour + timestamps.minute / 60 + timestamps.second / 3600) / 24.0
        df['sine_encoded'] = np.sin(2 * np.pi * day_fraction)
        df['cosine_encoded'] = np.cos(2 * np.pi * day_fraction)

        # The month must be scaled to a fraction of the year first:
        # sin(2*pi*m) is ~0 and cos(2*pi*m) is ~1 for every integer month m.
        year_fraction = timestamps.month / 12.0
        df['sine_encoded_month'] = np.sin(2 * np.pi * year_fraction)
        df['cosine_encoded_month'] = np.cos(2 * np.pi * year_fraction)

        return df

    def transform(self, X):
        """Return a copy of ``X`` with the extra feature columns appended."""
        X_copy = self.cyclic_encoding(X.copy())

        # -- scaled products of two features (interaction terms):
        X_copy["sun_rad_1"] = (X_copy['sun_azimuth:d'] * X_copy['direct_rad:W']) / 1000000
        X_copy["sun_rad_2"] = (X_copy['sun_elevation:d'] * X_copy['direct_rad:W']) / 1000000
        X_copy["sun_wind_2"] = (X_copy['wind_speed_10m:ms'] * X_copy['diffuse_rad:W']) / 1000
        X_copy["temp_sun"] = (X_copy['t_1000hPa:K'] * X_copy['sun_azimuth:d'])/1000
        X_copy["rad_day_1"] = (X_copy['is_day:idx'] * X_copy['diffuse_rad:W']) / 1000
        X_copy['mult_coulds'] = (X_copy['clear_sky_rad:W'] * X_copy['cloud_base_agl:m']) / 100000

        # Ratio of direct to diffuse radiation ("good one!" per the authors).
        # A zero denominator gives +/-inf (or NaN for 0/0); inf is turned into
        # NaN so that a NaN-based imputer later in the pipeline can fill it.
        rad_ratio = X_copy['direct_rad:W'] / X_copy['diffuse_rad:W']
        X_copy["ratio_rad1"] = rad_ratio.replace([np.inf, -np.inf], np.nan)

        X_copy["temp_rad_1"] = (X_copy['t_1000hPa:K'] * X_copy['direct_rad:W'])/1000

        # Tried by the authors and left disabled: sun_wind_1 (wind speed x
        # direct radiation) and direct/diffuse radiation x air density.

        return X_copy

## Run pipeline, prepare submission, and write to file

In [None]:
import pandas as pd
import numpy as np
#from functions import load_data, get_train_targets, get_test_data, prepare_submission, remove_ouliers
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
import catboost as cb
from scipy.stats import uniform, randint
import warnings
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
#from featureadder import FeatureAdder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Silence every FutureWarning raised from here on
warnings.filterwarnings("ignore", category=FutureWarning)

# Training data: hourly mean + min/max features, rolling averages, outliers removed
data_a, data_b, data_c = load_data(mean_stats=True, remove_out=True, roll_avg=True)

# Split every location into features and target
(X_train_a, targets_a), (X_train_b, targets_b), (X_train_c, targets_c) = [
    get_train_targets(location_data) for location_data in (data_a, data_b, data_c)
]

# Test data, processed the same way as the training features
X_test_a, X_test_b, X_test_c = get_test_data(mean_stats=True, roll_avg=True)


# Columns removed before modelling: the first group by choice, the second
# group because their feature importance was 0.
drop_cols = ['time', 'elevation:m', 'fresh_snow_1h:cm', 'ceiling_height_agl:m', 'snow_density:kgm3'] + [
    'wind_speed_w_1000hPa:ms', 'snow_drift:idx', 'fresh_snow_3h:cm', 'is_in_shadow:idx',
    'dew_or_rime:idx', 'fresh_snow_6h:cm', 'prob_rime:p',
]

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that drops a fixed set of columns.

    Args:
        drop_cols: column labels to drop; ``None`` (the default) drops nothing.
    """

    def __init__(self, drop_cols=None):
        self.drop_cols = drop_cols

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``."""
        return self

    def transform(self, X):
        """Return a new frame without ``drop_cols``; ``X`` is left untouched."""
        if self.drop_cols is None:
            return X.copy()
        # DataFrame.drop already returns a new frame, no extra copy needed
        return X.drop(columns=self.drop_cols)

def build_preprocessing():
    """Return a new, unfitted preprocessing pipeline.

    Every model pipeline gets its own instance: a Pipeline fits its steps in
    place, so one shared object would have its imputer and scaler re-fitted on
    each location's data in turn.
    """
    return Pipeline([
        ('add_features', FeatureAdder()),
        ('drop_cols', ColumnDropper(drop_cols=drop_cols)),
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('standar', StandardScaler()),
    ])


def build_model_pipeline(stacked_model):
    """Chain a fresh preprocessing pipeline with ``stacked_model``."""
    return Pipeline([
        ('data_process', build_preprocessing()),
        ('stacked_model', stacked_model)
    ])


# Columns of the test frames that come from test.csv and are not features
NON_FEATURE_COLS = ["id", "prediction", "location"]


def fit_and_predict(location, pipeline, X_train, targets, X_test):
    """Fit ``pipeline`` on one location and return its test predictions."""
    print(f"training location {location} model")
    pipeline.fit(X_train, targets)
    return pipeline.predict(X_test.drop(columns=NON_FEATURE_COLS))


# Kept under its original name for any code that still refers to it.
data_process_pipeline = build_preprocessing()


base_modelsA = [
    # hyper-parameters from Andrea's search
    ('cat_boost1', cb.CatBoostRegressor(random_state=1, silent=True, objective="MAE", border_count=157, depth=13, iterations=828, l2_leaf_reg=7.677745179031975, learning_rate=0.012997359346271088)),
    ('cat_boost2', cb.CatBoostRegressor(random_state=2, silent=True, depth=10)),
    # hyper-parameters from Taheera's search
    # NOTE(review): this call used to carry two extra keyword arguments,
    # `xee=0.588602113426499` and `gboost__subsample=0.4665774062657444`.
    # Neither is an XGBRegressor parameter (they look like mangled search
    # results), so they presumably had no effect on the scored run and were
    # removed. `subsample=0.4665774062657444` was probably intended; adding it
    # would change the model and the submission -- confirm before doing so.
    ('xgb_reg1', XGBRegressor(random_state=12, eval_metric="mae", colsample_bytree=0.588602113426499, max_depth=12, n_estimators=500, reg_alpha=1e-09, reg_lambda=0.0001, learning_rate=0.023222800065583988)),
    ('xgb_reg2', XGBRegressor(random_state=42)),
    ('xgb_reg3', XGBRegressor(random_state=16, eval_metric="mae")),
    ('cat_boost3', cb.CatBoostRegressor(random_state=3, silent=True)),
]

base_modelsB = [
    ('cat_boost1', cb.CatBoostRegressor(random_state=1, silent=True, objective="MAE", depth=10)),
    ('cat_boost2', cb.CatBoostRegressor(random_state=2, silent=True, depth=10)),
    ('xgb_reg1', XGBRegressor(random_state=12, eval_metric="mae")),
    ('xgb_reg2', XGBRegressor(random_state=42)),
    ('cat_boost3', cb.CatBoostRegressor(random_state=3, silent=True)),
]

base_modelsC = [
    ('cat_boost1', cb.CatBoostRegressor(random_state=1, silent=True, objective="MAE", depth=10)),
    ('cat_boost2', cb.CatBoostRegressor(random_state=2, silent=True, depth=10)),
    ('xgb_reg1', XGBRegressor(random_state=12, eval_metric="mae")),
    ('xgb_reg2', XGBRegressor(random_state=42)),
    ('cat_boost3', cb.CatBoostRegressor(random_state=3, silent=True)),
]

# Define meta-learner
meta_learnerA = LinearRegression()
meta_learnerB = LinearRegression()
meta_learnerC = LinearRegression()

# Create the stacking regressor
stacked_modelA = StackingRegressor(estimators=base_modelsA, final_estimator=meta_learnerA)
stacked_modelB = StackingRegressor(estimators=base_modelsB, final_estimator=meta_learnerB)
stacked_modelC = StackingRegressor(estimators=base_modelsC, final_estimator=meta_learnerC)

modelA_pipeline = build_model_pipeline(stacked_modelA)
modelB_pipeline = build_model_pipeline(stacked_modelB)
modelC_pipeline = build_model_pipeline(stacked_modelC)

pred_a = fit_and_predict("A", modelA_pipeline, X_train_a, targets_a, X_test_a)
pred_b = fit_and_predict("B", modelB_pipeline, X_train_b, targets_b, X_test_b)
pred_c = fit_and_predict("C", modelC_pipeline, X_train_c, targets_c, X_test_c)

submission = prepare_submission(X_test_a, X_test_b, X_test_c, pred_a, pred_b, pred_c)

# Clip predictions below 0.1 (including negative ones) to 0.
submission['prediction'] = submission['prediction'].mask(submission['prediction'] < 0.1, 0)

# Create the output folder if needed so the write cannot fail after training.
os.makedirs('submissions', exist_ok=True)
submission.to_csv('submissions/11_nov_1055.csv', index=False)