In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from catboost import CatBoostRegressor
import seaborn as sns

In [2]:
# Name of the label column; feature_engineering() reads it from the training
# frame as `y` and excludes it from the feature matrix.
target = 'contest-tmp2m-14d__tmp2m'

# Raw inputs. `startdate` is parsed into datetimes at load time because the
# date-derived features below use the `.dt` accessor on it.
train_raw = pd.read_csv(
    '../../data/train_data.csv',
    parse_dates=["startdate"],
)
test_raw = pd.read_csv(
    '../../data/test_data.csv',
    parse_dates=["startdate"],
)

# Sample submission file, loaded as-is.
submit = pd.read_csv('../../data/sample_solution.csv')


In [3]:
def rmse(actual, predicted):
    """Return the root-mean-squared error between ``actual`` and ``predicted``.

    The ``squared=False`` flag of ``mean_squared_error`` was deprecated in
    scikit-learn 1.4 and removed in 1.6, so the square root is taken
    explicitly here instead of relying on that keyword.

    Per-output MSEs are rooted first and then averaged uniformly, which is
    what ``squared=False`` used to compute; for 1-D targets this is simply
    ``sqrt(MSE)``.

    Parameters
    ----------
    actual : array-like
        Ground-truth values.
    predicted : array-like
        Predicted values, same shape as ``actual``.

    Returns
    -------
    float
        The RMSE.
    """
    per_output_mse = mean_squared_error(actual, predicted, multioutput="raw_values")
    return float(np.mean(np.sqrt(per_output_mse)))

def location_nom(train, test, scale=14, target_col=None):
    """Round coordinates and tag every row with a shared location id.

    ``lat`` and ``lon`` are rounded to ``scale`` decimals in both frames, the
    frames are stacked, and each distinct ``(lat, lon)`` pair receives an
    integer ``loc_group`` id (``groupby(...).ngroup()``), so the ids are
    consistent between train and test.

    Ref: https://www.kaggle.com/code/flaviafelicioni/wids-2023-different-locations-train-test-solved

    Unlike the earlier version, the input frames are NOT modified: rounding is
    applied to copies, and the returned training frame is an independent copy
    rather than a slice of the stacked frame.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``lat`` and ``lon`` columns. ``train`` is expected
        to also contain the target column.
    scale : int, default 14
        Number of decimals kept when rounding the coordinates.
    target_col : str, optional
        Column removed from the returned test frame. Defaults to the
        module-level ``target``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The train and test frames with rounded coordinates and a new
        ``loc_group`` column; the test frame has ``target_col`` dropped.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of the stacked frame.
    """
    if target_col is None:
        target_col = target  # label column name defined at module level

    # `assign` returns new frames, so the caller's data stays untouched.
    train = train.assign(lat=train['lat'].round(scale), lon=train['lon'].round(scale))
    test = test.assign(lat=test['lat'].round(scale), lon=test['lon'].round(scale))

    n_train = len(train)
    all_df = pd.concat([train, test], axis=0)
    all_df['loc_group'] = all_df.groupby(['lat', 'lon']).ngroup()

    # Split positionally: the first n_train rows are the training rows.
    train_out = all_df.iloc[:n_train].copy()
    # Stacking gave the test rows an (empty) target column; remove it again.
    test_out = all_df.iloc[n_train:].drop(columns=target_col)

    return train_out, test_out


def categorical_encode(train, test, feature_name):
    """One-hot encode ``feature_name`` in both frames.

    The encoder is fitted on ``train`` only and then applied to ``test``, so
    both frames receive the same set of indicator columns (named by
    ``OneHotEncoder.get_feature_names_out``). The original column is dropped.

    The indicator frames are built with the index of the frame they came
    from. ``pd.concat(axis=1)`` aligns on index labels, so giving them a fresh
    0..n-1 index would attach indicators to the wrong rows whenever the input
    has been re-ordered (e.g. sorted) or does not carry a default index.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain ``feature_name``. Neither is modified.
    feature_name : str
        Name of the categorical column to encode.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames with ``feature_name`` replaced by its
        indicator columns.

    Notes
    -----
    The encoder uses its default settings, under which a category that only
    occurs in ``test`` makes ``transform`` raise.
    """
    ohe = OneHotEncoder()
    train_encoded = ohe.fit_transform(train[[feature_name]])
    test_encoded = ohe.transform(test[[feature_name]])
    encoded_cols = ohe.get_feature_names_out([feature_name])

    # Keep each frame's own index so the column-wise concat is row-aligned.
    train_encoded = pd.DataFrame(train_encoded.toarray(), columns=encoded_cols, index=train.index)
    test_encoded = pd.DataFrame(test_encoded.toarray(), columns=encoded_cols, index=test.index)

    train = pd.concat([train.drop([feature_name], axis=1), train_encoded], axis=1)
    test = pd.concat([test.drop([feature_name], axis=1), test_encoded], axis=1)

    return train, test

    
def fill_na(df):
    """Sort by location and date, then forward-fill gaps within each location.

    Rows are ordered by ``loc_group`` and ``startdate`` and missing values are
    filled with the most recent earlier value *of the same* ``loc_group``.
    Filling per group prevents the last row of one location from being copied
    into the leading gaps of the next one, which a plain ``ffill()`` over the
    sorted frame would do.

    Values that are missing at the very start of a location's series have no
    earlier observation and therefore stay NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``loc_group`` and ``startdate`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        A new, sorted frame with the same columns in the same order. The
        original index labels are kept (in sorted order).
    """
    # TODO: fill na with mean or median
    ordered = df.sort_values(by=['loc_group', 'startdate'])
    filled = ordered.groupby('loc_group', sort=False).ffill()
    # The group-wise fill leaves out the grouping column; put it back and
    # restore the original column order.
    filled['loc_group'] = ordered['loc_group']
    return filled[list(ordered.columns)]

def add_season(df):
    """Add an integer ``season`` column to ``df`` in place, based on ``month``.

    Months 12, 1 and 2 map to 0; 3-5 to 1; 6-8 to 2; 9-11 to 3. A month value
    outside 1..12 raises ``KeyError``. Nothing is returned; the frame passed
    in is modified.
    """
    season_months = (
        (12, 1, 2),
        (3, 4, 5),
        (6, 7, 8),
        (9, 10, 11),
    )
    # Invert the season -> months table into a month -> season lookup.
    season_by_month = {}
    for season_code, months in enumerate(season_months):
        for month in months:
            season_by_month[month] = season_code

    def _season_of(month):
        return season_by_month[month]

    df["season"] = df["month"].apply(_season_of)

from sklearn.preprocessing import FunctionTransformer
def sin_transformer(period):
    """Build a ``FunctionTransformer`` that maps ``x`` to ``sin(2*pi*x/period)``."""
    def _sine(values):
        return np.sin(values / period * 2 * np.pi)

    return FunctionTransformer(_sine)


def cos_transformer(period):
    """Build a ``FunctionTransformer`` that maps ``x`` to ``cos(2*pi*x/period)``."""
    def _cosine(values):
        return np.cos(values / period * 2 * np.pi)

    return FunctionTransformer(_cosine)

def encode_cyclical(df):
    """Add sine/cosine encodings of ``day_of_year`` and ``month`` to ``df``.

    Four columns are written in place: ``day_of_year_sin``,
    ``day_of_year_cos`` (period 365) and ``month_sin``, ``month_cos``
    (period 12). Nothing is returned.
    """
    # (source column, period, sine column, cosine column)
    encodings = (
        ("day_of_year", 365, "day_of_year_sin", "day_of_year_cos"),
        ("month", 12, "month_sin", "month_cos"),
    )
    for source, period, sin_name, cos_name in encodings:
        df[sin_name] = sin_transformer(period).fit_transform(df[source])
        df[cos_name] = cos_transformer(period).fit_transform(df[source])

def creat_new_featute(df):
    """Derive calendar features from ``startdate`` and add them to ``df``.

    Writes ``year``, ``month`` and ``day_of_year`` from the datetime column,
    then lets ``add_season`` and ``encode_cyclical`` append their columns.
    The frame is modified in place and also returned.
    """
    start = df['startdate'].dt
    df['year'] = start.year
    df['month'] = start.month
    df['day_of_year'] = start.dayofyear

    # Both helpers read the columns written above and mutate `df` directly.
    add_season(df)
    encode_cyclical(df)
    return df

#TODO: drop features with high correlation
def feature_engineering(train_raw, test_raw):
    """Run the full preprocessing pipeline and return model-ready matrices.

    Steps: assign location ids (``location_nom``), fill gaps in the training
    data (``fill_na``), add date-derived features (``creat_new_featute``) and
    one-hot encode ``climateregions__climateregion`` and ``season``.

    ``fill_na`` sorts the training rows, which leaves the frame with a
    permuted index. Both frames are therefore reset to a positional index
    before encoding, so the column-wise joins in ``categorical_encode`` stay
    row-aligned.

    Parameters
    ----------
    train_raw, test_raw : pandas.DataFrame
        Raw train and test frames.

    Returns
    -------
    X : pandas.DataFrame
        Training features (every column except ``index``, ``startdate`` and
        the target).
    y : pandas.Series
        Training target, row-aligned with ``X``.
    X_test : pandas.DataFrame
        Test features with the same columns as ``X``.
    """
    train, test = location_nom(train_raw, test_raw)
    # Only the training frame is gap-filled (and thereby sorted); the test
    # frame keeps its original row order.
    train = fill_na(train).reset_index(drop=True)
    test = test.reset_index(drop=True)

    train = creat_new_featute(train)
    test = creat_new_featute(test)
    train, test = categorical_encode(train, test, 'climateregions__climateregion')
    train, test = categorical_encode(train, test, 'season')

    # Xingjian: do not drop lat and lon; they are kept as features.
    drop_cols = ['index', 'startdate', target]
    features = [col for col in train.columns if col not in drop_cols]
    X = train[features]
    X_test = test[features]
    y = train[target]

    return X, y, X_test