# Imports

In [26]:
from sklearn import *
import lightgbm as lgb
import xgboost as xgb
import catboost as cat
from sklearn.model_selection import train_test_split
from sqlalchemy import column

from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer


# Helpers

In [27]:
import numpy as np 

def rmsle(y_true, y_pred):
    """
    Computes the Root Mean Squared Logarithmic Error.

    Both inputs are converted to NumPy arrays before anything else, so lists,
    tuples and pandas objects are accepted and are always compared element by
    element in positional order (a pandas index is ignored, which is also how
    the sklearn alternative mentioned below treats its inputs).

    Args:
        y_true (array-like): n-dimensional vector of ground-truth values, all >= 0
        y_pred (array-like): n-dimensional vector of predicted values, all >= 0

    Returns:
        A scalar float with the rmsle value

    Raises:
        AssertionError: if any value is negative or the two shapes differ

    Note: You can alternatively use sklearn and just do:
        `sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5`
    """
    # Strip any pandas index / list wrapper: without this, two Series with
    # differently ordered indexes would be subtracted label-by-label and
    # plain Python lists would fail on the `>= 0` comparison.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    assert (y_true >= 0).all(), 'Received negative y_true values'
    assert (y_pred >= 0).all(), 'Received negative y_pred values'
    assert y_true.shape == y_pred.shape, 'y_true and y_pred have different shapes'

    y_true_log1p = np.log1p(y_true)  # log(1 + y_true)
    y_pred_log1p = np.log1p(y_pred)  # log(1 + y_pred)
    return np.sqrt(np.mean(np.square(y_pred_log1p - y_true_log1p)))

def convert_to_category(X):
    """
    Casts every non-numeric, non-boolean column of a DataFrame to `category`.

    Numeric columns of any width (int32, float32, ... as well as int64 and
    float64) and boolean columns are left untouched; everything else, e.g.
    string/object columns, becomes categorical.

    Args:
        X (pd.DataFrame): frame to convert; it is modified in place

    Returns:
        The same DataFrame object `X`, so the call can be used in an assignment
    """
    # Resolve the list of columns first so the frame is not inspected while it
    # is being modified.
    non_numeric_columns = list(X.select_dtypes(exclude=['number', 'bool']).columns)
    for col in non_numeric_columns:
        X[col] = X[col].astype('category')

    return X


# Data

In [28]:
import pandas as pd

# Folder that holds the raw CSV files, relative to the notebook's working
# directory. Change it here to point every read below at another location.
DATA_DIR = 'data'

# Store-level tables.
stores_train = pd.read_csv(f'{DATA_DIR}/stores_train.csv')
stores_test = pd.read_csv(f'{DATA_DIR}/stores_test.csv')

# Lookup table joined to the stores on `plaace_hierarchy_id` further down.
plaace_hierarchy = pd.read_csv(f'{DATA_DIR}/plaace_hierarchy.csv')

# Tables joined to the stores on `grunnkrets_id` further down.
grunnkrets = pd.read_csv(f'{DATA_DIR}/grunnkrets_norway_stripped.csv')
grunnkrets_ages = pd.read_csv(f'{DATA_DIR}/grunnkrets_age_distribution.csv')
# Note: the variable is called "household_types" but the file is the
# households-by-number-of-persons table.
grunnkrets_household_types = pd.read_csv(f'{DATA_DIR}/grunnkrets_households_num_persons.csv')
grunnkrets_household_income = pd.read_csv(f'{DATA_DIR}/grunnkrets_income_households.csv')

busstops = pd.read_csv(f'{DATA_DIR}/busstops_norway.csv')

In [29]:
# Work on a private copy of the training stores so the raw frame stays untouched.
stores_train_copy = stores_train.copy()

# plaace_hierarchy: drop `sales_channel_name` and store lv1 / lv2 as categoricals.
plaace_hierarchy_copy = (
    plaace_hierarchy
    .drop(columns=['sales_channel_name'])
    .astype({'lv1': 'category', 'lv2': 'category'})
)

# Every grunnkrets-level table below is reduced to one row per `grunnkrets_id`
# (the first row encountered wins) and gets its `year` column renamed to
# `year_<n>` so each table's year keeps a distinct name after the merges.
# NOTE(review): keep='first' depends on the row order of the CSV files -
# confirm that the first row is the one that should be kept.

# grunnkrets
grunnkrets_copy = (
    grunnkrets
    .drop_duplicates(subset=['grunnkrets_id'], keep='first')
    .rename(columns={'year': 'year_1'})
)

# grunnkrets_ages
grunnkrets_ages_copy = (
    grunnkrets_ages
    .drop_duplicates(subset=['grunnkrets_id'], keep='first')
    .rename(columns={'year': 'year_2'})
)

# grunnkrets_household_types: every column from the third one onwards is cast
# to `category` (on the full table, before de-duplication).
household_type_dtypes = dict.fromkeys(grunnkrets_household_types.columns[2:], 'category')
grunnkrets_household_types_copy = (
    grunnkrets_household_types
    .astype(household_type_dtypes)
    .drop_duplicates(subset=['grunnkrets_id'], keep='first')
    .rename(columns={'year': 'year_3'})
)

# grunnkrets_household_income: `singles` and `couple_without_children` get an
# `_income` suffix in addition to the year rename.
grunnkrets_household_income_copy = (
    grunnkrets_household_income
    .rename(columns={
        'singles': 'singles_income',
        'couple_without_children': 'couple_without_children_income',
        'year': 'year_4',
    })
    .drop_duplicates(subset=['grunnkrets_id'], keep='first')
)

# Left-join everything onto the stores: first the hierarchy on its own key,
# then the four grunnkrets tables, all on `grunnkrets_id` and in this order.
df = stores_train_copy.merge(plaace_hierarchy_copy, on='plaace_hierarchy_id', how='left')
for grunnkrets_table in (
    grunnkrets_copy,
    grunnkrets_ages_copy,
    grunnkrets_household_types_copy,
    grunnkrets_household_income_copy,
):
    df = df.merge(grunnkrets_table, on='grunnkrets_id', how='left')


In [30]:
# Columns removed from the merged frame to obtain the feature matrix.
# Only the target `revenue` is dropped at the moment.
# NOTE(review): store_id, plaace_hierarchy_id, grunnkrets_id, geometry,
# address, year and year_1..year_4 were listed here but disabled, so they stay
# in X; any of them that is not int64/float64/bool is turned into a
# categorical below - confirm that this is intended.
columns_to_drop = ['revenue']

# Drop the target, then cast the remaining non-numeric columns to `category`.
X = convert_to_category(df.drop(columns=columns_to_drop))

# Names of all categorical columns, used later by the encoder.
categorical_features = [name for name in X.select_dtypes(include=['category']).columns]

### Scaling

In [32]:
# Columns that are standardised with a StandardScaler.
columns_to_scale = [
    'lat',
    'lon',
    'area_km2',
    'all_households',
    'singles_income',
    'couple_without_children_income',
    'couple_with_children',
    'other_households',
    'single_parent_with_children',
]

# `preprocessing` is presumably provided by the wildcard `from sklearn import *`
# in the imports cell. The scaler is fitted on the selected columns of X and
# the scaled values are written back into X.
values_to_scale = X[columns_to_scale]
scaler = preprocessing.StandardScaler().fit(values_to_scale)
X[columns_to_scale] = scaler.transform(values_to_scale)


# Encoding

In [33]:
# One-hot encode the categorical columns (categories unseen at fit time are
# ignored at transform time) and pass every other column through unchanged.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
full_pipeline = ColumnTransformer(
    [('cat', one_hot_encoder, categorical_features)],
    remainder='passthrough',
)

# `fit` returns the transformer itself, so `encoder` is the fitted `full_pipeline`.
encoder = full_pipeline.fit(X)
X_encoded = encoder.transform(X)