# 1. Imports

In [1]:
import pandas as pd
import numpy as np

# 2. Data Preprocessing

In [2]:
def convert_to_category(X):
    """Cast every non-numeric, non-boolean column of ``X`` to ``category``.

    Columns are classified with pandas' dtype predicates rather than by
    comparing against the literal names ``'int64'``/``'float64'``/``'bool'``,
    so numeric columns of any width (``int32``, ``float32``, ``uint8``,
    nullable integers, ...) keep their dtype instead of being turned into
    categoricals by accident.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns should be converted. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, returned so the call can be
        used in an assignment.
    """
    for col in X.columns:
        dtype = X[col].dtype
        # Leave anything pandas considers numeric or boolean untouched.
        if pd.api.types.is_numeric_dtype(dtype) or pd.api.types.is_bool_dtype(dtype):
            continue
        X[col] = X[col].astype('category')

    return X

In [3]:
# --- Load the raw tables ----------------------------------------------------
stores_train = pd.read_csv('data/stores_train_with_extra_features.csv')
stores_test = pd.read_csv('data/stores_test_with_extra_features.csv')

plaace_hierarchy = pd.read_csv('data/plaace_hierarchy.csv')
grunnkrets = pd.read_csv('data/grunnkrets_norway_stripped.csv')
grunnkrets_ages = pd.read_csv('data/grunnkrets_age_distribution.csv')
grunnkrets_household_types = pd.read_csv('data/grunnkrets_households_num_persons.csv')
grunnkrets_household_income = pd.read_csv('data/grunnkrets_income_households.csv')

# --- Build working copies of the lookup tables ------------------------------
# Every step below returns a new frame, so the raw tables loaded above are
# never modified and this cell can be re-run safely.
stores_train_copy = stores_train.copy()

# `sales_channel_name` is removed from the hierarchy before merging
# (presumably because the store table has a column of the same name -- verify).
plaace_hierarchy_copy = (
    plaace_hierarchy
    .drop(columns='sales_channel_name')
    .astype({'lv1': 'category', 'lv2': 'category'})
)

# Each grunnkrets table has its own `year` column; give each a distinct name
# (year_1 .. year_4) so they stay distinguishable after the merges.
grunnkrets_copy = grunnkrets.rename(columns={'year': 'year_1'})

grunnkrets_ages_copy = grunnkrets_ages.rename(columns={'year': 'year_2'})
# Total population = row-wise sum of every column from the third one onward
# (assumes the first two columns are the id and the year -- TODO confirm).
grunnkrets_ages_copy['grunnkrets_population'] = grunnkrets_ages_copy.iloc[:, 2:].sum(axis=1)

grunnkrets_household_types_copy = grunnkrets_household_types.rename(columns={'year': 'year_3'})
household_count_columns = grunnkrets_household_types_copy.columns[2:]
grunnkrets_household_types_copy[household_count_columns] = (
    grunnkrets_household_types_copy[household_count_columns].astype('int64')
)

# A single rename covers all three columns; the second rename of `singles`
# that used to follow was a no-op because the column had already been renamed.
grunnkrets_household_income_copy = grunnkrets_household_income.rename(columns={
    'year': 'year_4',
    'singles': 'singles_income',
    'couple_without_children': 'couple_without_children_income',
})

# --- Join everything onto the training stores -------------------------------
df = pd.merge(stores_train_copy, plaace_hierarchy_copy, on='plaace_hierarchy_id', how='left')
for grunnkrets_table in (
    grunnkrets_copy,
    grunnkrets_ages_copy,
    grunnkrets_household_types_copy,
    grunnkrets_household_income_copy,
):
    df = pd.merge(df, grunnkrets_table, on='grunnkrets_id', how='left')

df['grunnkrets_population_density'] = df['grunnkrets_population'] / df['area_km2']

# The left merges can yield several rows per store when a lookup table holds
# more than one row for a grunnkrets_id (presumably one per year -- verify);
# keep only the first row for each store.
df = df.drop_duplicates(subset=['store_id'], keep='first').reset_index(drop=True)



# Identifier, free-text, year and hierarchy-level columns that are not used
# as model inputs. The test-set cell reuses this list.
columns_to_drop = [
    'store_id',
    'plaace_hierarchy_id',
    'grunnkrets_id',
    'year',
    'address',
    'store_name',
    'year_1',
    'geometry',
    'grunnkrets_name',
    'district_name',
    'municipality_name',
    'year_2',
    'year_3',
    'year_4',
    'sales_channel_name',
    'lv1',
    'lv2',
    'lv3',
    'lv4',
]

X = convert_to_category(df.drop(columns=columns_to_drop))

# The model is trained on log1p(revenue); predictions are mapped back with
# expm1 further down.
X['revenue'] = np.log1p(X['revenue'])
y = np.log1p(df['revenue'])

categorical_features = X.select_dtypes(include=['category']).columns
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Log-transform the numeric predictors only. The target is deliberately left
# out: it used to be swept into this transform and then restored with np.exp,
# a round trip that added floating-point error and would have turned any
# negative log1p value into NaN.
# NOTE(review): np.log yields -inf for zeros and NaN for negative values (the
# RuntimeWarning in this cell's output). The test-set cell applies the same
# np.log, so any change to this transform has to be made in both places.
feature_columns = [col for col in numerical_columns if col != 'revenue']
X[feature_columns] = np.log(X[feature_columns])

  result = func(self.values, **kwargs)


In [4]:
# Apply to the test stores the same joins and transforms as the training cell.
stores_test_copy = stores_test.copy()

df = pd.merge(stores_test_copy, plaace_hierarchy_copy, on='plaace_hierarchy_id', how='left')
for grunnkrets_table in (
    grunnkrets_copy,
    grunnkrets_ages_copy,
    grunnkrets_household_types_copy,
    grunnkrets_household_income_copy,
):
    df = pd.merge(df, grunnkrets_table, on='grunnkrets_id', how='left')

# Keep the first merged row per store, then derive the density feature.
df = df.drop_duplicates(subset=['store_id'], keep='first')
df = df.assign(
    grunnkrets_population_density=df['grunnkrets_population'] / df['area_km2']
)

X_test = convert_to_category(df.drop(columns=columns_to_drop))

# Same log transform as the training features; the test table has no
# `revenue` column in this pipeline, so every numeric column is a predictor.
numerical_columns = X_test.select_dtypes(include=['int64', 'float64']).columns
X_test[numerical_columns] = np.log(X_test[numerical_columns])

  result = func(self.values, **kwargs)


# 3. Model

In [5]:
# Attach this session to H2O. The recorded output shows it checking for an
# instance at http://localhost:54321 and connecting to it.
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,23 mins 46 secs
H2O_cluster_timezone:,Europe/Oslo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.2
H2O_cluster_version_age:,17 days
H2O_cluster_name:,H2O_from_python_levit_fgfmpc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.572 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [6]:
# Response column; X already stores it on the log1p scale.
y = "revenue"

# Upload the training table to H2O and use every other column as a predictor.
train = h2o.H2OFrame(X)
x = list(train.columns)
x.remove(y)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [None]:
from h2o.automl import H2OAutoML

# AutoML run: capped at 175 models, fixed seed, leaderboard sorted by RMSE.
# Since the response is log1p(revenue), this RMSE is measured on the log scale.
aml = H2OAutoML(
    max_models=175,
    seed=1,
    sort_metric='RMSE',
)
aml.train(
    x=x,
    y=y,
    training_frame=train,
)

In [None]:
# Upload the preprocessed test features to H2O so the AutoML leader can score them.
test = h2o.H2OFrame(X_test)

In [None]:
# Score the test frame with the top leaderboard model.
y_pred = aml.leader.predict(test)

# The model predicts log1p(revenue); expm1 maps the values back to revenue.
log_scale_predictions = y_pred.as_data_frame()
preds = np.expm1(log_scale_predictions)

In [None]:
# One row per test store: its id next to the predicted revenue. `preds` is a
# single-column frame, so it is flattened to a 1-D array before being used as
# a column. Row order is taken from `stores_test`.
submission = pd.DataFrame({
    'id': stores_test['store_id'],
    'predicted': np.asarray(preds).ravel(),
})

submission.to_csv('sample_submission_H20_175.csv', index=False)
submission