# Load Libraries and Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
data = {f[:-4]:pd.read_csv(f) for f in os.listdir() if f.endswith('.csv')}

# Looking at the Data

In [3]:
data['train']

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.884870,1198
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243
...,...,...,...,...,...,...,...
122260,56045_2022-06-01,56045,Weston County,Wyoming,2022-06-01,1.803249,101
122261,56045_2022-07-01,56045,Weston County,Wyoming,2022-07-01,1.803249,101
122262,56045_2022-08-01,56045,Weston County,Wyoming,2022-08-01,1.785395,100
122263,56045_2022-09-01,56045,Weston County,Wyoming,2022-09-01,1.785395,100


In [4]:
data['census_starter']

Unnamed: 0,pct_bb_2017,pct_bb_2018,pct_bb_2019,pct_bb_2020,pct_bb_2021,cfips,pct_college_2017,pct_college_2018,pct_college_2019,pct_college_2020,...,pct_it_workers_2017,pct_it_workers_2018,pct_it_workers_2019,pct_it_workers_2020,pct_it_workers_2021,median_hh_inc_2017,median_hh_inc_2018,median_hh_inc_2019,median_hh_inc_2020,median_hh_inc_2021
0,76.6,78.9,80.6,82.7,85.5,1001,14.5,15.9,16.1,16.7,...,1.3,1.1,0.7,0.6,1.1,55317,58786.0,58731,57982.0,62660.0
1,74.5,78.1,81.8,85.1,87.9,1003,20.4,20.7,21.0,20.2,...,1.4,1.3,1.4,1.0,1.3,52562,55962.0,58320,61756.0,64346.0
2,57.2,60.4,60.5,64.6,64.6,1005,7.6,7.8,7.6,7.3,...,0.5,0.3,0.8,1.1,0.8,33368,34186.0,32525,34990.0,36422.0
3,62.0,66.1,69.2,76.1,74.6,1007,8.1,7.6,6.5,7.4,...,1.2,1.4,1.6,1.7,2.1,43404,45340.0,47542,51721.0,54277.0
4,65.8,68.5,73.0,79.6,81.0,1009,8.7,8.1,8.6,8.9,...,1.3,1.4,0.9,1.1,0.9,47412,48695.0,49358,48922.0,52830.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,82.2,82.4,84.0,86.7,88.4,56037,15.3,15.2,14.8,13.7,...,0.6,0.6,1.0,0.9,1.0,71083,73008.0,74843,73384.0,76668.0
3138,83.5,85.9,87.1,89.1,90.5,56039,37.7,37.8,38.9,37.2,...,0.7,1.2,1.4,1.5,2.0,80049,83831.0,84678,87053.0,94498.0
3139,83.8,88.2,89.5,91.4,90.6,56041,11.9,10.5,11.1,12.6,...,1.2,1.2,1.4,1.7,0.9,54672,58235.0,63403,72458.0,75106.0
3140,76.4,78.3,78.2,82.8,85.4,56043,15.4,15.0,15.4,15.0,...,1.3,1.0,0.9,0.9,1.1,51362,53426.0,54158,57306.0,62271.0


In [5]:
data['test']

Unnamed: 0,row_id,cfips,first_day_of_month
0,1001_2022-11-01,1001,2022-11-01
1,1003_2022-11-01,1003,2022-11-01
2,1005_2022-11-01,1005,2022-11-01
3,1007_2022-11-01,1007,2022-11-01
4,1009_2022-11-01,1009,2022-11-01
...,...,...,...
25075,56037_2023-06-01,56037,2023-06-01
25076,56039_2023-06-01,56039,2023-06-01
25077,56041_2023-06-01,56041,2023-06-01
25078,56043_2023-06-01,56043,2023-06-01


In [6]:
data['sample_submission']

Unnamed: 0,row_id,microbusiness_density
0,1001_2022-11-01,3.817671
1,1003_2022-11-01,3.817671
2,1005_2022-11-01,3.817671
3,1007_2022-11-01,3.817671
4,1009_2022-11-01,3.817671
...,...,...
25075,56037_2023-06-01,3.817671
25076,56039_2023-06-01,3.817671
25077,56041_2023-06-01,3.817671
25078,56043_2023-06-01,3.817671


# Preprocessing

In [7]:
import sklearn.model_selection as skms

In [8]:
# Whole data processing
Xy = data['train'][['cfips', 'first_day_of_month', 'microbusiness_density']].copy()
Xy[['year', 'month', 'day']] = Xy['first_day_of_month'].str.split('-', expand=True)
Xy = Xy[['cfips', 'year', 'month', 'day', 'microbusiness_density']]
Xy = Xy.merge(data['census_starter'], on='cfips')
Xy = Xy[Xy['microbusiness_density'].notna()]

In [9]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [10]:
# Split into Train / Test
Xy_train, Xy_test = skms.train_test_split(Xy, test_size=0.2)

# Train Processing
Xy_train.dropna(inplace=True)
# Xy_train = Xy_train[IsolationForest().fit_predict(Xy_train) == 1]

# Test Processing
Xy_test.fillna(-1, inplace=True)

# Split into X and y
y_train = Xy_train.pop('microbusiness_density')
y_test = Xy_test.pop('microbusiness_density')
X_train, X_test = Xy_train, Xy_test

# Scaling
sc = StandardScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

In [11]:
X_train

Unnamed: 0,cfips,year,month,day,pct_bb_2017,pct_bb_2018,pct_bb_2019,pct_bb_2020,pct_bb_2021,pct_college_2017,...,pct_it_workers_2017,pct_it_workers_2018,pct_it_workers_2019,pct_it_workers_2020,pct_it_workers_2021,median_hh_inc_2017,median_hh_inc_2018,median_hh_inc_2019,median_hh_inc_2020,median_hh_inc_2021
110057,51011,2022,10,01,59.3,65.5,69.4,74.1,76.8,11.7,...,1.4,1.4,1.3,0.3,0.2,54875,56176.0,58696,55457.0,55268.0
53632,27127,2020,03,01,66.9,69.6,72.0,75.7,78.9,14.3,...,0.8,0.8,0.8,1.0,0.7,51871,53230.0,55404,57243.0,59638.0
111246,51075,2021,02,01,78.8,80.9,83.2,85.8,88.0,24.6,...,1.9,2.5,2.6,2.1,2.2,86652,89741.0,93994,97146.0,100517.0
112580,51147,2021,10,01,62.2,66.0,66.7,70.0,73.8,12.0,...,0.2,0.3,0.8,0.7,0.3,44147,46189.0,47202,44253.0,49019.0
20374,13279,2020,12,01,63.2,66.7,71.2,76.7,76.6,9.8,...,0.9,0.7,0.5,0.9,1.4,35750,36575.0,40175,41244.0,42975.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114152,51678,2022,10,01,65.8,70.0,71.9,80.8,84.1,20.8,...,2.6,2.1,0.0,0.0,0.0,37309,36466.0,42632,50714.0,66114.0
20130,13267,2020,02,01,65.2,69.5,72.8,77.8,77.0,7.4,...,0.5,0.4,0.9,0.8,1.4,36355,38034.0,40730,44053.0,49977.0
90061,44003,2020,06,01,83.4,85.1,86.5,88.6,89.3,20.4,...,2.1,1.7,1.7,1.9,2.1,69047,70223.0,73521,75857.0,79880.0
113648,51540,2019,10,01,78.2,81.1,84.3,85.7,87.8,24.8,...,2.3,2.7,2.9,3.7,3.1,54739,58933.0,59471,59598.0,63470.0


In [12]:
y_train

110057     2.298297
53632      1.992582
111246     5.264557
112580     2.467593
20374      2.127874
            ...    
114152     8.771135
20130      1.322462
90061      6.164172
113648    13.267387
26286      4.827401
Name: microbusiness_density, Length: 97722, dtype: float64

# Testing

In [13]:
def smape(a, f):
  return 1/len(a) * np.sum(2 * np.abs(f-a) / (np.abs(a) + np.abs(f))*100)

In [14]:
smape(np.random.random(1000), np.random.random(1000))

76.21125176974918

In [15]:
k = np.random.random(10)
smape(k, k)

0.0

# A dummy solution

In [16]:
# Zeros
smape(y_test, np.zeros_like(y_test))

199.96728417781048

In [17]:
# Mean
smape(y_test, np.ones_like(y_test)*np.mean(y_train))

62.36007971349328

# Hella basic models

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
m = LinearRegression().fit(X_train, y_train)
smape(y_test, m.predict(X_test))

45.709769272055745

In [20]:
from sklearn.tree import DecisionTreeRegressor

In [21]:
m = DecisionTreeRegressor().fit(X_train, y_train)
smape(y_test, m.predict(X_test))

2.4340775696236956

In [22]:
m.predict(X_test)

array([4.3657722, 1.7011254, 2.6539462, ..., 4.5993128, 1.8455359,
       1.0651373])

In [23]:
X_test

Unnamed: 0,cfips,year,month,day,pct_bb_2017,pct_bb_2018,pct_bb_2019,pct_bb_2020,pct_bb_2021,pct_college_2017,...,pct_it_workers_2017,pct_it_workers_2018,pct_it_workers_2019,pct_it_workers_2020,pct_it_workers_2021,median_hh_inc_2017,median_hh_inc_2018,median_hh_inc_2019,median_hh_inc_2020,median_hh_inc_2021
31685,19051,2021,01,01,61.9,62.8,64.5,64.9,66.7,11.9,...,2.5,2.0,2.4,1.8,1.6,52390,58464.0,63404,67627.0,76755.0
23384,17013,2021,07,01,63.1,66.8,70.1,73.4,77.2,7.9,...,0.0,0.0,0.0,0.0,0.1,53641,54392.0,63009,66602.0,74792.0
48732,26041,2021,05,01,72.1,73.7,76.2,78.9,81.0,14.2,...,3.0,3.0,2.7,2.2,2.3,44639,46490.0,47434,47008.0,51117.0
145,1007,2021,12,01,62.0,66.1,69.2,76.1,74.6,8.1,...,1.2,1.4,1.6,1.7,2.1,43404,45340.0,47542,51721.0,54277.0
14207,12093,2020,07,01,58.1,59.6,61.5,68.4,72.9,7.9,...,1.1,0.7,0.5,0.5,0.1,39059,40367.0,41760,46097.0,47020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117307,54045,2022,06,01,72.9,76.6,79.6,82.9,85.2,5.3,...,0.8,0.7,0.7,0.4,0.6,37859,38123.0,36168,36250.0,38493.0
52933,27091,2020,06,01,74.8,76.8,78.9,82.0,84.5,15.0,...,0.5,0.7,0.5,0.7,0.6,53627,53915.0,52798,53851.0,57002.0
91842,45083,2022,08,01,74.1,76.4,79.1,82.1,84.1,15.2,...,2.1,2.2,1.9,1.6,1.8,47575,50179.0,52332,53757.0,57627.0
23524,17021,2020,03,01,72.6,76.5,77.3,79.4,80.6,11.1,...,1.7,2.0,2.1,3.0,2.9,50668,52415.0,52834,52120.0,53188.0


In [24]:
data['sample_submission']

Unnamed: 0,row_id,microbusiness_density
0,1001_2022-11-01,3.817671
1,1003_2022-11-01,3.817671
2,1005_2022-11-01,3.817671
3,1007_2022-11-01,3.817671
4,1009_2022-11-01,3.817671
...,...,...
25075,56037_2023-06-01,3.817671
25076,56039_2023-06-01,3.817671
25077,56041_2023-06-01,3.817671
25078,56043_2023-06-01,3.817671


# Boosted Models

In [22]:
from sklearn.ensemble import RandomForestRegressor

In [23]:
m = RandomForestRegressor().fit(X_train, y_train)
smape(y_test, m.predict(X_test))

2.312906866195007

# Neural Networks

In [33]:
from sklearn.neural_network import MLPRegressor

In [37]:
m = MLPRegressor(hidden_layer_sizes=(15, 8, 4, 2)).fit(X_train_scaled, y_train)
smape(y_test, m.predict(X_test_scaled))

34.92565695430523

: 