In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBRegressor

In [2]:
import pickle
# Load the dataset checkpoint.
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# this must only ever be pointed at a trusted file.
# The context manager closes the handle even if unpickling raises.
with open('../input/cs-4780-covid-case-hunters/covid_dataset.pkl', 'rb') as file:
    checkpoint = pickle.load(file)

# Feature matrices for the three splits and the targets stored under the
# "*_log_pos_cases" keys (the test split has no target in the checkpoint).
X_train, y_train = checkpoint["X_train"], checkpoint["y_train_log_pos_cases"]
X_val, y_val = checkpoint["X_val"], checkpoint["y_val_log_pos_cases"]
X_test = checkpoint["X_test"]

In [3]:
# Column labels for the raw feature matrices, in positional order.
# 'Region' holds a numeric region code that is one-hot encoded further down.
col_list = [
    'Region', 'Population', 'Area', 'Pop. Density', 'Coastline',
    'Net migration', 'Infant mortality', 'GDP', 'Literacy', 'Phones',
    'Arable', 'Crops', 'Other', 'Birthrate', 'Deathrate',
    'Agriculture', 'Industry', 'Service', 'Handwashing Facilities',
    'Extreme Poverty', 'Median Age', 'Life expectancy',
    'Human development index',
]

# Wrap every split in a labelled DataFrame.
df = pd.DataFrame(X_train, columns=col_list)
df_val = pd.DataFrame(X_val, columns=col_list)
df_test = pd.DataFrame(X_test, columns=col_list)

# Validation rows first, then training rows; y_full is stacked in the same
# order so features and targets stay aligned.
# ignore_index=True gives the combined frame unique row labels 0..n-1 instead
# of two overlapping 0-based ranges, which keeps label-based alignment safe.
df_full = pd.concat([df_val, df], ignore_index=True)
y_full = np.concatenate((y_val, y_train))

In [4]:
# Preview the first rows of the combined (validation + training) frame.
df_full.head()

Unnamed: 0,Region,Population,Area,Pop. Density,Coastline,Net migration,Infant mortality,GDP,Literacy,Phones,...,Birthrate,Deathrate,Agriculture,Industry,Service,Handwashing Facilities,Extreme Poverty,Median Age,Life expectancy,Human development index
0,7.0,8863338.0,637657.0,13.9,0.47,5.37,116.7,500.0,37.8,11.3,...,45.13,16.63,0.65,0.1,0.25,9.831,,16.799999,57.400002,
1,4.0,4494749.0,56542.0,79.5,10.32,1.58,6.84,10600.0,98.5,420.4,...,9.61,11.48,0.07,0.308,0.622,,0.7,44.0,78.489998,0.851
2,8.0,8308504.0,27750.0,299.4,6.38,-3.4,73.45,1600.0,52.9,16.9,...,36.44,12.17,0.28,0.2,0.52,22.863001,23.5,24.299999,64.0,0.51
3,8.0,13547510.0,283560.0,47.8,0.79,-8.58,23.66,3300.0,92.5,125.6,...,22.29,4.23,0.07,0.312,0.618,80.635,3.6,28.1,77.01,0.759
4,8.0,9183984.0,48730.0,188.5,2.64,-3.22,32.38,6000.0,84.7,97.4,...,23.22,5.73,0.112,0.306,0.582,55.181999,1.6,27.6,74.080002,0.756


In [5]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,Region,Population,Area,Pop. Density,Coastline,Net migration,Infant mortality,GDP,Literacy,Phones,...,Birthrate,Deathrate,Agriculture,Industry,Service,Handwashing Facilities,Extreme Poverty,Median Age,Life expectancy,Human development index
0,0.0,3874050.0,10400.0,372.5,2.16,0.0,24.52,4800.0,87.4,255.6,...,18.52,6.21,0.12,0.21,0.67,,,31.1,78.93,0.744
1,7.0,41236376.0,2505810.0,16.5,0.03,-0.02,62.5,1900.0,61.1,16.3,...,34.53,8.97,0.387,0.203,0.41,23.437,,19.700001,65.309998,0.51
2,7.0,486530.0,23000.0,21.2,1.37,0.0,104.13,1300.0,67.9,22.8,...,39.53,19.31,0.179,0.225,0.596,,22.5,25.4,67.110001,0.524
3,8.0,303770.0,13940.0,21.8,25.41,-2.2,25.21,16700.0,95.6,460.6,...,17.57,9.05,0.03,0.07,0.9,,,34.299999,73.919998,0.814
4,7.0,12525094.0,1267000.0,9.9,0.0,-0.67,121.69,800.0,17.6,1.9,...,50.73,20.91,0.39,0.17,0.44,8.978,44.5,15.1,62.419998,0.394


In [6]:
# Region code column (the first column of the combined frame).
region_encoded = pd.Series(df_full.iloc[:, 0])

# Names given to the one-hot columns. They are assigned by position, i.e. the
# i-th name goes to the i-th dummy column produced by get_dummies.
region_dummy_names = [
    'Oceania',
    'Northern Africa',
    'Baltics',
    'Northern America',
    'Asia',
    'Sub-Saharan Africa',
    'C.W. of Ind. States',
    'Latin Amer. & Carrib',
    'Western Europe',
    'Eastern Europe',
    'Near East',
]

# One indicator column per distinct region code.
r = pd.get_dummies(region_encoded)
r.columns = region_dummy_names
r.head()

Unnamed: 0,Oceania,Northern Africa,Baltics,Northern America,Asia,Sub-Saharan Africa,C.W. of Ind. States,Latin Amer. & Carrib,Western Europe,Eastern Europe,Near East
0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,1,0,0


In [7]:
# Append the one-hot region columns to the combined frame.
# Any region columns left over from a previous run of this cell are dropped
# first, so re-running the cell cannot produce duplicate column labels (which
# would break the later column-based reindexing).
df_full = pd.concat(
    [df_full.drop(columns=r.columns, errors="ignore"), r],
    axis=1,
)

In [8]:
# Row count of the combined frame.
len(df_full.index)

122

In [9]:
# Numeric indicator columns fed to the models.
numeric_features = [
    'Population', 'Area', 'Pop. Density', 'Coastline',
    'Net migration', 'Infant mortality', 'GDP', 'Literacy', 'Phones',
    'Arable', 'Crops', 'Other', 'Birthrate', 'Deathrate',
    'Agriculture', 'Industry', 'Service', 'Handwashing Facilities',
    'Extreme Poverty', 'Median Age', 'Life expectancy',
    'Human development index',
]

# One-hot region column names, in the same positional order in which they were
# assigned to the dummy columns of the combined (training) frame.
region_names = [
    'Oceania', 'Northern Africa', 'Baltics', 'Northern America', 'Asia',
    'Sub-Saharan Africa', 'C.W. of Ind. States', 'Latin Amer. & Carrib',
    'Western Europe', 'Eastern Europe', 'Near East',
]

# Full model input: numeric indicators followed by the region indicators.
selected_features = numeric_features + region_names

# get_dummies orders its columns by sorted category value, so pairing the
# sorted training codes with region_names recovers the code -> name mapping
# that the training frame uses.
train_region_codes = np.sort(region_encoded.dropna().unique())
if len(train_region_codes) != len(region_names):
    raise ValueError(
        "expected %d distinct region codes in the training data, found %d"
        % (len(region_names), len(train_region_codes))
    )
code_to_name = dict(zip(train_region_codes, region_names))

# Encode the test regions by *code*, not by position: each dummy column is
# renamed through the training mapping, and regions that do not occur in the
# test split become all-zero columns. This keeps every test column aligned
# with its training counterpart regardless of which regions are present.
# A code never seen in training has no named column and is dropped, leaving
# that row with all region indicators at 0.
region_encoded_test = pd.Series(df_test[df_test.columns[0]])
r_test = (
    pd.get_dummies(region_encoded_test)
    .rename(columns=code_to_name)
    .reindex(columns=region_names, fill_value=0)
)

# Drop region columns from a previous run first so the cell can be re-run
# without creating duplicate column labels.
df_test = pd.concat(
    [df_test.drop(columns=region_names, errors="ignore"), r_test],
    axis=1,
)


In [10]:
def sel_features(X_train, features):
    """Return the requested columns of a DataFrame as a NumPy array.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame to select columns from. Despite the name, any split can be
        passed.
    features : list of str
        Column labels to keep; the output columns follow this order.

    Returns
    -------
    numpy.ndarray
        2-D array with one column per entry of ``features``.

    Raises
    ------
    KeyError
        If a requested label is not a column of ``X_train``.
    """
    # Depends only on its arguments; no notebook-level frame is consulted.
    return X_train[features].to_numpy()

def process_features(X, fill_values=None):
    """Cast a 2-D feature matrix to float32 and fill its NaNs column-wise.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix. It is copied by the dtype cast, so the caller's
        object is not modified.
    fill_values : array-like of length n_columns, optional
        Value used to replace NaNs in each column. When omitted, the NaN-aware
        mean of each column of ``X`` itself is used. Passing the statistics of
        another split (e.g. training means) lets several splits be imputed
        consistently.

    Returns
    -------
    numpy.ndarray
        float32 array of the same shape as ``X``. A column that is entirely
        NaN has a NaN mean and therefore stays NaN when ``fill_values`` is
        not given.

    Raises
    ------
    ValueError
        If ``fill_values`` does not contain exactly one value per column.
    """
    X = np.asarray(X).astype('float32')

    if fill_values is None:
        fill_values = np.nanmean(X, axis=0)
    else:
        fill_values = np.asarray(fill_values, dtype='float32')
        if fill_values.shape != (X.shape[1],):
            raise ValueError(
                "fill_values must have one entry per column of X"
            )

    # np.where yields (row_indices, col_indices) of the missing cells; each
    # cell receives the fill value of its own column.
    inds = np.where(np.isnan(X))
    X[inds] = np.take(fill_values, inds[1])
    return X

def evaluate(model, y_val, X=None):
    """Print and return the mean squared error of ``model`` on held-out data.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted estimator.
    y_val : array-like
        True target values.
    X : array-like, optional
        Features to predict on. Defaults to the notebook-level ``X_val`` so
        existing two-argument calls keep working; pass the matrix explicitly
        when the model was trained on a differently processed feature set.

    Returns
    -------
    float
        The mean squared error that was printed.
    """
    if X is None:
        # Backward-compatible fallback to the global validation features.
        X = X_val
    y_pred = model.predict(X)
    mse = mean_squared_error(y_val, y_pred)
    print ("Validation loss is ", mse)
    return mse

In [11]:
# Model inputs: 22 numeric indicators followed by 11 one-hot region columns.
numeric_features = [
    'Population', 'Area', 'Pop. Density', 'Coastline',
    'Net migration', 'Infant mortality', 'GDP', 'Literacy', 'Phones',
    'Arable', 'Crops', 'Other', 'Birthrate', 'Deathrate',
    'Agriculture', 'Industry', 'Service', 'Handwashing Facilities',
    'Extreme Poverty', 'Median Age', 'Life expectancy',
    'Human development index',
]
region_features = [
    'Oceania', 'Northern Africa', 'Baltics', 'Northern America', 'Asia',
    'Sub-Saharan Africa', 'C.W. of Ind. States', 'Latin Amer. & Carrib',
    'Western Europe', 'Eastern Europe', 'Near East',
]
selected_features = numeric_features + region_features

# Restrict both frames to exactly these columns, in this order (this also
# drops the raw 'Region' code column).
df_test = df_test.reindex(columns=selected_features)
df_full = df_full.reindex(columns=selected_features)

# Convert to float32 arrays with NaNs replaced by per-column means.
# NOTE(review): each frame is imputed with its own column means; confirm that
# filling the test set from test statistics is intended.
test_matrix = sel_features(df_test, selected_features)
X_test = process_features(test_matrix)
full_matrix = sel_features(df_full, selected_features)
X_full = process_features(full_matrix)

In [12]:
# Sanity checks before modelling.
# 1) number of feature rows vs. number of targets
n_rows, n_targets = len(X_full), len(y_full)
print(n_rows, n_targets)
# 2) shapes of the combined and test matrices
shape_full, shape_test = X_full.shape, X_test.shape
print(shape_full, shape_test)
# 3) element-wise comparison of the column labels of both frames
columns_match = df_full.columns == df_test.columns
print(columns_match)

122 122
(122, 33) (61, 33)
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True]


In [13]:
# Baseline gradient-boosted regressor: 100 trees, the "exact" tree method and
# a squared-error objective; every other hyper-parameter keeps its default.
xg_reg_params = {
    "n_estimators": 100,
    "tree_method": "exact",
    "objective": 'reg:squarederror',
}
xg_reg = XGBRegressor(**xg_reg_params)

# Cross-validated negative MSE on the combined data. No `cv` argument is
# given, so scikit-learn's default splitting applies.
xg_scores = cross_val_score(
    xg_reg,
    X_full,
    y_full,
    scoring='neg_mean_squared_error',
)

# Refit on every available row for the final predictions. Kept as the last
# expression so the notebook displays the fitted estimator.
xg_reg.fit(X_full, y_full)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [14]:
# Per-fold negative MSE of the baseline model (closer to zero is better).
print (xg_scores)

[-1.43829036 -1.9683845  -1.35448265 -4.80160332 -1.931687  ]


In [15]:
# Predict the target for the processed test matrix with the baseline model.
test_pred = xg_reg.predict(X_test)
print (test_pred)

[13.96524   12.112449   8.492571   9.418019   8.737672   9.919426
 16.65278   13.31281   12.704893  13.536014  12.539293  14.489888
 13.432001  10.279228  14.07751   10.417597  10.333507   9.926728
 11.928889  10.047097  13.630507  13.26088   13.663947  12.232434
 10.980526   9.820848   6.908434  10.494905  12.365904   9.871782
  5.391561  15.581081   9.661317  11.299507  10.862571  15.356327
 13.283354  13.3406925 10.3086605  7.206792  13.836149  10.961629
 13.468663  15.450327  11.695498  12.205346  16.868208  12.580751
 15.380928   7.3053794 10.90426   11.379649  12.177704  14.612432
 13.66521   13.708232  10.706292   9.319814  12.457947   7.945278
 11.8600235]


In [16]:
# Write the baseline predictions as a two-column CSV: row number under "id",
# prediction under "cases".
submission = pd.DataFrame({"cases": test_pred})
submission.to_csv("xgb.csv", index_label="id")

In [17]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Plain assignment: X_full_cop is a second name for the same array, not an
# independent copy.
X_full_cop = X_full

# Learn the per-feature min/max on the combined data, then rescale that same
# data with the fitted scaler.
scaler = MinMaxScaler().fit(X_full)
X_transformed = scaler.transform(X_full)


In [18]:
# Second model: same configuration as the baseline but with 50 trees, trained
# on the min-max scaled features.
xg_reg2_params = {
    "n_estimators": 50,
    "tree_method": "exact",
    "objective": 'reg:squarederror',
}
xg_reg2 = XGBRegressor(**xg_reg2_params)

# Cross-validated negative MSE, then a refit on all rows.
xg_scores2 = cross_val_score(
    xg_reg2,
    X_transformed,
    y_full,
    scoring='neg_mean_squared_error',
)
xg_reg2.fit(X_transformed, y_full)
print(xg_scores2)

[-1.43825293 -1.96868229 -1.35893929 -4.80167723 -1.93188918]


In [19]:
# Apply the scaler fitted on the combined data to the test matrix, then
# predict with the second model.
X_test_trans = scaler.transform(X_test)
test_pred2 = xg_reg2.predict(X_test_trans)
print (test_pred2)

[13.965429  12.110649   8.492616   9.418686   8.737853   9.919607
 16.652258  13.312608  12.70498   13.535467  12.53944   14.489878
 13.432911  10.2788    14.076904  10.417877  10.333762   9.926785
 11.928568  10.058866  13.630205  13.260381  13.664455  12.232445
 10.980693   9.8218     6.908755  10.494856  12.366754   9.871595
  5.392031  15.580867   9.661303  11.299245  10.863695  15.355351
 13.2835865 13.339851  10.308841   7.2062907 13.83803   10.962174
 13.468601  15.450024  11.695903  12.20437   16.868322  12.580025
 15.379315   7.3053465 10.904363  11.380254  12.177023  14.612744
 13.625031  13.709637  10.706176   9.319359  12.456134   7.945181
 11.860048 ]


In [20]:
# Write the second model's predictions as a two-column CSV: row number under
# "id", prediction under "cases".
submission2 = pd.DataFrame({"cases": test_pred2})
submission2.to_csv("xgb_min_max.csv", index_label="id")

In [21]:
# Third model: only 10 trees, a lighter L2 penalty (reg_lambda=0.1), no L1
# penalty and a learning rate of 0.29, trained on the scaled features.
xg_reg3_params = {
    "n_estimators": 10,
    "tree_method": "exact",
    "objective": 'reg:squarederror',
    "reg_alpha": 0,
    "reg_lambda": 0.1,
    "learning_rate": 0.29,
}
xg_reg3 = XGBRegressor(**xg_reg3_params)

# Cross-validated negative MSE, then a refit on all rows.
xg_scores3 = cross_val_score(
    xg_reg3,
    X_transformed,
    y_full,
    scoring='neg_mean_squared_error',
)
xg_reg3.fit(X_transformed, y_full)

# Per-fold scores, the worst fold (most negative value) and the average.
print(xg_scores3)
print(xg_scores3.min())
print(xg_scores3.mean())

[-2.1578331  -1.76797652 -1.20831347 -2.88433456 -1.29678214]
-2.8843345642089844
-1.8630479574203491


In [22]:
# Re-apply the fitted scaler to the test matrix (same computation as for the
# second model) and predict with the third model.
X_test_trans = scaler.transform(X_test)
test_pred3 = xg_reg3.predict(X_test_trans)
print (test_pred3)

[11.596227  11.862926   8.9867325  9.336996   9.054243   9.133029
 16.118488  12.109767  11.696095  12.046785  12.341949  14.52525
 12.999573   9.734283  13.316322  10.360231   9.312405  10.522708
 10.49146    9.29983   12.300067  12.271862  13.157458  10.926264
 11.807279   9.665037   6.77256    9.391162  12.090258   8.540638
  8.307623  15.383642   9.531819  11.057527  10.269643  15.410346
 12.39474   11.825399   9.872814   3.1634834 13.266973  10.514973
 13.390746  14.867813  11.277546  12.0150385 15.9481325 12.602493
 15.042811   3.3480577 11.75607   12.047779  11.880384  13.625149
 12.6772585 13.331204  10.656838   9.661016  12.351404   7.9523635
 11.764077 ]


In [23]:
# Write the third model's predictions as a two-column CSV: row number under
# "id", prediction under "cases".
submission3 = pd.DataFrame({"cases": test_pred3})
submission3.to_csv("xgb_min_max_final.csv", index_label="id")