In [31]:
import pandas as pd
import numpy as np

In [83]:
# Load the dataset whose missing values are imputed in this notebook.
# No index_col is given, so the 'Unnamed: 0' column stored in the file is
# read in as an ordinary data column.
impute_df = pd.read_csv('impute_df.csv')

In [84]:
# List the column names of the loaded frame.
impute_df.columns

Index(['Unnamed: 0', 'id', 'date', 'store_nbr', 'family', 'sales',
       'onpromotion', 'data_id', 'dcoilwtico', 'city', 'state', 'type',
       'cluster', 'transactions', 'month', 'day', 'year', 'was_holiday'],
      dtype='object')

In [85]:
# Separate the rows with a known oil price (train) from the rows where it is
# missing (test), keeping only the columns used for the oil-price imputation.

feature_cols = ['id','city', 'state','dcoilwtico', 'type','store_nbr', 'family',
                 'cluster','month', 'day', 'year']

# Compute the missing-value mask once and reuse it for both subsets.
oil_missing = impute_df['dcoilwtico'].isna()

# Rows and columns are selected in a single .loc call and copied explicitly, so
# train and test are independent frames. The predictions assigned to
# test['dcoilwtico'] later are then written to a real copy instead of to the
# result of chained indexing on impute_df.
train = impute_df.loc[~oil_missing, feature_cols].copy()
test = impute_df.loc[oil_missing, feature_cols].copy()


In [86]:
# Split the oil-price training rows into target and model inputs, and build the
# matching inputs for the rows to impute. 'id' is dropped together with the
# target, so it is not used as a feature.

y_train = train['dcoilwtico']
X_train = train.drop(columns=['id', 'dcoilwtico'])

X_test = test.drop(columns=['id', 'dcoilwtico'])

In [87]:
# Display the feature matrix for the rows whose oil price is missing.
X_test

Unnamed: 0,city,state,type,store_nbr,family,cluster,month,day,year
0,Quito,Pichincha,D,1,AUTOMOTIVE,13,1,1,2013
1,Quito,Pichincha,D,1,BABY CARE,13,1,1,2013
2,Quito,Pichincha,D,1,BEAUTY,13,1,1,2013
3,Quito,Pichincha,D,1,BEVERAGES,13,1,1,2013
4,Quito,Pichincha,D,1,BOOKS,13,1,1,2013
...,...,...,...,...,...,...,...,...,...
3022267,Quito,Pichincha,B,9,POULTRY,6,8,27,2017
3022268,Quito,Pichincha,B,9,PREPARED FOODS,6,8,27,2017
3022269,Quito,Pichincha,B,9,PRODUCE,6,8,27,2017
3022270,Quito,Pichincha,B,9,SCHOOL AND OFFICE SUPPLIES,6,8,27,2017


In [71]:
categorical_columns = ['city','state','type','family']

# Cast the string-valued columns of both feature frames to pandas' category
# dtype; each frame is converted on its own.
for features in (X_train, X_test):
    for col in categorical_columns:
        features[col] = features[col].astype('category')

In [72]:
# Use LightGBM to predict oil prices for the rows where the value is missing.
import lightgbm as lgb


cat_features = ['city','state','type','family']

# Wrap the training features and the oil-price target in a LightGBM Dataset,
# declaring which columns are categorical.
model_data = lgb.Dataset(
    data=X_train,
    label=y_train,
    categorical_feature=cat_features,
)


# Hyperparameters for the gradient-boosted regression model
params = dict(
    objective='regression',
    metric='mse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

# Train for a fixed number of boosting rounds
num_round = 100
lgb_model = lgb.train(
    params=params,
    train_set=model_data,
    num_boost_round=num_round,
)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016180 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 2093850, number of used features: 9
[LightGBM] [Info] Start training from score 67.714366


In [73]:
# Predict an oil price for every row of X_test (the rows where 'dcoilwtico'
# is missing) with the model trained on the rows that have a value.
oil_values = lgb_model.predict(X_test)

In [92]:
# Wrap the predictions in a Series for inspection. No index is passed, so the
# Series gets a default 0..n-1 index rather than the row labels of `test`.
oil_vals_ser = pd.Series(oil_values)

# Show the predicted oil prices.
oil_vals_ser


0         95.162954
1         95.162954
2         95.162954
3         95.162954
4         95.162954
            ...    
935545    48.637495
935546    48.637495
935547    48.637495
935548    48.637495
935549    48.637495
Length: 935550, dtype: float64

In [110]:
# Fill the missing oil prices in `test` with the predictions. `.values` hands
# over a plain array, so the assignment is positional: the Series has its own
# 0..n-1 index, whereas `test` keeps the row labels it had in impute_df.
test['dcoilwtico'] = oil_vals_ser.values

In [112]:
# Display `test` with the predicted oil prices filled in.
test

Unnamed: 0,id,city,state,dcoilwtico,type,store_nbr,family,cluster,month,day,year
0,0,Quito,Pichincha,95.162954,D,1,AUTOMOTIVE,13,1,1,2013
1,1,Quito,Pichincha,95.162954,D,1,BABY CARE,13,1,1,2013
2,2,Quito,Pichincha,95.162954,D,1,BEAUTY,13,1,1,2013
3,3,Quito,Pichincha,95.162954,D,1,BEVERAGES,13,1,1,2013
4,4,Quito,Pichincha,95.162954,D,1,BOOKS,13,1,1,2013
...,...,...,...,...,...,...,...,...,...,...,...
3022267,3022267,Quito,Pichincha,48.637495,B,9,POULTRY,6,8,27,2017
3022268,3022268,Quito,Pichincha,48.637495,B,9,PREPARED FOODS,6,8,27,2017
3022269,3022269,Quito,Pichincha,48.637495,B,9,PRODUCE,6,8,27,2017
3022270,3022270,Quito,Pichincha,48.637495,B,9,SCHOOL AND OFFICE SUPPLIES,6,8,27,2017


In [113]:
# Display impute_df before the predicted oil prices are written back.
impute_df

Unnamed: 0.1,Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,data_id,dcoilwtico,city,state,type,cluster,transactions,month,day,year,was_holiday
0,0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,df1,,Quito,Pichincha,D,13,,1,1,2013,False
1,1,1,2013-01-01,1,BABY CARE,0.0,0,df1,,Quito,Pichincha,D,13,,1,1,2013,False
2,2,2,2013-01-01,1,BEAUTY,0.0,0,df1,,Quito,Pichincha,D,13,,1,1,2013,False
3,3,3,2013-01-01,1,BEVERAGES,0.0,0,df1,,Quito,Pichincha,D,13,,1,1,2013,False
4,4,4,2013-01-01,1,BOOKS,0.0,0,df1,,Quito,Pichincha,D,13,,1,1,2013,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3029395,3029395,3029395,2017-08-31,9,POULTRY,,1,df2,47.26,Quito,Pichincha,B,6,,8,31,2017,False
3029396,3029396,3029396,2017-08-31,9,PREPARED FOODS,,0,df2,47.26,Quito,Pichincha,B,6,,8,31,2017,False
3029397,3029397,3029397,2017-08-31,9,PRODUCE,,1,df2,47.26,Quito,Pichincha,B,6,,8,31,2017,False
3029398,3029398,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,,9,df2,47.26,Quito,Pichincha,B,6,,8,31,2017,False


In [114]:
# Write the imputed oil prices back into impute_df. update() aligns on the row
# index and modifies impute_df in place. Passing only the 'dcoilwtico' column
# avoids rewriting the other columns shared with `test` (id, city, state, ...),
# which were sliced from impute_df and never changed.
impute_df.update(test[['dcoilwtico']])

In [116]:
# Count the missing values in each column after the oil-price update.
impute_df.isna().sum()

Unnamed: 0           0
id                   0
date                 0
store_nbr            0
family               0
sales            28512
onpromotion          0
data_id              0
dcoilwtico           0
city                 0
state                0
type                 0
cluster              0
transactions    274296
month                0
day                  0
year                 0
was_holiday          0
dtype: int64

In [117]:
# Display impute_df after the oil-price update.
impute_df

Unnamed: 0.1,Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,data_id,dcoilwtico,city,state,type,cluster,transactions,month,day,year,was_holiday
0,0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,df1,95.162954,Quito,Pichincha,D,13,,1,1,2013,False
1,1,1,2013-01-01,1,BABY CARE,0.0,0,df1,95.162954,Quito,Pichincha,D,13,,1,1,2013,False
2,2,2,2013-01-01,1,BEAUTY,0.0,0,df1,95.162954,Quito,Pichincha,D,13,,1,1,2013,False
3,3,3,2013-01-01,1,BEVERAGES,0.0,0,df1,95.162954,Quito,Pichincha,D,13,,1,1,2013,False
4,4,4,2013-01-01,1,BOOKS,0.0,0,df1,95.162954,Quito,Pichincha,D,13,,1,1,2013,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3029395,3029395,3029395,2017-08-31,9,POULTRY,,1,df2,47.260000,Quito,Pichincha,B,6,,8,31,2017,False
3029396,3029396,3029396,2017-08-31,9,PREPARED FOODS,,0,df2,47.260000,Quito,Pichincha,B,6,,8,31,2017,False
3029397,3029397,3029397,2017-08-31,9,PRODUCE,,1,df2,47.260000,Quito,Pichincha,B,6,,8,31,2017,False
3029398,3029398,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,,9,df2,47.260000,Quito,Pichincha,B,6,,8,31,2017,False


In [118]:
# Separate the rows with a known transaction count (train) from the rows where
# it is missing (test), keeping only the columns used for the transactions
# imputation. 'dcoilwtico' is kept as a column here.

feature_cols = ['id','city', 'state','dcoilwtico', 'type','store_nbr', 'family',
                 'cluster','month', 'day', 'year', 'transactions']

# Compute the missing-value mask once and reuse it for both subsets.
transactions_missing = impute_df['transactions'].isna()

# Rows and columns are selected in a single .loc call and copied explicitly, so
# train and test are independent frames. The predictions assigned to
# test['transactions'] later are then written to a real copy instead of to the
# result of chained indexing on impute_df.
train = impute_df.loc[~transactions_missing, feature_cols].copy()
test = impute_df.loc[transactions_missing, feature_cols].copy()

In [119]:
# Split the transactions training rows into target and model inputs, and build
# the matching inputs for the rows to impute. 'id' is dropped together with the
# target, so it is not used as a feature.

y_train = train['transactions']
X_train = train.drop(columns=['id', 'transactions'])

X_test = test.drop(columns=['id', 'transactions'])

In [120]:
# Convert the string columns to pandas' categorical dtype

categorical_columns = ['city','state','type','family']

# Both feature frames get the same treatment; each is converted on its own.
for features in (X_train, X_test):
    for col in categorical_columns:
        features[col] = features[col].astype('category')

In [121]:
# Build a LightGBM model to predict the missing transaction values

cat_features = ['city','state','type','family']

# Wrap the training features and the transactions target in a LightGBM
# Dataset, declaring which columns are categorical.
model_data = lgb.Dataset(
    data=X_train,
    label=y_train,
    categorical_feature=cat_features,
)


# Hyperparameters for the gradient-boosted regression model
params = dict(
    objective='regression',
    metric='mse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

# Train for a fixed number of boosting rounds
num_round = 100
lgb_model2 = lgb.train(
    params=params,
    train_set=model_data,
    num_boost_round=num_round,
)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026389 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 457
[LightGBM] [Info] Number of data points in the train set: 2755104, number of used features: 10
[LightGBM] [Info] Start training from score 1694.602158


In [123]:
# Predict a transaction count for every row of X_test (the rows where
# 'transactions' is missing).
trans_preds = lgb_model2.predict(X_test)

# Wrap the predictions in a Series; no index is passed, so it gets a default
# 0..n-1 index rather than the row labels of `test`.
trans_pred_ser = pd.Series(trans_preds)

In [124]:
# Show the predicted transaction counts.
trans_pred_ser

0         1620.242893
1         1620.242893
2         1620.242893
3         1620.242893
4         1620.242893
             ...     
274291    2028.114435
274292    2028.114435
274293    2028.114435
274294    2028.114435
274295    2028.114435
Length: 274296, dtype: float64

In [126]:
# Fill the missing transaction counts in `test` with the predictions. `.values`
# hands over a plain array, so the assignment is positional: the Series has its
# own 0..n-1 index, whereas `test` keeps the row labels it had in impute_df.
test['transactions'] = trans_pred_ser.values

In [127]:
# Display `test` with the predicted transaction counts filled in.
test

Unnamed: 0,id,city,state,dcoilwtico,type,store_nbr,family,cluster,month,day,year,transactions
0,0,Quito,Pichincha,95.162954,D,1,AUTOMOTIVE,13,1,1,2013,1620.242893
1,1,Quito,Pichincha,95.162954,D,1,BABY CARE,13,1,1,2013,1620.242893
2,2,Quito,Pichincha,95.162954,D,1,BEAUTY,13,1,1,2013,1620.242893
3,3,Quito,Pichincha,95.162954,D,1,BEVERAGES,13,1,1,2013,1620.242893
4,4,Quito,Pichincha,95.162954,D,1,BOOKS,13,1,1,2013,1620.242893
...,...,...,...,...,...,...,...,...,...,...,...,...
3029395,3029395,Quito,Pichincha,47.260000,B,9,POULTRY,6,8,31,2017,2028.114435
3029396,3029396,Quito,Pichincha,47.260000,B,9,PREPARED FOODS,6,8,31,2017,2028.114435
3029397,3029397,Quito,Pichincha,47.260000,B,9,PRODUCE,6,8,31,2017,2028.114435
3029398,3029398,Quito,Pichincha,47.260000,B,9,SCHOOL AND OFFICE SUPPLIES,6,8,31,2017,2028.114435


In [128]:
# Write the imputed transaction counts back into impute_df. update() aligns on
# the row index and modifies impute_df in place. Passing only the
# 'transactions' column avoids rewriting the other columns shared with `test`
# (id, city, state, dcoilwtico, ...), which were sliced from impute_df and
# never changed.
impute_df.update(test[['transactions']])

In [131]:
# Count the missing values in each column after the transactions update.
impute_df.isna().sum()

Unnamed: 0          0
id                  0
date                0
store_nbr           0
family              0
sales           28512
onpromotion         0
data_id             0
dcoilwtico          0
city                0
state               0
type                0
cluster             0
transactions        0
month               0
day                 0
year                0
was_holiday         0
dtype: int64

In [134]:
# Take a copy of the imputed frame under a new name, so complete_df is a
# separate object from impute_df.
complete_df = impute_df.copy()

In [135]:
# Display the completed frame.
complete_df

Unnamed: 0.1,Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,data_id,dcoilwtico,city,state,type,cluster,transactions,month,day,year,was_holiday
0,0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,df1,95.162954,Quito,Pichincha,D,13,1620.242893,1,1,2013,False
1,1,1,2013-01-01,1,BABY CARE,0.0,0,df1,95.162954,Quito,Pichincha,D,13,1620.242893,1,1,2013,False
2,2,2,2013-01-01,1,BEAUTY,0.0,0,df1,95.162954,Quito,Pichincha,D,13,1620.242893,1,1,2013,False
3,3,3,2013-01-01,1,BEVERAGES,0.0,0,df1,95.162954,Quito,Pichincha,D,13,1620.242893,1,1,2013,False
4,4,4,2013-01-01,1,BOOKS,0.0,0,df1,95.162954,Quito,Pichincha,D,13,1620.242893,1,1,2013,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3029395,3029395,3029395,2017-08-31,9,POULTRY,,1,df2,47.260000,Quito,Pichincha,B,6,2028.114435,8,31,2017,False
3029396,3029396,3029396,2017-08-31,9,PREPARED FOODS,,0,df2,47.260000,Quito,Pichincha,B,6,2028.114435,8,31,2017,False
3029397,3029397,3029397,2017-08-31,9,PRODUCE,,1,df2,47.260000,Quito,Pichincha,B,6,2028.114435,8,31,2017,False
3029398,3029398,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,,9,df2,47.260000,Quito,Pichincha,B,6,2028.114435,8,31,2017,False


In [132]:
# Persist the imputed frame. index=False keeps the row index out of the file;
# written with the index, every save/reload cycle adds another unnamed column
# (the input CSV already carries one such 'Unnamed: 0' column).
complete_df.to_csv('cleaned_df.csv', index=False)