In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import seaborn as sns

# sns.set(context='notebook', style='dark', font_scale=1.5)

# Relative path to the project root: one level above the current working directory.
PROJ_ROOT = os.pardir

In [2]:
# Processed modelling table, located relative to the project root.
data_path = os.path.join(PROJ_ROOT, 'data', 'processed', 'model_data_v2.csv')

# `new_date` is parsed into datetimes while reading.
df = pd.read_csv(data_path, parse_dates=['new_date'])

In [3]:
def test_mean_target_encoding(train, test, target, categorical, alpha=5):
    # Calculate global mean on the train data
    global_mean = train[target].mean()
    
    # Group by the categorical feature and calculate its properties
    train_groups = train.groupby(categorical)
    category_sum = train_groups[target].sum()
    category_size = train_groups.size()
    
    # Calculate smoothed mean target statistics
    train_statistics = (category_sum + global_mean * alpha) / (category_size + alpha)
    
    # Apply statistics to the test data and fill new categories
    test_feature = test[categorical].map(train_statistics).fillna(global_mean)
    return test_feature.values

def train_mean_target_encoding(train, target, categorical, alpha=5,
                               n_splits=5, random_state=123):
    """Compute out-of-fold smoothed mean-target encodings for ``train``.

    The rows of ``train`` are split into shuffled K folds. Each fold is
    encoded with statistics computed on the remaining folds only, so a row's
    own target never contributes to its encoded value.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing both the ``target`` and ``categorical`` columns.
    target : str
        Name of the target column.
    categorical : str
        Name of the categorical column to encode.
    alpha : float, default 5
        Smoothing weight forwarded to ``test_mean_target_encoding``.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.
    random_state : int, default 123
        Seed passed to ``KFold`` for the shuffled split.

    Returns
    -------
    numpy.ndarray
        One encoded value per row of ``train``, in row order.
    """
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    # Explicit float dtype: a Series built from an index alone has no float
    # default on recent pandas releases (it becomes object dtype), which would
    # make the returned feature an object array.
    train_feature = pd.Series(index=train.index, dtype=float)

    for fit_index, holdout_index in kf.split(train):
        cv_train, cv_test = train.iloc[fit_index], train.iloc[holdout_index]

        # Encode the held-out fold with statistics from the other folds and
        # store the values at the same positions.
        train_feature.iloc[holdout_index] = test_mean_target_encoding(
            cv_train, cv_test, target, categorical, alpha
        )
    return train_feature.values

def mean_target_encoding(train, test, target, categorical, alpha=5):
    """Return the ``(train_feature, test_feature)`` mean-target encodings.

    The train feature comes from ``train_mean_target_encoding`` and the test
    feature from ``test_mean_target_encoding`` fitted on the whole of
    ``train``. ``alpha`` is forwarded unchanged to both.
    """
    return (
        train_mean_target_encoding(train, target, categorical, alpha),
        test_mean_target_encoding(train, test, target, categorical, alpha),
    )

In [4]:
# Keep only the columns used from here on (column order is preserved).
df = df[
    # timestamp, coordinates and the `huc12_` / area / `za_mean` columns
    ['new_date', 'latitude', 'longitude', 'huc12_', 'areaacres', 'za_mean']
    # `lc_*` columns
    + ['lc_21', 'lc_22', 'lc_23', 'lc_24', 'lc_31', 'lc_41', 'lc_42',
       'lc_43', 'lc_52', 'lc_71', 'lc_81', 'lc_82', 'lc_90', 'lc_95']
    # calendar parts
    + ['month', 'year', 'week', 'dayofweek', 'hour', 'min', 'quarter']
    # target
    + ['tn']
    # `*_narr` columns and surface runoff
    + ['airtemp_narr', 'precip3_narr', 'humidity_narr', 'cl_cover_narr',
       'sfc_runoff', 'windspeed_narr', 'wdirection_narr', 'precip24_narr',
       'precip48_narr']
    # `of_dist` and the bins used only for the stratified split
    + ['of_dist', 'n_bins']
]

In [5]:
# Inspect how unevenly the rows are spread over the `n_bins` categories (share per bin)
df['n_bins'].value_counts(normalize=True)

(0.65, 1.0]      0.233436
(0.0, 0.56]      0.198379
(1.0, 1.5]       0.183535
(2.0, 5.0]       0.170274
(1.5, 2.0]       0.107845
(0.56, 0.65]     0.072445
(5.0, 10.0]      0.030856
(10.0, 800.0]    0.003229
Name: n_bins, dtype: float64

In [6]:
# Hold out 20% of the rows for testing. Stratifying on `n_bins` keeps the
# skewed bin proportions the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['tn']),
    df['tn'],
    test_size=.2,
    random_state=5,
    stratify=df['n_bins'],
)

In [7]:
# Sanity check: the training split should show the same `n_bins` shares as the full data set
X_train.n_bins.value_counts(normalize=True)

(0.65, 1.0]      0.233439
(0.0, 0.56]      0.198382
(1.0, 1.5]       0.183525
(2.0, 5.0]       0.170276
(1.5, 2.0]       0.107843
(0.56, 0.65]     0.072450
(5.0, 10.0]      0.030852
(10.0, 800.0]    0.003233
Name: n_bins, dtype: float64

In [8]:
# Drop the 'n_bins' feature; it is only used for stratifying the split.
# Re-assigning (instead of inplace=True) avoids mutating the frames returned by
# train_test_split, and errors='ignore' lets this cell be re-run safely.
X_train = X_train.drop(columns='n_bins', errors='ignore')
X_test = X_test.drop(columns='n_bins', errors='ignore')

In [9]:
# Temporarily attach the target to the training frame so `huc12_` can be mean-target encoded
X_train['tn'] = y_train

In [10]:
# Smoothed mean-target encoding of the `huc12_` category (alpha=10).
# Alternatives worth trying: median and quartile encoding.
X_train['huc12_enc'], X_test['huc12_enc'] = mean_target_encoding(
    train=X_train,
    test=X_test,
    target='tn',
    categorical='huc12_',
    alpha=10,
)



In [11]:
# Remove the target again now that the encoding has been computed.
# Re-assigning (instead of inplace=True) avoids mutating the split frame in
# place, and errors='ignore' lets this cell be re-run safely.
X_train = X_train.drop(columns=['tn'], errors='ignore')

In [12]:
# Preview the first rows of the training features, including `huc12_enc`
X_train.head(n=5)

Unnamed: 0,new_date,latitude,longitude,huc12_,areaacres,za_mean,lc_21,lc_22,lc_23,lc_24,...,precip3_narr,humidity_narr,cl_cover_narr,sfc_runoff,windspeed_narr,wdirection_narr,precip24_narr,precip48_narr,of_dist,huc12_enc
42671,2011-05-10 09:15:00,37.81347,-76.29467,20801010000,753224.28,11.005599,0.000386,7.3e-05,5.4e-05,1.1e-05,...,0.0,80.70566,0.0,-9.969210000000001e+36,3.904339,194.5247,9.5e-05,0.0,59.325955,0.39578
56163,2018-12-04 11:17:00,38.81092,-76.71227,20600060403,20589.41,54.853527,0.071216,0.024671,0.007918,0.001134,...,0.0,71.602455,70.0,-9.969210000000001e+36,6.727732,189.02087,0.0,0.070312,131.76988,1.589201
43828,2013-01-17 12:45:00,38.07695,-77.38472,20801050205,32789.87,57.134444,0.043331,0.012976,0.004979,0.001079,...,5.640625,92.0782,100.0,0.1875,3.616492,196.54283,12.027651,22.729729,108.098002,0.576708
27770,2015-04-15 08:45:00,38.36497,-77.97708,20801031002,18184.48,61.189946,0.055984,0.016072,0.005076,0.001345,...,0.234375,91.23186,45.0,0.0,5.541348,190.23964,18.710938,16.40625,145.411113,1.196272
11665,2009-08-18 08:37:00,38.78538,-76.71343,20600060304,26912.86,45.192859,0.142832,0.096929,0.039623,0.010932,...,0.0,92.767075,67.0,-9.969210000000001e+36,3.020032,341.23962,0.070312,4.179688,130.119706,1.025271


In [13]:
# Preview the first rows of the test features, including `huc12_enc`
X_test.head(n=5)

Unnamed: 0,new_date,latitude,longitude,huc12_,areaacres,za_mean,lc_21,lc_22,lc_23,lc_24,...,precip3_narr,humidity_narr,cl_cover_narr,sfc_runoff,windspeed_narr,wdirection_narr,precip24_narr,precip48_narr,of_dist,huc12_enc
35252,2012-04-05 09:09:00,38.8143,-76.75087,20600060304,26912.86,45.192859,0.142832,0.096929,0.039623,0.010932,...,0.0,84.00469,0.0,0.0,3.60019,195.6321,0.0,0.0,132.663532,1.016469
36227,2011-08-02 12:33:00,39.47922,-79.06802,20700020207,18042.44,42.609748,0.042726,0.007741,0.007557,0.002281,...,0.0,66.43582,0.0,0.0,5.728159,192.72514,1.226562,2.390625,240.66686,0.899548
31046,2013-04-16 10:09:00,38.55728,-76.49402,20600010000,485001.0,11.025856,7.3e-05,9.6e-05,7.8e-05,4.9e-05,...,0.0,94.35949,90.0,-9.969210000000001e+36,3.633292,338.2605,0.015625,0.250257,111.59353,0.816165
50775,2016-03-16 08:30:00,40.08232,-76.71969,20503060504,23161.58,48.263562,0.095196,0.048476,0.011733,0.00266,...,0.0,96.15192,0.0,0.0,1.366666,238.8096,0.21875,1.876276,216.839284,2.317123
48421,2019-04-16 12:00:00,42.0028,-76.635,20501050607,23502.78,48.827315,0.086385,0.03258,0.011459,0.0088,...,0.0,79.152596,0.0,0.0,4.297197,222.77917,1.320312,9.617188,347.298145,1.136891


In [None]:
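# Disabled export: uncomment the lines below to write the train/test splits as CSV files under data/processed.
# X_train.to_csv(os.path.join(PROJ_ROOT, 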
#                          'data',
#                         'processed',
#                          'x_train.csv'),index=False)
# y_train.to_csv(os.path.join(PROJ_ROOT, 
#                          'data',
#                         'processed',
#                          'y_train.csv'),index=False)

# X_test.to_csv(os.path.join(PROJ_ROOT, 
#                          'data',
#                         'processed',
#                          'x_test.csv'),index=False)
# y_test.to_csv(os.path.join(PROJ_ROOT, 
#                          'data',
#                         'processed',
#                          'y_test.csv'),index=False)