In [1]:
import pandas as pd

# Echo the data catalog object to confirm it is available in this session.
# NOTE(review): `catalog` is not assigned in any visible cell — presumably it is
# injected by the Kedro notebook/IPython session; confirm before running standalone.
catalog

<kedro.io.data_catalog.DataCatalog at 0x14b778040>

In [4]:
# Load the catalog entries by name, in this order, and bind each to its own variable.
X, X_processed, X_train, y_train = (
    catalog.load(entry_name)
    for entry_name in (
        'dataset',
        'data_preprocess.X',
        'split.X_train',
        'split.y_train',
    )
)

In [5]:
# Which Python types occur among the raw PoorWeather values?
X['PoorWeather'].map(type).unique()

array([<class 'float'>, <class 'str'>], dtype=object)

In [6]:
# NOTE(review): the result shown for this cell is a str, so max() here compares
# the dates as text (lexicographically), not chronologically — for example,
# '1945-9-9' sorts after '1945-12-31'. Parse the column with pd.to_datetime first
# if the latest calendar date is what is wanted.
X['Date'].max()

'1945-9-9'

In [7]:
# Distinct station identifiers, in order of first appearance.
pd.unique(X['STA'])

array([10001, 10002, 10101, 10102, 10502, 10505, 10701, 10703, 10704,
       10705, 10718, 10802, 10803, 10804, 10806, 10807, 11101, 11103,
       11501, 11601, 11604, 11606, 11610, 11615, 11701, 11704, 11801,
       11901, 11902, 12001, 12101, 12701, 12702, 12706, 12801, 13002,
       13005, 13007, 13013, 13201, 13601, 15027, 16101, 16201, 16202,
       16405, 16407, 16504, 20501, 20602, 21501, 22502, 22504, 22507,
       22508, 22509, 30001, 31001, 31101, 31302, 31401, 31501, 31701,
       32301, 32501, 32503, 32504, 32601, 32602, 32603, 32701, 32702,
       32801, 32803, 32805, 32812, 32815, 32904, 32906, 32907, 33003,
       33013, 33014, 33019, 33023, 33031, 33044, 33109, 33111, 33117,
       33121, 33123, 33205, 33206, 33303, 33305, 33307, 33401, 33405,
       33406, 33501, 34002, 34003, 34005, 34012, 34013, 34017, 34018,
       34022, 34101, 34104, 34105, 34109, 34111, 34113, 34123, 34139,
       34147, 34150, 40601, 41204, 41207, 42204, 42206, 42401, 43201,
       43302, 43305,

In [8]:
# Distinct raw flag strings stored in the TSHDSBRSGF column.
X.loc[:, 'TSHDSBRSGF'].unique()

array([nan, '1', '1     1', '1 1', '1     1  1', '1  1', '1        1',
       '0', '1   1 1', '1    1', '1     0', '1000', '1001', '100000',
       '1000001000', '101000', '101001', '1000001001', '1000101001',
       '100001', '1000000000', '1000101000', '1010001000', '1000100000',
       '1000100001', '1010001001', '10001001', '11', '11 1', '11    1',
       '1100000', '1 1  1', '1 0  1', '1 0', '1   1 1  1', '1000000',
       '1101000', '1101001', '1   1'], dtype=object)

In [9]:
# Distinct raw Precip entries (still strings at this stage, including the 'T' marker).
pd.unique(X['Precip'])

array(['1.016', '0', '2.54', 'T', '3.556', '0.508', '0.254', '1.778',
       '3.302', '0.762', '5.588', '5.08', '2.032', '25.654', '40.132',
       '19.05', '23.876', '4.064', '12.192', '2.794', '33.274', '19.304',
       '47.752', '6.35', '3.048', '8.382', '10.16', '27.178', '30.48',
       '45.212', '10.668', '15.494', '57.658', '58.166', '11.43',
       '31.242', '17.78', '26.924', '26.67', '6.604', '5.334', '4.826',
       '7.62', '8.636', '7.874', '12.446', '90.932', '13.462', '11.938',
       '5.842', '4.318', '6.096', '9.144', '4.572', '9.906', '9.652',
       '1.524', '13.97', '17.272', '33.528', '1.27', '13.716', '8.128',
       '22.352', '39.116', '36.068', '98.044', '3.81', '14.986', '78.232',
       '146.304', '14.732', '21.844', '120.904', '42.164', '23.368',
       '21.082', '44.958', '2.286', '50.038', '17.526', '51.054', '7.112',
       '25.4', '7.366', '16.256', '46.736', '8.89', '24.638', '11.684',
       '17.018', '144.78', '18.034', '50.8', '106.426', '36.576', '53.

In [10]:
# Distinct WindGustSpd readings in the raw data.
X.loc[:, 'WindGustSpd'].unique()

array([   nan, 29.632, 59.264, 50.004, 40.744, 51.856, 44.448, 57.412,
       53.708, 42.596, 62.968, 68.524, 61.116, 46.3  , 38.892, 55.56 ,
       31.484, 37.04 , 74.08 , 64.82 , 48.152, 35.188, 27.78 , 33.336,
       20.372, 22.224, 24.076, 25.928, 75.932, 66.672, 18.52 ])

After removing columns with only a small number of existing (non-missing) values, we can see that a lot of preprocessing changes can still be made:

1. `STA` identifies a distinct meteorological station. Each station may have a different location and, hence, different weather conditions. We can handle this either by training a separate linear regression for each station or, at the very least, by one-hot encoding the station.
2. `TSHDSBRSGF` column represents a set of boolean flags, which is `Day with: Thunder; Sleet; Hail; Dust or Sand; Smoke or Haze; Blowing Snow; Rain; Snow; Glaze; Fog; 0 = No, 1 = Yes`. We will need to split it into separate columns to correctly perform regression on these columns.
3. `MIN`, `MAX`, and `MEA` are temperatures in Fahrenheit that leak the actual temperature, so we remove them from the training dataset.
4. The date can be quite informative for a regression model. We don't want to use the YYYYMMDD format, because it may introduce bias into the model (for instance, around the end and the start of a year). Instead, we will use the KSP format.
5. The precipitation column contains the value 'T', which means that there was a trace of precipitation, but it wasn't measurable. We want a numerical encoding of the precipitation, so we will do the following:
   1. Replace 'T' with 0
   2. Create a new boolean feature column for 'T'
In addition to this, `PRCP` is the same as `Precip`, but on a different scale, so we drop it.
6. The same logic as in the previous point applies to the `Snowfall` and `SNF` columns (`Snowfall` is kept, `SNF` is dropped).
7. `PoorWeather` is the same as `TSHDSBRSGF`, so we will drop it.
8. `WindGustSpd` is the same as `SPD`, so we drop `SPD` and keep `WindGustSpd`.
9. All other missing values will be filled with zero.

After encoding, there are no object values

In [11]:
# Preview the first 20 columns of the training features (selected by position).
X_train.iloc[:, :20]

Unnamed: 0,PGT,DR,WindGustSpd,Snowfall,Date,is_traceable_precip,is_traceable_snow,thunder,sleet,hail,dust,smoke,blowing_snow,rain,snow,fog,is_observed,STA_10001,STA_10002,STA_10101
11496,0.0,0.0,0.0,0.0,2.808219,False,False,True,False,False,False,False,False,True,False,False,True,0,0,0
55335,0.0,0.0,0.0,0.0,2.287671,True,False,False,False,False,False,False,False,False,False,False,False,0,0,0
33325,0.0,0.0,0.0,0.0,4.535519,False,False,False,False,False,False,False,False,False,False,False,False,0,0,0
93352,0.0,0.0,0.0,0.0,4.226776,False,False,False,False,False,False,False,False,False,False,False,False,0,0,0
58125,0.0,0.0,0.0,0.0,0.680328,False,False,False,False,False,False,False,False,False,False,False,False,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76820,0.0,0.0,0.0,0.0,3.830137,False,False,False,False,False,False,False,False,False,False,False,False,0,0,0
110268,0.0,0.0,0.0,0.0,5.123288,False,False,False,False,False,False,False,False,False,False,False,False,0,0,0
103694,0.0,0.0,0.0,0.0,4.193989,False,False,False,False,False,False,False,False,False,False,False,False,0,0,0
860,0.0,0.0,0.0,0.0,4.855191,False,False,False,False,False,False,False,False,False,False,False,False,1,0,0


In [18]:
# Keep only the rows where at least one of the three columns is non-zero.
X_processed[['WindGustSpd', 'DR', 'PGT']].loc[lambda frame: frame.ne(0).any(axis=1)]

Unnamed: 0,WindGustSpd,DR,PGT
51177,29.632,32.0,10.2
51178,59.264,44.0,10.2
51179,50.004,32.0,1.1
51180,40.744,32.0,10.4
51181,51.856,54.0,11.6
...,...,...,...
64042,31.484,11.0,11.5
64043,27.780,9.0,14.5
64044,33.336,9.0,5.5
64045,29.632,9.0,9.0
