In [1]:
import pandas as pd
import numpy as np

In [2]:
weather = pd.read_csv("weather.csv")
power = pd.read_csv("power.csv")

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


### Convert all dates to datetime object

In [3]:
power["DATE"] = power.apply(lambda x: pd.Timestamp(x["Date"]), axis=1)
weather["DATE"] = weather.apply(lambda x: pd.Timestamp(x["DATE"]), axis=1)

### Remove extraneous columns

These columns do not refer to hourly information

In [4]:
weather = weather[["DATE","HourlyAltimeterSetting","HourlyDewPointTemperature","HourlyDryBulbTemperature","HourlyPrecipitation","HourlyPresentWeatherType","HourlyPressureChange","HourlyPressureTendency","HourlyRelativeHumidity","HourlySeaLevelPressure","HourlySkyConditions","HourlyStationPressure","HourlyVisibility","HourlyWetBulbTemperature","HourlyWindDirection","HourlyWindGustSpeed","HourlyWindSpeed"]]

# Clean weather data

**Considerations:**

NaN/blank cannot be assumed to be zero.

s = suspect value (appears with value)

T = trace precipitation amount or snow depth (an amount too small to measure, usually < 0.005 inches water equivalent) (appears instead of numeric value)

M = missing value (appears instead of value)

Blank = value is unreported (appears instead of value)

In [5]:
# There is a lot of NaN values in the weather data. They cannot be assumed to be zero
print(f"Size of weather: {weather.shape[0]}")
print("Number of NaN/blank values in each column:")
weather.isna().sum()

Size of weather: 13802
Number of NaN/blank values in each column:


DATE                             0
HourlyAltimeterSetting        2409
HourlyDewPointTemperature      954
HourlyDryBulbTemperature       954
HourlyPrecipitation           3626
HourlyPresentWeatherType     10014
HourlyPressureChange          9503
HourlyPressureTendency        9503
HourlyRelativeHumidity         954
HourlySeaLevelPressure        3568
HourlySkyConditions           1637
HourlyStationPressure         1216
HourlyVisibility               949
HourlyWetBulbTemperature      1216
HourlyWindDirection            968
HourlyWindGustSpeed          12095
HourlyWindSpeed                954
dtype: int64

### Remove redundant columns

HourlyAltimeterSetting: collinear with HourlyStationPressure

HourlyPressureChange, HourlyPressureTendency: missing many values, redundant with HourlyStationPressure

HourlySeaLevelPressure: a mapping of HourlyStationPressure.

HourlyWindGustSpeed: missing many values, redundant with HourlyWindSpeed

In [6]:
weather_clean = weather.drop(["HourlyAltimeterSetting", "HourlyPressureChange", "HourlyPressureTendency", "HourlySeaLevelPressure", "HourlyWindGustSpeed"],axis=1)

### Remove invalid rows

HourlyDryBulb is one of a set of 4 predictors that consistently occur together in most cases. Rows where they are missing are going to be missing crucial data, and should be removed to be interpolated later.

In [7]:
weather_clean = weather_clean[weather_clean["HourlyDryBulbTemperature"].isna() == False]

In [8]:
print(f"Size of weather: {weather_clean.shape[0]}")
print("Number of NaN/blank values in each column:")
weather_clean.isna().sum()

Size of weather: 12848
Number of NaN/blank values in each column:


DATE                            0
HourlyDewPointTemperature       0
HourlyDryBulbTemperature        0
HourlyPrecipitation          2675
HourlyPresentWeatherType     9060
HourlyRelativeHumidity          0
HourlySkyConditions           683
HourlyStationPressure         262
HourlyVisibility                0
HourlyWetBulbTemperature      262
HourlyWindDirection            19
HourlyWindSpeed                 5
dtype: int64

**Notes on remaining features**

*HourlySkyConditions*: It is likely not worth including feature in models, due to difficulty in representation. It appears to have correlation with precipitation and weather metrics, indicating it might be useful in classifying entries for interpollating missing data points.

The documentation suggests that the overall character of the sky can be classified by the topmost/final listed layer. This is how I will represent the feature.

*HourlyPresentWeatherType*: Will need to be expanded as a dummy variable, because one measurement can have multiple values. We should be sure to consider additive effects of combined weather patterns.

Each weather item is indicated to be light, moderate, and heavy. For now, I will avoid including this information. It may be worth expanding this info to additonal dummy variables if some weather type is shown to have usefulness.

*HourlyPrecipitation*: Need to convert T (trace amount) to a number. I will use 0.01, as that seems to be used interchangably in this data set.

### Reduce HourlySkyConditions
Coverage: CLR (clear sky), FEW (few clouds), SCT (scattered clouds), BKN (broken clouds), OVC (overcast), VV (obscured sky)

In [9]:
def reduceSkyConditions(conditions):
    condition = conditions.split(" ")
    if len(condition) >= 2:
        condition = condition[-2]  # Take last condition ([-1] would be height of last condition)
        condition = condition.split(":")[0]
        return str(condition)
    elif len(condition) == 1:
        if condition[0] == "CLR:00":
            return "CLR"
        else:
            return np.NaN
    else:
        return np.NaN

weather_clean["HourlySkyConditions"] = weather_clean.apply(lambda x: reduceSkyConditions(str(x["HourlySkyConditions"])), axis=1)
weather_clean["HourlySkyConditions"].unique()

array(['OVC', nan, 'SCT', 'FEW', 'BKN', 'CLR', 'VV'], dtype=object)

### Reduce present weather type

Three values given, seperated by pipes. The first (AU, automatic sensor) seems to be most valid for this data set.

In [10]:
# Identify all used AU Codes

au_codes = []

def findAUCodes(conditions):
    condition = conditions.split("|")
    if len(condition) > 1:
        condition = condition[0]
        condition = condition.split(" ")
        types = []
        for cond in condition:
            temp = cond.split(":")[0]
#            if temp[0] == '-' or temp[0] == '+':
#                temp = temp[1:]
            types.append(temp)
        return types
    else:
        return None

out = weather.apply(lambda x: findAUCodes(str(x["HourlyPresentWeatherType"])), axis=1)

for i in out:
    if i is not None:
        for j in i:
            au_codes.append(j)

set(au_codes)

{'',
 '+PL',
 '+RA',
 '+SN',
 '-DZ',
 '-FZ',
 '-PL',
 '-RA',
 '-SN',
 '-TS',
 'BC',
 'BL',
 'BR',
 'DZ',
 'FG',
 'FZ',
 'GS',
 'HZ',
 'MI',
 'PL',
 'RA',
 'SN',
 'TS',
 'VCTS'}

In [11]:
#include possible AU codes from above block (option to ignore +/-)
au_codes =  [
                'BC',
                'BL',
                'BR',
                'DZ',
                'FG',
                'FZ',
                'GS',
                'HZ',
                'MI',
                'PL',
                'RA',
                'SN',
                'TS',
                'VCTS'  # VCTS will co-occur with TS by this code, should be okay though. Can be improved with assumption that VCTS and TS are mutually exclusive
            ]

def reduceWeatherTypes(conditions):
    condition = conditions.split("|")
    if len(condition) > 1:
        condition = condition[0]
        au_dummy = []
        for au in au_codes:
            au_dummy.append(int(au in condition))
        return au_dummy
    else:
        return [0] * len(au_codes)
    
weather_type_dummies = weather_clean.apply(lambda x: reduceWeatherTypes(str(x["HourlyPresentWeatherType"])), result_type='expand', axis=1)
weather_type_dummies.columns = au_codes
# weather_clean = weather_clean.drop(["HourlyPresentWeatherType"],axis=1)  # Should be done after NaN's are dealt with
weather_clean = pd.concat([weather_clean, weather_type_dummies], axis=1)

### Replace T with number in HourlyPrecipitation

In [12]:
weather_clean.loc[weather_clean.HourlyPrecipitation == 'T', 'HourlyPrecipitation'] = 0.01

# Create hourly summary data

There's probably a ton of ways to do this. I will leverage here the observation that the measurements at 54 minute marks are the most consistently good. I will then map each of these hourly values to the nearest whole hour.

In [13]:
weather_summ = weather_clean[weather_clean["DATE"].dt.minute == 54]

In [14]:
print(f"Size of weather: {weather_summ.shape[0]}")
print("Number of NaN/blank values in each column:")
weather_summ.isna().sum()

Size of weather: 8767
Number of NaN/blank values in each column:


DATE                            0
HourlyDewPointTemperature       0
HourlyDryBulbTemperature        0
HourlyPrecipitation           182
HourlyPresentWeatherType     7038
HourlyRelativeHumidity          0
HourlySkyConditions           214
HourlyStationPressure         214
HourlyVisibility                0
HourlyWetBulbTemperature      214
HourlyWindDirection             2
HourlyWindSpeed                 2
BC                              0
BL                              0
BR                              0
DZ                              0
FG                              0
FZ                              0
GS                              0
HZ                              0
MI                              0
PL                              0
RA                              0
SN                              0
TS                              0
VCTS                            0
dtype: int64

##### Assumptions to clean summary data:

Many NaN around 2019-11-22. Maintenance? 

HourlyPrecipitation needs further investigation. Must beware T (Trace) values. Appears to be similar gaps to HourlySkyConditions

HourlySkyConditions have gaps of missing data. Perhaps use a classifier to predict for missing values.

In [15]:
# Adjust each datetime to the nearest hour ***ROUNDING DOWN***
weather_summ["DATE"] = weather_summ["DATE"].apply(lambda x: x.replace(minute=0))
#weather_summ.set_index("DATE")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
all = pd.Series(data=pd.date_range(start=weather_summ["DATE"].min(), end=weather_summ["DATE"].max(), freq='H'))
mask = all.isin(weather_summ["DATE"].values)
print(all[~mask])

733    2019-03-31 13:00:00
4494   2019-09-04 06:00:00
5637   2019-10-21 21:00:00
6145   2019-11-12 01:00:00
6397   2019-11-22 13:00:00
6399   2019-11-22 15:00:00
6668   2019-12-03 20:00:00
6669   2019-12-03 21:00:00
6837   2019-12-10 21:00:00
6838   2019-12-10 22:00:00
6839   2019-12-10 23:00:00
6848   2019-12-11 08:00:00
6854   2019-12-11 14:00:00
6855   2019-12-11 15:00:00
6907   2019-12-13 19:00:00
6908   2019-12-13 20:00:00
6932   2019-12-14 20:00:00
dtype: datetime64[ns]


We need to fill in the missing data points. This can probably be done by using the averages of the other data points nearby.

# Cleaning power data

It seems that some engines tend to show negative numbers consistently or during startup. This needs to be further investigated.

Otherwise, power data can be converted to hourly information either by an average of all measurements in an hour or by sampling at regular hour intervals. Both methods are valid and produce similar results. The biggest proportional difference occurs in the small (sub 5000 Tonnes) power ranges. Ways to calculate each and compare in Excel are given below.

There might be appliations where a max or min value over an hour will be useful as well.

```=AVERAGE(OFFSET(power!$K$2,(ROW(A1)-1)*60,,60,))```

```=OFFSET(power!$K$2,(ROW(B1)-1)*60,,1,)```

```=(B1-A1)/B1```

In [17]:
# No cleaning done yet
power_clean = power[["DATE","Total Power"]]
#power_clean = power.set_index(["DATE"])

In [18]:
# Check for time series continuity
all = pd.Series(data=pd.date_range(start=power_clean["DATE"].min(), end=power_clean["DATE"].max(), freq='H'))
mask = all.isin(power_clean["DATE"].values)
print(all[~mask])

218   2019-03-10 02:00:00
dtype: datetime64[ns]


In [19]:
# This value should be zero to be consistent with surrounding data values
missing_val = pd.DataFrame([[pd.Timestamp("2019-03-10 02:00:00"), 0]], columns=['DATE','Total Power'])
power_clean = power_clean.append(missing_val, ignore_index=True)

power_clean = power_clean.sort_values(['DATE'])
power_clean = power_clean.reset_index(drop=True)

### Convert to hourly summary

Min and Max need to be fixed, currently they match all rows with min/max values, should only match those with right time and min/max

In [20]:
# Get max hourly
power_summ_idx = power_clean.groupby([power_clean["DATE"].dt.year, power_clean["DATE"].dt.dayofyear, power_clean["DATE"].dt.hour], sort=False)['Total Power'].transform(max) == power_clean['Total Power']
power_summ_max = power_clean[power_summ_idx]
power_summ_max["DATE"] = power_summ_max["DATE"].apply(lambda x: x.replace(minute=0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [21]:
# Get min hourly
power_summ_idx = power_clean.groupby([power_clean["DATE"].dt.year, power_clean["DATE"].dt.dayofyear, power_clean["DATE"].dt.hour], sort=False)['Total Power'].transform(min) == power_clean['Total Power']
power_summ_min = power_clean[power_summ_idx]
power_summ_min["DATE"] = power_summ_min["DATE"].apply(lambda x: x.replace(minute=0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [22]:
avg_power_hourly

avg_power_hourly['Total Power']

NameError: name 'avg_power_hourly' is not defined

In [None]:
avg_power_hourly["Total Power"]

In [None]:
power_summ_avg["DATE"]

In [None]:
# Get average hourly
avg_power_hourly = power_clean.groupby([power_clean["DATE"].dt.year, power_clean["DATE"].dt.dayofyear, power_clean["DATE"].dt.hour]).mean()

power_summ_idx = power_clean['DATE'].dt.minute == 0
power_summ_avg = power_clean[power_summ_idx]

power_summ_avg['Total Power'] = avg_power_hourly['Total Power']

In [23]:
# Get regular interval hourly (on top of the hour)
power_summ_idx = power_clean['DATE'].dt.minute == 0
power_summ_reg = power_clean[power_summ_idx]

# Merge data sets

In [28]:
data = pd.merge(power_summ_reg, weather_summ, how="left", on='DATE')

In [29]:
print(f"Size of data: {data.shape[0]}")
print("Number of NaN/blank values in each column:")
data.isna().sum()

Size of data: 8786
Number of NaN/blank values in each column:


DATE                            0
Total Power                     0
HourlyDewPointTemperature      18
HourlyDryBulbTemperature       18
HourlyPrecipitation           200
HourlyPresentWeatherType     7057
HourlyRelativeHumidity         18
HourlySkyConditions           232
HourlyStationPressure         232
HourlyVisibility               18
HourlyWetBulbTemperature      232
HourlyWindDirection            20
HourlyWindSpeed                20
BC                             18
BL                             18
BR                             18
DZ                             18
FG                             18
FZ                             18
GS                             18
HZ                             18
MI                             18
PL                             18
RA                             18
SN                             18
TS                             18
VCTS                           18
dtype: int64

In [31]:
print(data[data.BC.isna()])

                    DATE   Total Power HourlyDewPointTemperature  \
733  2019-03-31 13:00:00   2067.812500                       NaN   
4494 2019-09-04 06:00:00  13808.811040                       NaN   
5637 2019-10-21 21:00:00   6226.552734                       NaN   
6146 2019-11-12 01:00:00   1920.984375                       NaN   
6398 2019-11-22 13:00:00   1272.465893                       NaN   
6400 2019-11-22 15:00:00   1337.913975                       NaN   
6669 2019-12-03 20:00:00      0.130878                       NaN   
6670 2019-12-03 21:00:00      0.069048                       NaN   
6838 2019-12-10 21:00:00   1082.128815                       NaN   
6839 2019-12-10 22:00:00   1117.183392                       NaN   
6840 2019-12-10 23:00:00   1111.797626                       NaN   
6849 2019-12-11 08:00:00   1055.895508                       NaN   
6855 2019-12-11 14:00:00   1134.415771                       NaN   
6856 2019-12-11 15:00:00   1138.051392          

In [1]:
data.to_csv("merged_data.csv")

NameError: name 'data' is not defined