# * A Fine Windy Day

## Feature Engineering

There is no need to split the data into train and test sets as they are already provided.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the provided training split; the path is relative to the notebook's working directory.
dataset=pd.read_csv('train.csv')

In [3]:
# Preview the first rows of the freshly loaded training data.
dataset.head()

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m),windmill_generated_power(kW/h)
0,WM_33725,2019-08-04 14:33:20,94.820023,-99.0,41.723019,-0.903423,82.410573,42.523015,2563.124522,76.66556,...,,239.836388,2730.310605,42.084666,BA,Medium,2.217542,0.314065,24.281689,6.766521
1,WM_698,2018-11-05 10:13:20,241.832734,27.764785,-99.0,-99.0,44.104919,46.25887,2372.384119,78.129803,...,,337.944723,1780.2072,107.888643,A2,Medium,4.210346,0.448494,27.262139,5.966275
2,WM_39146,2019-09-14 14:03:20,95.484724,,41.855473,12.652763,42.322098,42.878552,1657.169646,67.654469,...,45.033197,227.850294,1666.0499,-42.931459,ABC,Medium,2.719475,0.302321,27.366127,2.874342
3,WM_6757,2018-12-25 15:33:20,238.819424,-99.0,45.443914,15.115323,44.759643,47.282101,2888.134079,95.389974,...,44.827154,492.08152,1964.502895,42.744596,ABC,,4.857385,0.36714,24.287767,14.851089
4,WM_21521,2019-05-04 03:13:20,10.72289,,41.981183,1.715696,-17.616459,43.469852,781.695419,37.423065,...,-99.0,259.274601,1177.516152,13.387289,AAA,Medium,,0.453374,27.97165,3.519074


In [4]:
# Collect every categorical (object-dtype) column that has at least one missing value.
# `.any()` is used instead of a `> 1` count so that a column with exactly one NaN
# is not silently skipped. The identifier and timestamp columns are excluded
# because they are not treated as categorical features.
cat_feat_nan = [
    feature
    for feature in dataset.columns
    if feature not in ['tracking_id', 'datetime']
    and dataset[feature].dtype == 'O'
    and dataset[feature].isnull().any()
]

In [5]:
# List the categorical columns selected by the NaN filter in the previous cell.
cat_feat_nan

['turbine_status', 'cloud_level']

In [6]:
# Treat "missing" as a category of its own: every NaN in the flagged
# categorical columns becomes the literal label 'Missing'.
categorical_filled = dataset[cat_feat_nan].fillna('Missing')
dataset[cat_feat_nan] = categorical_filled

In [7]:
# Preview the data after categorical NaNs were replaced with the 'Missing' label.
dataset.head()

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m),windmill_generated_power(kW/h)
0,WM_33725,2019-08-04 14:33:20,94.820023,-99.0,41.723019,-0.903423,82.410573,42.523015,2563.124522,76.66556,...,,239.836388,2730.310605,42.084666,BA,Medium,2.217542,0.314065,24.281689,6.766521
1,WM_698,2018-11-05 10:13:20,241.832734,27.764785,-99.0,-99.0,44.104919,46.25887,2372.384119,78.129803,...,,337.944723,1780.2072,107.888643,A2,Medium,4.210346,0.448494,27.262139,5.966275
2,WM_39146,2019-09-14 14:03:20,95.484724,,41.855473,12.652763,42.322098,42.878552,1657.169646,67.654469,...,45.033197,227.850294,1666.0499,-42.931459,ABC,Medium,2.719475,0.302321,27.366127,2.874342
3,WM_6757,2018-12-25 15:33:20,238.819424,-99.0,45.443914,15.115323,44.759643,47.282101,2888.134079,95.389974,...,44.827154,492.08152,1964.502895,42.744596,ABC,Missing,4.857385,0.36714,24.287767,14.851089
4,WM_21521,2019-05-04 03:13:20,10.72289,,41.981183,1.715696,-17.616459,43.469852,781.695419,37.423065,...,-99.0,259.274601,1177.516152,13.387289,AAA,Medium,,0.453374,27.97165,3.519074


In [8]:
# Sanity check: count the NaNs remaining in the categorical columns that were just filled.
dataset[cat_feat_nan].isnull().sum()

turbine_status    0
cloud_level       0
dtype: int64

In [9]:
# Collect every numerical (non-object dtype) column that has at least one missing value.
# `.any()` is used instead of a `> 1` count so that a column with exactly one NaN
# is not silently skipped and left un-imputed.
# NOTE(review): the resulting list includes 'windmill_generated_power(kW/h)', so rows
# with a missing value in that column get a median-imputed value in the next cell.
# If that column is the prediction target, confirm this is intended rather than
# dropping those rows.
numerical_feat_na = [
    feature
    for feature in dataset.columns
    if dataset[feature].dtype != 'O' and dataset[feature].isnull().any()
]

In [10]:
# List the numerical columns selected by the NaN filter in the previous cell.
numerical_feat_na

['wind_speed(m/s)',
 'atmospheric_temperature(°C)',
 'shaft_temperature(°C)',
 'blades_angle(°)',
 'engine_temperature(°C)',
 'motor_torque(N-m)',
 'generator_temperature(°C)',
 'atmospheric_pressure(Pascal)',
 'windmill_body_temperature(°C)',
 'wind_direction(°)',
 'rotor_torque(N-m)',
 'blade_length(m)',
 'windmill_height(m)',
 'windmill_generated_power(kW/h)']

In [11]:
# Impute each flagged numerical column with its own median.
# The filled Series is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on `dataset[feature]`: that chained in-place form
# acts on an intermediate Series, emits a FutureWarning in pandas 2.2+ and leaves
# `dataset` unchanged once Copy-on-Write is enabled.
# NOTE(review): the previews show -99.0 in several sensor columns, which looks like
# a missing-value sentinel; if so it is included in these medians — confirm.
for feature in numerical_feat_na:
    median_value = dataset[feature].median()
    dataset[feature] = dataset[feature].fillna(median_value)

In [12]:
# Sanity check: count the NaNs remaining in the numerical columns that were just imputed.
dataset[numerical_feat_na].isnull().sum()

wind_speed(m/s)                   0
atmospheric_temperature(°C)       0
shaft_temperature(°C)             0
blades_angle(°)                   0
engine_temperature(°C)            0
motor_torque(N-m)                 0
generator_temperature(°C)         0
atmospheric_pressure(Pascal)      0
windmill_body_temperature(°C)     0
wind_direction(°)                 0
rotor_torque(N-m)                 0
blade_length(m)                   0
windmill_height(m)                0
windmill_generated_power(kW/h)    0
dtype: int64

In [13]:
# Preview the data after median imputation of the numerical columns.
dataset.head()

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m),windmill_generated_power(kW/h)
0,WM_33725,2019-08-04 14:33:20,94.820023,-99.0,41.723019,-0.903423,82.410573,42.523015,2563.124522,76.66556,...,42.786832,239.836388,2730.310605,42.084666,BA,Medium,2.217542,0.314065,24.281689,6.766521
1,WM_698,2018-11-05 10:13:20,241.832734,27.764785,-99.0,-99.0,44.104919,46.25887,2372.384119,78.129803,...,42.786832,337.944723,1780.2072,107.888643,A2,Medium,4.210346,0.448494,27.262139,5.966275
2,WM_39146,2019-09-14 14:03:20,95.484724,16.10241,41.855473,12.652763,42.322098,42.878552,1657.169646,67.654469,...,45.033197,227.850294,1666.0499,-42.931459,ABC,Medium,2.719475,0.302321,27.366127,2.874342
3,WM_6757,2018-12-25 15:33:20,238.819424,-99.0,45.443914,15.115323,44.759643,47.282101,2888.134079,95.389974,...,44.827154,492.08152,1964.502895,42.744596,ABC,Missing,4.857385,0.36714,24.287767,14.851089
4,WM_21521,2019-05-04 03:13:20,10.72289,16.10241,41.981183,1.715696,-17.616459,43.469852,781.695419,37.423065,...,-99.0,259.274601,1177.516152,13.387289,AAA,Medium,3.453333,0.453374,27.97165,3.519074


In [14]:
def encode_and_bind(original_dataframe, feature_to_encode):
    """Return a copy of the frame with one column replaced by its one-hot indicators.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    feature_to_encode : str
        Name of the column to expand with ``pd.get_dummies``.

    Returns
    -------
    pandas.DataFrame
        All original columns except ``feature_to_encode``, followed by the
        indicator columns produced by ``pd.get_dummies`` for that column.
    """
    indicator_columns = pd.get_dummies(original_dataframe[[feature_to_encode]])
    # Append the indicators first, then remove the source column by name.
    widened = pd.concat([original_dataframe, indicator_columns], axis=1)
    return widened.drop(columns=[feature_to_encode])

In [15]:
# One-hot encode each flagged categorical column, rebinding `dataset` to the widened frame.
# Not re-runnable as-is: encode_and_bind drops the source column, so a second run
# fails when it looks that column up again.
for feature in cat_feat_nan:
    dataset=encode_and_bind(dataset,feature)

In [16]:
# Preview the data after one-hot encoding the categorical columns.
dataset.head()

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,turbine_status_BB,turbine_status_BBB,turbine_status_BCB,turbine_status_BD,turbine_status_D,turbine_status_Missing,cloud_level_Extremely Low,cloud_level_Low,cloud_level_Medium,cloud_level_Missing
0,WM_33725,2019-08-04 14:33:20,94.820023,-99.0,41.723019,-0.903423,82.410573,42.523015,2563.124522,76.66556,...,0,0,0,0,0,0,0,0,1,0
1,WM_698,2018-11-05 10:13:20,241.832734,27.764785,-99.0,-99.0,44.104919,46.25887,2372.384119,78.129803,...,0,0,0,0,0,0,0,0,1,0
2,WM_39146,2019-09-14 14:03:20,95.484724,16.10241,41.855473,12.652763,42.322098,42.878552,1657.169646,67.654469,...,0,0,0,0,0,0,0,0,1,0
3,WM_6757,2018-12-25 15:33:20,238.819424,-99.0,45.443914,15.115323,44.759643,47.282101,2888.134079,95.389974,...,0,0,0,0,0,0,0,0,0,1
4,WM_21521,2019-05-04 03:13:20,10.72289,16.10241,41.981183,1.715696,-17.616459,43.469852,781.695419,37.423065,...,0,0,0,0,0,0,0,0,1,0


In [17]:
# List every column now present, including the generated indicator columns.
dataset.columns

Index(['tracking_id', 'datetime', 'wind_speed(m/s)',
       'atmospheric_temperature(°C)', 'shaft_temperature(°C)',
       'blades_angle(°)', 'gearbox_temperature(°C)', 'engine_temperature(°C)',
       'motor_torque(N-m)', 'generator_temperature(°C)',
       'atmospheric_pressure(Pascal)', 'area_temperature(°C)',
       'windmill_body_temperature(°C)', 'wind_direction(°)', 'resistance(ohm)',
       'rotor_torque(N-m)', 'blade_length(m)', 'blade_breadth(m)',
       'windmill_height(m)', 'windmill_generated_power(kW/h)',
       'turbine_status_A', 'turbine_status_A2', 'turbine_status_AAA',
       'turbine_status_AB', 'turbine_status_ABC', 'turbine_status_AC',
       'turbine_status_B', 'turbine_status_B2', 'turbine_status_BA',
       'turbine_status_BB', 'turbine_status_BBB', 'turbine_status_BCB',
       'turbine_status_BD', 'turbine_status_D', 'turbine_status_Missing',
       'cloud_level_Extremely Low', 'cloud_level_Low', 'cloud_level_Medium',
       'cloud_level_Missing'],
      dtype

## Feature Scaling

In [18]:
# Everything except the identifier, the timestamp and the generated-power column
# is passed to the scaler; column order follows `dataset.columns`.
unscaled_columns = ['tracking_id', 'datetime', 'windmill_generated_power(kW/h)']
feature_to_scale = [column for column in dataset.columns if column not in unscaled_columns]

In [19]:
from sklearn.preprocessing import StandardScaler

# Columns carried through unscaled: identifier, timestamp and generated power
# (presumably the prediction target — confirm).
passthrough_columns = ['tracking_id', 'datetime', 'windmill_generated_power(kW/h)']

# Fit the scaler on the feature columns and transform them in one step.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(dataset[feature_to_scale])
scaled_features = pd.DataFrame(scaled_values, columns=feature_to_scale)

# Reset the index on the passthrough part so both halves align row-for-row
# with the fresh RangeIndex of `scaled_features`.
passthrough = dataset[passthrough_columns].reset_index(drop=True)
data = pd.concat([passthrough, scaled_features], axis=1)

In [20]:
# Preview the combined frame: unscaled id/datetime/power columns followed by the scaled features.
data.head()

Unnamed: 0,tracking_id,datetime,windmill_generated_power(kW/h),wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),...,turbine_status_BB,turbine_status_BBB,turbine_status_BCB,turbine_status_BD,turbine_status_D,turbine_status_Missing,cloud_level_Extremely Low,cloud_level_Low,cloud_level_Medium,cloud_level_Missing
0,WM_33725,2019-08-04 14:33:20,6.766521,0.336418,-2.423642,0.060191,0.181827,0.947781,-0.014961,1.030403,...,-0.272254,-0.26274,-0.271276,-0.264432,-0.270446,-0.257925,-0.10352,-0.987385,1.028491,-0.099418
1,WM_698,2018-11-05 10:13:20,5.966275,2.27229,0.609053,-5.112879,-1.872976,0.070476,0.595157,0.799731,...,-0.272254,-0.26274,-0.271276,-0.264432,-0.270446,-0.257925,-0.10352,-0.987385,1.028491,-0.099418
2,WM_39146,2019-09-14 14:03:20,2.874342,0.345171,0.330044,0.06506,0.465785,0.029644,0.043103,-0.065212,...,-0.272254,-0.26274,-0.271276,-0.264432,-0.270446,-0.257925,-0.10352,-0.987385,1.028491,-0.099418
3,WM_6757,2018-12-25 15:33:20,14.851089,2.23261,-2.423642,0.196974,0.517368,0.085471,0.762265,1.423452,...,-0.272254,-0.26274,-0.271276,-0.264432,-0.270446,-0.257925,-0.10352,-0.987385,-0.972298,10.058524
4,WM_21521,2019-05-04 03:13:20,3.519074,-0.770978,0.330044,0.069681,0.23669,-1.343115,0.139671,-1.123965,...,-0.272254,-0.26274,-0.271276,-0.264432,-0.270446,-0.257925,-0.10352,-0.987385,1.028491,-0.099418


In [22]:
# Drop the identifier and timestamp columns, leaving the generated-power column
# and the scaled features.
# Rebinding `data` (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell safe to re-run: a second execution is a no-op, not a KeyError.
data = data.drop(columns=['tracking_id', 'datetime'], errors='ignore')

In [24]:
# Preview the final frame after dropping the identifier columns.
data.head()

Unnamed: 0,windmill_generated_power(kW/h),wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),atmospheric_pressure(Pascal),...,turbine_status_BB,turbine_status_BBB,turbine_status_BCB,turbine_status_BD,turbine_status_D,turbine_status_Missing,cloud_level_Extremely Low,cloud_level_Low,cloud_level_Medium,cloud_level_Missing
0,6.766521,0.336418,-2.423642,0.060191,0.181827,0.947781,-0.014961,1.030403,0.587276,0.300032,...,-0.272254,-0.26274,-0.271276,-0.264432,-0.270446,-0.257925,-0.10352,-0.987385,1.028491,-0.099418
1,5.966275,2.27229,0.609053,-5.112879,-1.872976,0.070476,0.595157,0.799731,0.661182,-0.183652,...,-0.272254,-0.26274,-0.271276,-0.264432,-0.270446,-0.257925,-0.10352,-0.987385,1.028491,-0.099418
2,2.874342,0.345171,0.330044,0.06506,0.465785,0.029644,0.043103,-0.065212,0.132452,-0.18872,...,-0.272254,-0.26274,-0.271276,-0.264432,-0.270446,-0.257925,-0.10352,-0.987385,1.028491,-0.099418
3,14.851089,2.23261,-2.423642,0.196974,0.517368,0.085471,0.762265,1.423452,1.532367,-0.174363,...,-0.272254,-0.26274,-0.271276,-0.264432,-0.270446,-0.257925,-0.10352,-0.987385,-0.972298,10.058524
4,3.519074,-0.770978,0.330044,0.069681,0.23669,-1.343115,0.139671,-1.123965,-1.39344,0.361997,...,-0.272254,-0.26274,-0.271276,-0.264432,-0.270446,-0.257925,-0.10352,-0.987385,1.028491,-0.099418


In [25]:
# Persist the processed frame without the index column.
# NOTE(review): despite the 'X_' file name, `data` still contains
# 'windmill_generated_power(kW/h)' as its first column — confirm downstream
# code separates it from the features before training.
data.to_csv('X_train.csv',index=False)