In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [138]:
train = pd.read_csv('dataset/train.csv')
test= pd.read_csv('dataset/test.csv')
train.head()

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m),windmill_generated_power(kW/h)
0,WM_33725,2019-08-04 14:33:20,94.820023,-99.0,41.723019,-0.903423,82.410573,42.523015,2563.124522,76.66556,...,,239.836388,2730.310605,42.084666,BA,Medium,2.217542,0.314065,24.281689,6.766521
1,WM_698,2018-11-05 10:13:20,241.832734,27.764785,-99.0,-99.0,44.104919,46.25887,2372.384119,78.129803,...,,337.944723,1780.2072,107.888643,A2,Medium,4.210346,0.448494,27.262139,5.966275
2,WM_39146,2019-09-14 14:03:20,95.484724,,41.855473,12.652763,42.322098,42.878552,1657.169646,67.654469,...,45.033197,227.850294,1666.0499,-42.931459,ABC,Medium,2.719475,0.302321,27.366127,2.874342
3,WM_6757,2018-12-25 15:33:20,238.819424,-99.0,45.443914,15.115323,44.759643,47.282101,2888.134079,95.389974,...,44.827154,492.08152,1964.502895,42.744596,ABC,,4.857385,0.36714,24.287767,14.851089
4,WM_21521,2019-05-04 03:13:20,10.72289,,41.981183,1.715696,-17.616459,43.469852,781.695419,37.423065,...,-99.0,259.274601,1177.516152,13.387289,AAA,Medium,,0.453374,27.97165,3.519074


In [139]:
train.dtypes

tracking_id                        object
datetime                           object
wind_speed(m/s)                   float64
atmospheric_temperature(°C)       float64
shaft_temperature(°C)             float64
blades_angle(°)                   float64
gearbox_temperature(°C)           float64
engine_temperature(°C)            float64
motor_torque(N-m)                 float64
generator_temperature(°C)         float64
atmospheric_pressure(Pascal)      float64
area_temperature(°C)              float64
windmill_body_temperature(°C)     float64
wind_direction(°)                 float64
resistance(ohm)                   float64
rotor_torque(N-m)                 float64
turbine_status                     object
cloud_level                        object
blade_length(m)                   float64
blade_breadth(m)                  float64
windmill_height(m)                float64
windmill_generated_power(kW/h)    float64
dtype: object

# Deal with NaN Values:

In [140]:
train.isnull().sum()

tracking_id                          0
datetime                             0
wind_speed(m/s)                    273
atmospheric_temperature(°C)       3450
shaft_temperature(°C)                2
blades_angle(°)                    216
gearbox_temperature(°C)              1
engine_temperature(°C)              12
motor_torque(N-m)                   24
generator_temperature(°C)           12
atmospheric_pressure(Pascal)      2707
area_temperature(°C)                 0
windmill_body_temperature(°C)     2363
wind_direction(°)                 5103
resistance(ohm)                      1
rotor_torque(N-m)                  572
turbine_status                    1759
cloud_level                        276
blade_length(m)                   5093
blade_breadth(m)                     0
windmill_height(m)                 543
windmill_generated_power(kW/h)     207
dtype: int64

### Categorical Features

In [141]:
# Function to impute most occured category and add importance vairable
def impute_categorical(DataFrame,ColName):
    #1. add new column and replace if category is null then 1 else 0
    DataFrame[ColName+"_Imputed"] =   np.where(DataFrame[ColName].isnull(),1,0)
    
    # 2. Take most occured category in that vairable (.mode())
    Mode_Category = DataFrame[ColName].mode()[0]
    
    ## 2.1 Replace NAN values with most occured category in actual vairable
    DataFrame[ColName].fillna(Mode_Category,inplace=True)

In [142]:
for c in ['turbine_status','cloud_level']:
    impute_categorical(train, c)
    
# train[['turbine_status','turbine_status_Imputed','cloud_level','cloud_level_Imputed']].head(10)

### Numerical

In [143]:
train.columns

Index(['tracking_id', 'datetime', 'wind_speed(m/s)',
       'atmospheric_temperature(°C)', 'shaft_temperature(°C)',
       'blades_angle(°)', 'gearbox_temperature(°C)', 'engine_temperature(°C)',
       'motor_torque(N-m)', 'generator_temperature(°C)',
       'atmospheric_pressure(Pascal)', 'area_temperature(°C)',
       'windmill_body_temperature(°C)', 'wind_direction(°)', 'resistance(ohm)',
       'rotor_torque(N-m)', 'turbine_status', 'cloud_level', 'blade_length(m)',
       'blade_breadth(m)', 'windmill_height(m)',
       'windmill_generated_power(kW/h)', 'turbine_status_Imputed',
       'cloud_level_Imputed'],
      dtype='object')

In [144]:
import datawig

def impute_any(output_column):
    df_train, df_test = datawig.utils.random_split(train)

    #Initialize a SimpleImputer model
    imputer = datawig.SimpleImputer(
        input_columns=['tracking_id', 'datetime', 'wind_speed(m/s)', 'shaft_temperature(°C)', 'blades_angle(°)',\
                       'gearbox_temperature(°C)', 'engine_temperature(°C)', 'motor_torque(N-m)', 'generator_temperature(°C)',\
                       'area_temperature(°C)', 'resistance(ohm)', 'rotor_torque(N-m)', 'turbine_status', 'cloud_level',\
                       'blade_length(m)', 'windmill_height(m)'], # column(s) containing information about the column we want to impute
        output_column= output_column, # the column we'd like to impute values for
        output_path = 'imputer_model' # stores model data and metrics
        )

    #Fit an imputer model on the train data
    imputer.fit(train_df=df_train, num_epochs=50)

    #Impute missing values and return original dataframe with predictions
    imputed = imputer.predict(df_test)
    return imputed
# w = impute_any('wind_speed(m/s)')

In [145]:
for i in train.columns:
    if train[i].isnull().sum()>0:
        mean = train[i].mean()
        train[i].replace(np.nan, mean, inplace=True)

In [146]:
wind = 'windmill_generated_power(kW/h)'
train.shape

(28200, 24)

In [147]:
train.isnull().sum()

tracking_id                       0
datetime                          0
wind_speed(m/s)                   0
atmospheric_temperature(°C)       0
shaft_temperature(°C)             0
blades_angle(°)                   0
gearbox_temperature(°C)           0
engine_temperature(°C)            0
motor_torque(N-m)                 0
generator_temperature(°C)         0
atmospheric_pressure(Pascal)      0
area_temperature(°C)              0
windmill_body_temperature(°C)     0
wind_direction(°)                 0
resistance(ohm)                   0
rotor_torque(N-m)                 0
turbine_status                    0
cloud_level                       0
blade_length(m)                   0
blade_breadth(m)                  0
windmill_height(m)                0
windmill_generated_power(kW/h)    0
turbine_status_Imputed            0
cloud_level_Imputed               0
dtype: int64

In [148]:
train

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m),windmill_generated_power(kW/h),turbine_status_Imputed,cloud_level_Imputed
0,WM_33725,2019-08-04 14:33:20,94.820023,-99.000000,41.723019,-0.903423,82.410573,42.523015,2563.124522,76.665560,...,2730.310605,42.084666,BA,Medium,2.217542,0.314065,24.281689,6.766521,0,0
1,WM_698,2018-11-05 10:13:20,241.832734,27.764785,-99.000000,-99.000000,44.104919,46.258870,2372.384119,78.129803,...,1780.207200,107.888643,A2,Medium,4.210346,0.448494,27.262139,5.966275,0,0
2,WM_39146,2019-09-14 14:03:20,95.484724,0.383727,41.855473,12.652763,42.322098,42.878552,1657.169646,67.654469,...,1666.049900,-42.931459,ABC,Medium,2.719475,0.302321,27.366127,2.874342,0,0
3,WM_6757,2018-12-25 15:33:20,238.819424,-99.000000,45.443914,15.115323,44.759643,47.282101,2888.134079,95.389974,...,1964.502895,42.744596,ABC,Low,4.857385,0.367140,24.287767,14.851089,0,1
4,WM_21521,2019-05-04 03:13:20,10.722890,0.383727,41.981183,1.715696,-17.616459,43.469852,781.695419,37.423065,...,1177.516152,13.387289,AAA,Medium,2.254034,0.453374,27.971650,3.519074,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28195,WM_7814,2019-01-02 02:43:20,94.765699,23.576793,45.399352,5.377222,-1.085171,48.528248,2791.600990,90.898875,...,1980.861921,45.909054,BB,Medium,2.774335,0.418299,24.590801,9.587934,0,0
28196,WM_32512,2019-07-26 12:53:20,94.196738,24.034329,42.068979,-99.000000,44.285153,43.487939,2207.882276,72.244645,...,1712.840457,36.974913,BB,Low,-3.250989,0.461531,26.051604,4.522195,0,0
28197,WM_5193,2018-12-12 02:13:20,94.160463,28.674296,45.004213,9.550358,49.377706,44.042632,2801.657374,94.814637,...,1951.728713,88.319152,D,Medium,2.254034,0.380264,28.533850,11.096599,0,0
28198,WM_12173,2019-02-03 19:13:20,95.430377,26.560254,48.032624,3.051389,81.443896,44.821365,2760.647280,90.144418,...,1968.917692,47.562627,BCB,Low,3.001855,0.346447,47.747269,9.373239,0,0


In [149]:
from sklearn.model_selection import train_test_split
X = train.drop([wind, 'tracking_id', 'datetime'], axis=1)
y = train[wind]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# X_train.shape, X_test.shape

# Encoding

In [150]:
import category_encoders as ce
encoder = ce.OneHotEncoder(cols=['turbine_status', 'cloud_level'])

train_enc = encoder.fit_transform(X)
train_enc

Unnamed: 0,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),atmospheric_pressure(Pascal),area_temperature(°C),...,turbine_status_13,turbine_status_14,cloud_level_1,cloud_level_2,cloud_level_3,blade_length(m),blade_breadth(m),windmill_height(m),turbine_status_Imputed,cloud_level_Imputed
0,94.820023,-99.000000,41.723019,-0.903423,82.410573,42.523015,2563.124522,76.665560,103402.961872,26.897875,...,0,0,1,0,0,2.217542,0.314065,24.281689,0,0
1,241.832734,27.764785,-99.000000,-99.000000,44.104919,46.258870,2372.384119,78.129803,17030.904078,39.801469,...,0,0,1,0,0,4.210346,0.448494,27.262139,0,0
2,95.484724,0.383727,41.855473,12.652763,42.322098,42.878552,1657.169646,67.654469,16125.927107,36.116065,...,0,0,1,0,0,2.719475,0.302321,27.366127,0,0
3,238.819424,-99.000000,45.443914,15.115323,44.759643,47.282101,2888.134079,95.389974,18689.732336,46.020045,...,0,0,0,1,0,4.857385,0.367140,24.287767,0,1
4,10.722890,0.383727,41.981183,1.715696,-17.616459,43.469852,781.695419,37.423065,114468.169007,34.572941,...,0,0,1,0,0,2.254034,0.453374,27.971650,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28195,94.765699,23.576793,45.399352,5.377222,-1.085171,48.528248,2791.600990,90.898875,19428.725079,45.429230,...,0,0,1,0,0,2.774335,0.418299,24.590801,0,0
28196,94.196738,24.034329,42.068979,-99.000000,44.285153,43.487939,2207.882276,72.244645,16596.485400,25.142681,...,0,0,0,1,0,-3.250989,0.461531,26.051604,0,0
28197,94.160463,28.674296,45.004213,9.550358,49.377706,44.042632,2801.657374,94.814637,19083.881449,45.129442,...,0,1,1,0,0,2.254034,0.380264,28.533850,0,0
28198,95.430377,26.560254,48.032624,3.051389,81.443896,44.821365,2760.647280,90.144418,18360.785707,45.603927,...,0,0,0,1,0,3.001855,0.346447,47.747269,0,0


# Feature Scaling

In [151]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

cols = train_enc.columns
train_enc = scaler.fit_transform(train_enc)

train_enc = pd.DataFrame(train_enc, columns=[cols])

In [152]:
train_enc

Unnamed: 0,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),atmospheric_pressure(Pascal),area_temperature(°C),...,turbine_status_13,turbine_status_14,cloud_level_1,cloud_level_2,cloud_level_3,blade_length(m),blade_breadth(m),windmill_height(m),turbine_status_Imputed,cloud_level_Imputed
0,0.021131,-5.098807,-0.485797,-0.057098,7.364677,-0.306915,0.334157,0.157490,0.850475,-0.522613,...,0.0,0.0,1.0,-1.0,0.0,-0.437013,-0.829425,-0.542467,0.0,0.0
1,2.007867,0.613484,-35.314461,-14.973036,0.166014,0.838526,0.214310,0.196289,-0.017302,0.658955,...,0.0,0.0,1.0,-1.0,0.0,0.610392,0.489684,0.457114,0.0,0.0
2,0.030114,-0.620365,-0.453015,2.004169,-0.169026,-0.197905,-0.235074,-0.081285,-0.026394,0.321486,...,0.0,0.0,1.0,-1.0,0.0,-0.173200,-0.944663,0.491989,0.0,0.0
3,1.967145,-5.098807,0.435117,2.378610,0.289054,1.152256,0.538367,0.653647,-0.000636,1.228383,...,0.0,0.0,0.0,0.0,0.0,0.950471,-0.308618,-0.540429,0.0,1.0
4,-1.115361,-0.620365,-0.421902,0.341148,-11.433092,-0.016608,-0.785153,-0.882354,0.961647,0.180184,...,0.0,0.0,1.0,-1.0,0.0,-0.417832,0.537562,0.695069,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28195,0.020397,0.424764,0.424088,0.897897,-8.326419,1.534334,0.477713,0.534642,0.006789,1.174282,...,0.0,0.0,1.0,-1.0,0.0,-0.144366,0.193391,-0.438797,0.0,0.0
28196,0.012708,0.445382,-0.400173,-14.973036,0.199885,-0.011062,0.110950,0.040345,-0.021667,-0.683334,...,0.0,0.0,0.0,0.0,0.0,-3.311235,0.617609,0.051125,0.0,0.0
28197,0.012218,0.654469,0.326292,1.532437,1.156913,0.159011,0.484032,0.638402,0.003324,1.146831,...,0.0,1.0,1.0,-1.0,0.0,-0.417832,-0.179833,0.883619,0.0,0.0
28198,0.029380,0.559205,1.075817,0.544245,7.183012,0.397776,0.458264,0.514651,-0.003941,1.190279,...,0.0,0.0,0.0,0.0,0.0,-0.024783,-0.511666,7.327399,0.0,0.0


# PCA and tSNE

In [153]:
# necessary functions
from sklearn.decomposition import PCA
labels = np.reshape(train[wind].to_numpy(), (28200,1))

def pca_analysis(df, n_components):
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(df.iloc[:, :-1].values)

    pca_concatenate = np.concatenate([pca_result, labels], axis=1)
    return pca_concatenate
def make_dataframe(array, n_features):
    col = []
    for i in range(1, n_features+1):
        col.append(f'pc{i}')
    col.append(wind)
    return pd.DataFrame(array, columns = col)

from sklearn.manifold import TSNE
def tsne_analysis(df, n_components):
    tsne = TSNE(n_components=n_components)
    tsne_result = tsne.fit_transform(df.iloc[:, :-1].values)

    tsne_concatenate = np.concatenate([tsne_result, labels], axis=1)
    return tsne_concatenate
def tsne_dataframe(array, n_features):
    col = []
    for i in range(1, n_features+1):
        col.append(f'tsne{i}')
    col.append(wind)
    return pd.DataFrame(array, columns = col)

# def get_xy(df):
#     if df.shape==37:
#         X = df.drop(wind, axis=1)
#     else:
#         X = df
#     y = train[wind].copy()
#     return X, y
def get_xy(df):
    X = df.drop(wind, axis=1)
    y = train[wind].copy()
    return X, y

In [154]:
train_full = pd.concat([train_enc, train[wind]], axis=1)
train_full[wind]

0         6.766521
1         5.966275
2         2.874342
3        14.851089
4         3.519074
           ...    
28195     9.587934
28196     4.522195
28197    11.096599
28198     9.373239
28199     2.860342
Name: windmill_generated_power(kW/h), Length: 28200, dtype: float64

# NN

In [155]:
from sklearn.model_selection import ShuffleSplit, train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.metrics import accuracy_score, r2_score
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam

In [158]:
accuracy = []

for k in [0, 2, 3, 4, 5, 10, 15, 20, 25, 30, 35]:
    if k==0:
        name = 'train_enc'
        dfs = train_full
    elif k==22 or k==23:
        name = f'tsne{k-20}'
        vars()[name] = tsne_analysis(train_full, k-20)
        dfs = tsne_dataframe(vars()[name], k-20)
    else:
        name = f'pca{k}'
        vars()[name] = pca_analysis(train_full, k)
        dfs = make_dataframe(vars()[name], k)
    # print(dfs.shape)

    X, y = get_xy(dfs)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
    result = []
    
    Adam(learning_rate=0.01)
    model = Sequential()
    model.add(Dense(24,activation='tanh',input_dim=X_train.shape[1]))
    model.add(Dense(12,activation='relu'))
    model.add(Dense(1))

    model.compile(loss='mean_squared_error',optimizer='adam')

    model.fit(X_train, y_train, epochs=100,batch_size=256,verbose=False)
    y_pred = model.predict(X_test)
    score = max(0, 100 * r2_score(y_test, y_pred))
    
    print(score)

83.9470009227647
47.61783600971052
60.125409201832284
60.182278429386706
57.621416156391824
68.97061316865125
71.68972326270011
82.45661172703818
83.8135366727733
82.77255829908648
76.84222916661305


In [157]:
# def plot(y_test, y_pred):
#     fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(20,7)) 
#     sns.distplot(y_test.values,label='test values', ax=ax1)
#     sns.distplot(y_pred ,label='prediction', ax=ax1)
#     ax1.set_xlabel('Distribution plot')
#     ax2.scatter(y_test,y_pred, c='orange',label='predictions')
#     ax2.plot(y_test,c='blue',label='y=x')
#     ax2.set_xlabel('test value')
#     ax2.set_ylabel('estimated $\log(radius)$')
#     ax1.legend()
#     ax2.legend()
#     ax2.axis('scaled')
# plot(y_test, y_pred)

### Grid Search

# Test with test.csv

In [227]:
test

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,area_temperature(°C),windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m)
0,WM_19817,2019-04-17 08:53:20,94.32,17.64,89.71,51.15,40.46,39.59,1073.20,66.83,...,24.00,43.76,445.98,1664.22,21.91,BA,Medium,3.19,0.40,25.57
1,WM_18723,2019-03-30 07:43:20,10.09,13.98,43.27,46.52,40.03,41.18,517.44,37.28,...,29.43,42.73,499.60,1165.11,-35.05,A,Medium,3.02,0.44,24.37
2,WM_34552,2019-08-10 11:33:20,347.15,31.42,41.08,26.93,43.11,43.44,1480.72,70.01,...,29.92,43.26,245.43,1667.72,27.20,B2,Medium,2.61,0.39,27.65
3,WM_28570,2019-06-26 03:53:20,24.47,-99.00,14.38,66.51,13.74,15.58,887.98,41.45,...,23.89,13.50,,1329.74,15.25,BBB,Low,2.87,0.45,24.19
4,WM_36934,2019-08-27 16:43:20,97.00,33.28,41.41,1.84,121.57,43.93,2053.92,68.01,...,35.91,-99.00,442.43,691.41,34.26,A,Low,3.55,0.37,4.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12081,WM_13376,2019-02-12 11:33:20,-15.04,18.52,43.55,-1.09,44.55,44.49,2424.37,75.85,...,28.42,42.13,509.80,1798.14,-14.37,,Medium,3.29,0.41,20.89
12082,WM_1630,2018-11-12 17:33:20,97.59,-99.00,44.75,1.75,43.31,42.85,2085.63,73.09,...,50.62,47.35,345.71,1680.25,-7.06,AB,Low,,0.37,24.79
12083,WM_24703,2019-05-27 11:53:20,16.19,8.11,43.25,-0.69,214.90,41.18,788.69,39.51,...,20.98,41.77,205.57,2343.17,12.33,BA,Low,3.73,0.46,5.65
12084,WM_22893,2019-05-13 21:53:20,93.25,-99.00,43.84,-1.17,41.98,42.04,2079.86,71.48,...,29.11,82.75,260.58,1732.88,,A,Medium,2.25,0.39,24.59


In [228]:
test.isnull().sum()

tracking_id                         0
datetime                            0
wind_speed(m/s)                   126
atmospheric_temperature(°C)      1427
shaft_temperature(°C)               1
blades_angle(°)                   106
gearbox_temperature(°C)             1
engine_temperature(°C)              5
motor_torque(N-m)                  11
generator_temperature(°C)           5
atmospheric_pressure(Pascal)     1151
area_temperature(°C)                1
windmill_body_temperature(°C)     926
wind_direction(°)                2160
resistance(ohm)                     0
rotor_torque(N-m)                 281
turbine_status                    797
cloud_level                       125
blade_length(m)                  2114
blade_breadth(m)                    0
windmill_height(m)                255
dtype: int64

In [229]:
for c in ['turbine_status','cloud_level']:
    impute_categorical(test, c)

In [230]:
for i in test.columns:
    if test[i].isnull().sum()>0:
        mean = test[i].mean()
        test[i].replace(np.nan, mean, inplace=True)
test

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m),turbine_status_Imputed,cloud_level_Imputed
0,WM_19817,2019-04-17 08:53:20,94.32,17.64,89.71,51.15,40.46,39.59,1073.20,66.83,...,445.98,1664.22,21.91,BA,Medium,3.19,0.40,25.57,0,0
1,WM_18723,2019-03-30 07:43:20,10.09,13.98,43.27,46.52,40.03,41.18,517.44,37.28,...,499.60,1165.11,-35.05,A,Medium,3.02,0.44,24.37,0,0
2,WM_34552,2019-08-10 11:33:20,347.15,31.42,41.08,26.93,43.11,43.44,1480.72,70.01,...,245.43,1667.72,27.20,B2,Medium,2.61,0.39,27.65,0,0
3,WM_28570,2019-06-26 03:53:20,24.47,-99.00,14.38,66.51,13.74,15.58,887.98,41.45,...,308.38,1329.74,15.25,BBB,Low,2.87,0.45,24.19,0,0
4,WM_36934,2019-08-27 16:43:20,97.00,33.28,41.41,1.84,121.57,43.93,2053.92,68.01,...,442.43,691.41,34.26,A,Low,3.55,0.37,4.89,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12081,WM_13376,2019-02-12 11:33:20,-15.04,18.52,43.55,-1.09,44.55,44.49,2424.37,75.85,...,509.80,1798.14,-14.37,BB,Medium,3.29,0.41,20.89,1,0
12082,WM_1630,2018-11-12 17:33:20,97.59,-99.00,44.75,1.75,43.31,42.85,2085.63,73.09,...,345.71,1680.25,-7.06,AB,Low,2.20,0.37,24.79,0,0
12083,WM_24703,2019-05-27 11:53:20,16.19,8.11,43.25,-0.69,214.90,41.18,788.69,39.51,...,205.57,2343.17,12.33,BA,Low,3.73,0.46,5.65,0,0
12084,WM_22893,2019-05-13 21:53:20,93.25,-99.00,43.84,-1.17,41.98,42.04,2079.86,71.48,...,260.58,1732.88,26.57,A,Medium,2.25,0.39,24.59,0,0


In [231]:
X = test.drop(['tracking_id', 'datetime'], axis=1)
X

Unnamed: 0,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),atmospheric_pressure(Pascal),area_temperature(°C),...,wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m),turbine_status_Imputed,cloud_level_Imputed
0,94.32,17.64,89.71,51.15,40.46,39.59,1073.20,66.83,16681.04,24.00,...,445.98,1664.22,21.91,BA,Medium,3.19,0.40,25.57,0,0
1,10.09,13.98,43.27,46.52,40.03,41.18,517.44,37.28,54283.32,29.43,...,499.60,1165.11,-35.05,A,Medium,3.02,0.44,24.37,0,0
2,347.15,31.42,41.08,26.93,43.11,43.44,1480.72,70.01,214812.84,29.92,...,245.43,1667.72,27.20,B2,Medium,2.61,0.39,27.65,0,0
3,24.47,-99.00,14.38,66.51,13.74,15.58,887.98,41.45,54283.32,23.89,...,308.38,1329.74,15.25,BBB,Low,2.87,0.45,24.19,0,0
4,97.00,33.28,41.41,1.84,121.57,43.93,2053.92,68.01,16833.55,35.91,...,442.43,691.41,34.26,A,Low,3.55,0.37,4.89,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12081,-15.04,18.52,43.55,-1.09,44.55,44.49,2424.37,75.85,17239.35,28.42,...,509.80,1798.14,-14.37,BB,Medium,3.29,0.41,20.89,1,0
12082,97.59,-99.00,44.75,1.75,43.31,42.85,2085.63,73.09,17131.45,50.62,...,345.71,1680.25,-7.06,AB,Low,2.20,0.37,24.79,0,0
12083,16.19,8.11,43.25,-0.69,214.90,41.18,788.69,39.51,118707.64,20.98,...,205.57,2343.17,12.33,BA,Low,3.73,0.46,5.65,0,0
12084,93.25,-99.00,43.84,-1.17,41.98,42.04,2079.86,71.48,17155.27,29.11,...,260.58,1732.88,26.57,A,Medium,2.25,0.39,24.59,0,0


In [232]:
import category_encoders as ce
encoder = ce.OneHotEncoder(cols=['turbine_status', 'cloud_level'])

test_enc = encoder.fit_transform(X)
test_enc

# Feature Scaling
cols = test_enc.columns
test_enc = scaler.fit_transform(test_enc)

test_enc = pd.DataFrame(test_enc, columns=[cols])

test_enc

Unnamed: 0,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),atmospheric_pressure(Pascal),area_temperature(°C),...,turbine_status_13,turbine_status_14,cloud_level_1,cloud_level_2,cloud_level_3,blade_length(m),blade_breadth(m),windmill_height(m),turbine_status_Imputed,cloud_level_Imputed
0,0.01,0.15,11.16,7.15,-0.51,-1.20,-0.60,-0.10,-0.02,-0.79,...,0.00,0.00,1.00,-1.00,0.00,0.07,0.04,-0.13,0.00,0.00
1,-1.14,-0.01,-0.11,6.51,-0.59,-0.72,-0.94,-0.88,0.36,-0.30,...,0.00,0.00,1.00,-1.00,0.00,-0.01,0.44,-0.54,0.00,0.00
2,3.47,0.78,-0.64,3.80,-0.02,-0.03,-0.34,-0.02,1.97,-0.25,...,0.00,0.00,1.00,-1.00,0.00,-0.22,-0.12,0.57,0.00,0.00
3,-0.94,-5.14,-7.12,9.28,-5.48,-8.47,-0.71,-0.77,0.36,-0.81,...,0.00,0.00,0.00,0.00,0.00,-0.09,0.49,-0.60,0.00,0.00
4,0.05,0.86,-0.56,0.33,14.56,0.12,0.01,-0.07,-0.02,0.30,...,0.00,0.00,0.00,0.00,0.00,0.26,-0.30,-7.10,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12081,-1.48,0.19,-0.04,-0.08,0.25,0.28,0.24,0.13,-0.01,-0.39,...,0.00,0.00,1.00,-1.00,0.00,0.12,0.08,-1.71,1.00,0.00
12082,0.06,-5.14,0.25,0.32,0.02,-0.21,0.03,0.06,-0.02,1.65,...,0.00,0.00,0.00,0.00,0.00,-0.43,-0.31,-0.39,0.00,0.00
12083,-1.06,-0.28,-0.11,-0.02,31.91,-0.72,-0.77,-0.82,1.01,-1.07,...,0.00,0.00,0.00,0.00,0.00,0.35,0.58,-6.84,0.00,0.00
12084,-0.00,-5.14,0.03,-0.09,-0.23,-0.46,0.03,0.02,-0.02,-0.33,...,0.00,0.00,1.00,-1.00,0.00,-0.41,-0.05,-0.46,0.00,0.00


In [233]:
X_train_final = train_enc
y_train_final = train_full[wind]
X_test_final = test_enc

rf.fit(X_train_final, y_train_final)
y_pred_final = rf.predict(X_test_final)

y_pred_final

array([ 2.58469555,  3.01381011,  3.48398268, ...,  5.60518568,
        5.34673143, 10.2436051 ])

In [234]:
y_pred_final_df = pd.DataFrame(y_pred_final, columns=[wind])
y_pred_final_df

Unnamed: 0,windmill_generated_power(kW/h)
0,2.58
1,3.01
2,3.48
3,6.19
4,3.49
...,...
12081,6.50
12082,3.26
12083,5.61
12084,5.35


In [235]:
submission = pd.concat([test[['tracking_id', 'datetime']], y_pred_final_df], axis=1)
submission.isnull().sum()

tracking_id                       0
datetime                          0
windmill_generated_power(kW/h)    0
dtype: int64

In [236]:
submission.shape

(12086, 3)

In [237]:
submission.to_csv('submission3.csv', index=False)