In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Load the training data, parsing the 'Datetime' column and using it as the index.
df = pd.read_csv(
    'train.csv',
    index_col='Datetime',
    parse_dates=['Datetime'],
)
df

Unnamed: 0_level_0,ActivePower,AmbientTemperatue,BearingShaftTemperature,Blade1PitchAngle,Blade2PitchAngle,Blade3PitchAngle,GearboxBearingTemperature,GearboxOilTemperature,GeneratorRPM,GeneratorWinding1Temperature,GeneratorWinding2Temperature,HubTemperature,MainBoxTemperature,NacellePosition,ReactivePower,RotorRPM,TurbineStatus,WindDirection,WindSpeed
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2018-01-01 06:20:00+00:00,26.212347,28.696304,,,,,,,,,,,,252.00,3.976499,,,252.00,3.042750
2018-01-01 06:40:00+00:00,59.632658,29.052567,,,,,,,,,,,,255.00,11.091660,,,255.00,3.424814
2018-01-01 06:50:00+00:00,40.889650,28.984758,,,,,,,,,,,,264.75,4.234497,,,264.75,3.507172
2018-01-01 10:40:00+00:00,34.398111,30.282058,,,,,,,,,,,,283.75,5.347792,,,283.75,3.442212
2018-01-01 12:00:00+00:00,29.797653,29.333519,,,,,,,,,,,,274.00,6.022326,,,274.00,3.031363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-15 23:10:00+00:00,272.852287,23.500851,43.740629,-0.524250,-0.108540,-0.108540,63.668076,56.801410,1029.697556,64.904780,64.000634,35.012077,35.406250,182.00,54.889963,9.243551,2.0,182.00,5.156629
2020-03-15 23:20:00+00:00,295.675931,23.409183,43.529209,-0.684376,-0.168882,-0.168882,63.361840,56.551527,1032.996150,64.109157,63.189782,35.004216,35.280933,182.00,60.673948,9.269149,2.0,182.00,5.257433
2020-03-15 23:30:00+00:00,295.127333,23.336964,43.331115,-0.743521,-0.292512,-0.292512,63.156279,56.393276,1031.935870,63.740245,62.798514,35.003815,35.140925,182.00,59.551241,9.252811,2.0,182.00,5.188531
2020-03-15 23:40:00+00:00,342.942216,23.310323,43.169442,-0.955621,-0.516466,-0.516466,63.191997,56.273992,1047.165900,63.657965,62.699627,35.003815,35.093750,182.00,69.460091,9.385111,2.0,182.00,5.408458


In [4]:
# Number of missing values in each column of the raw frame.
df.isnull().sum()

ActivePower                         0
AmbientTemperatue                 896
BearingShaftTemperature         26684
Blade1PitchAngle                43282
Blade2PitchAngle                43365
Blade3PitchAngle                43365
GearboxBearingTemperature       26682
GearboxOilTemperature           26670
GeneratorRPM                    26674
GeneratorWinding1Temperature    26656
GeneratorWinding2Temperature    26649
HubTemperature                  26792
MainBoxTemperature              26704
NacellePosition                 20424
ReactivePower                      42
RotorRPM                        26680
TurbineStatus                   26466
WindDirection                   20424
WindSpeed                         308
dtype: int64

In [None]:
# Impute missing values with MICE (IterativeImputer driven by a random forest),
# jitter the imputed cells slightly, then print basic sanity checks.
#
# Inputs : `df` (raw frame loaded earlier in the notebook).
# Outputs: `missing_per_row`, `mice_imputer`, `df_filtered`, `df_imputed`.

SEED = 42                    # single seed shared by the forest, the imputer and the jitter
REPORT_MISSING_FRAC = 0.5    # threshold used only for the diagnostic print below
MAX_ROW_MISSING_FRAC = 0.7   # rows with a larger share of missing columns are dropped
NOISE_STD_FRAC = 0.001       # jitter scale: 0.1% of each column's observed std

# Fraction of columns that are missing in each row.
missing_per_row = df.isnull().mean(axis=1)
print(
    f"Rows with more than {REPORT_MISSING_FRAC:.0%} missing values:",
    int((missing_per_row > REPORT_MISSING_FRAC).sum()),
)

# Create the MICE imputer.
# NOTE(review): IterativeImputer scales `tol` by the largest absolute observed
# value in the whole matrix, so a single extreme value in any column loosens the
# early-stopping criterion for every column. Check the "scaled tolerance" line in
# the verbose log and consider cleaning outliers before imputing.
mice_imputer = IterativeImputer(
    estimator=RandomForestRegressor(
        n_estimators=100,     # number of trees
        max_depth=15,         # cap on tree depth
        min_samples_leaf=5,   # at least 5 samples per leaf, to limit overfitting
        n_jobs=-1,            # use all available cores
        # Seed the forest explicitly so tree bootstrapping is reproducible.
        # NOTE(review): the imputer's own random_state is documented for feature
        # selection / imputation order / posterior sampling, not for the estimator.
        random_state=SEED,
    ),
    initial_strategy='mean',  # fill missing cells with the column mean before round 1
    n_nearest_features=10,    # use 10 other features per target column, favoring correlated ones
    random_state=SEED,
    max_iter=10,              # upper bound on imputation rounds; early stopping may end sooner
    verbose=1,
)

# Optional: drop rows with too many missing values before imputing.
df_filtered = df.loc[missing_per_row <= MAX_ROW_MISSING_FRAC].copy()

# Remember which cells were missing; reused for the jitter and the checks below.
was_missing = df_filtered.isnull()

# Fit and transform, rebuilding a frame with the original index and columns.
# A shape mismatch here (e.g. if the imputer dropped an all-NaN column) raises
# immediately instead of silently misaligning columns.
imputed_values = mice_imputer.fit_transform(df_filtered)
df_imputed = pd.DataFrame(
    imputed_values,
    index=df_filtered.index,
    columns=df_filtered.columns,
)

# Add small random noise to imputed values to avoid identical values.
# NOTE(review): the jitter is applied to every imputed column, including ones
# that look discrete or bounded (e.g. TurbineStatus, WindDirection) - confirm
# that this is intended.
noise_rng = np.random.default_rng(SEED)  # seeded so the saved result is reproducible
for column in df_imputed.columns:
    missing_mask = was_missing[column].to_numpy()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        continue
    # Standard deviation of the originally observed (non-missing) values.
    std = df_filtered[column].std()
    if not np.isfinite(std) or std == 0:
        # Fewer than two observed values, or a constant column: no usable scale,
        # and a NaN scale would write NaN back into the imputed cells.
        continue
    noise = noise_rng.normal(0, NOISE_STD_FRAC * std, size=n_missing)
    df_imputed.loc[missing_mask, column] += noise

# Verify the imputation.
print("\nOriginal DataFrame null counts:")
print(df_filtered.isnull().sum().sum())
print("\nImputed DataFrame null counts:")
print(df_imputed.isnull().sum().sum())

# Count distinct values among the imputed cells of each column.
for column in df_imputed.columns:
    missing_mask = was_missing[column].to_numpy()
    if missing_mask.any():
        n_unique = df_imputed.loc[missing_mask, column].nunique()
        print(f"\nUnique imputed values in {column}: {n_unique}")

Rows with more than 50% missing values: 26647
[IterativeImputer] Completing matrix with shape (71312, 19)
[IterativeImputer] Change: 1772293.4851440701, scaled tolerance: 65746.528 
[IterativeImputer] Change: 1774508.5242144163, scaled tolerance: 65746.528 
[IterativeImputer] Change: 5237.322430744682, scaled tolerance: 65746.528 
[IterativeImputer] Early stopping criterion reached.

Original DataFrame null counts:
350413

Imputed DataFrame null counts:
0

Unique imputed values in AmbientTemperatue: 845

Unique imputed values in BearingShaftTemperature: 20802

Unique imputed values in Blade1PitchAngle: 37398

Unique imputed values in Blade2PitchAngle: 37482

Unique imputed values in Blade3PitchAngle: 37482

Unique imputed values in GearboxBearingTemperature: 20800

Unique imputed values in GearboxOilTemperature: 20780

Unique imputed values in GeneratorRPM: 20784

Unique imputed values in GeneratorWinding1Temperature: 20774

Unique imputed values in GeneratorWinding2Temperature: 20769


In [6]:
# Display the imputed frame for a visual check.
df_imputed

Unnamed: 0_level_0,ActivePower,AmbientTemperatue,BearingShaftTemperature,Blade1PitchAngle,Blade2PitchAngle,Blade3PitchAngle,GearboxBearingTemperature,GearboxOilTemperature,GeneratorRPM,GeneratorWinding1Temperature,GeneratorWinding2Temperature,HubTemperature,MainBoxTemperature,NacellePosition,ReactivePower,RotorRPM,TurbineStatus,WindDirection,WindSpeed
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2018-01-01 06:20:00+00:00,26.212347,28.696304,45.741359,-1.095731,3.559173,3.559652,64.468257,59.368376,277.983743,81.566585,80.852870,39.768562,39.306343,252.00,3.976499,2.883984,2.187659,252.00,3.042750
2018-01-01 06:40:00+00:00,59.632658,29.052567,45.810119,-1.111884,3.559173,3.559652,64.520599,59.479540,277.983743,81.592571,80.872443,40.686020,39.889216,255.00,11.091660,3.129672,2.187659,255.00,3.424814
2018-01-01 06:50:00+00:00,40.889650,28.984758,45.890186,-1.111884,3.559173,3.559652,64.541429,59.479540,277.983743,81.592571,80.894053,40.686020,39.891512,264.75,4.234497,2.839942,2.187659,264.75,3.507172
2018-01-01 10:40:00+00:00,34.398111,30.282058,45.810119,-1.111320,3.559173,3.559652,64.520599,59.479540,277.983743,81.592571,80.894053,41.196204,40.711584,283.75,5.347792,2.839942,2.187659,283.75,3.442212
2018-01-01 12:00:00+00:00,29.797653,29.333519,45.741359,-1.111884,3.559173,3.559652,64.468257,59.368376,277.983743,81.592571,80.894053,40.686020,39.784645,274.00,6.022326,2.883984,2.187659,274.00,3.031363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-15 23:10:00+00:00,272.852287,23.500851,43.740629,-0.524250,-0.108540,-0.108540,63.668076,56.801410,1029.697556,64.904780,64.000634,35.012077,35.406250,182.00,54.889963,9.243551,2.000000,182.00,5.156629
2020-03-15 23:20:00+00:00,295.675931,23.409183,43.529209,-0.684376,-0.168882,-0.168882,63.361840,56.551527,1032.996150,64.109157,63.189782,35.004216,35.280933,182.00,60.673948,9.269149,2.000000,182.00,5.257433
2020-03-15 23:30:00+00:00,295.127333,23.336964,43.331115,-0.743521,-0.292512,-0.292512,63.156279,56.393276,1031.935870,63.740245,62.798514,35.003815,35.140925,182.00,59.551241,9.252811,2.000000,182.00,5.188531
2020-03-15 23:40:00+00:00,342.942216,23.310323,43.169442,-0.955621,-0.516466,-0.516466,63.191997,56.273992,1047.165900,63.657965,62.699627,35.003815,35.093750,182.00,69.460091,9.385111,2.000000,182.00,5.408458


In [9]:
# Total number of missing cells left in the imputed frame.
df_imputed.isnull().sum().sum()

Unnamed: 0,ActivePower,AmbientTemperatue,BearingShaftTemperature,Blade1PitchAngle,Blade2PitchAngle,Blade3PitchAngle,GearboxBearingTemperature,GearboxOilTemperature,GeneratorRPM,GeneratorWinding1Temperature,GeneratorWinding2Temperature,HubTemperature,MainBoxTemperature,NacellePosition,ReactivePower,RotorRPM,TurbineStatus,WindDirection,WindSpeed
count,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0,77202.0
mean,744.836678,28.401707,44.665926,1.187485,3.559874,3.560144,66.523681,58.937169,1161.200966,78.961293,78.310149,37.633714,39.337204,197.582359,107.718258,10.478371,1746.072,197.58587,6.536043
std,597.080992,4.090631,4.23866,7.684061,7.340647,7.340647,7.289593,4.780025,438.143903,18.661814,18.707825,4.230657,4.663128,76.705143,118.537132,3.824077,323706.9,76.70514,2.403385
min,0.0,0.0,0.0,-1.960466,-2.406213,-2.406213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-74.684579,0.0,0.0,0.0,0.0
25%,225.710444,25.469923,42.052584,-1.057897,0.810358,0.810358,63.794252,56.83968,1029.740459,64.615231,63.826495,35.002976,36.052844,163.0,0.489189,9.229943,2.0,163.0,4.70119
50%,556.1359,28.057838,46.02868,-0.919407,3.559173,3.559652,65.879412,58.517786,1206.82836,81.44298,80.817304,38.046076,39.360102,197.512927,62.479894,10.821384,2.0,197.5262,6.12062
75%,1279.882107,31.135201,46.869728,0.715646,3.559173,3.559652,69.389295,60.406635,1556.444811,83.773523,83.207273,40.547899,42.385587,221.276786,181.501182,13.955013,2.184042,221.276786,7.98517
max,1779.032433,41.809714,55.088655,89.02752,88.915344,88.915344,82.237932,70.764581,1809.9417,126.773031,126.043018,47.996185,53.76875,357.0,403.71362,16.273495,65746530.0,357.0,22.970893


In [12]:
# Write the imputed frame to disk, keeping the index as the first CSV column.
df_imputed.to_csv(path_or_buf='train_imputed.csv', index=True)