In [6]:
%load_ext autoreload
%autoreload 2 #make sure to automatically load externally updated files
import os
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,MaxAbsScaler,OneHotEncoder
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from helper import *
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error,mean_squared_log_error
import math
from sklearn.preprocessing import StandardScaler,MaxAbsScaler,OneHotEncoder
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit, cross_val_score,cross_validate,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import balanced_accuracy_score,precision_score,recall_score
from sklearn.svm import SVC
from pandas.plotting import scatter_matrix
from lightgbm import LGBMRegressor

def print_info(variable_name):
    print("---", variable_name, "     type = ", type(eval(variable_name)), "     Value = ", eval(variable_name)," --- ")

def print_infos(*variable_names):
    """
        Prints information about the variables

        Example: print_infos('var1','var2')
    """
    for variable_name in variable_names:
        print("---", variable_name, "     type = ", type(eval(variable_name)), "     Value = ", eval(variable_name)," --- ")

def print_types(*variable_names,print_shape=True):
    """
        Prints types about the variables

        :param print_shape(bool): Prints shape of variables (Needs them to be a np array, DataFrame or Series)

        Example: print_types('var1','var2')
    """
    for variable_name in variable_names:
        if not print_shape:
            print("---", variable_name, "     type = ", type(eval(variable_name)))
        if print_shape:
            print("---", variable_name, "     type = ", type(eval(variable_name)),end="") #makes to to not have a new line
            try:
                print("     Shape = ", eval(variable_name).shape," --- \n")
            except:
                pass


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Load Data

In [7]:
data=pd.read_csv("./data/trainset.csv")
if False:
    data.head(10000).to_excel("./data/trainset.xlsx")
data=data.drop(["index"],axis=1)

data['error'] = (data['error_category'] != 'NO_ERROR').astype(int)

data_full=pd.read_csv("./data/trainset_full.csv")
data_full=data_full.drop(["index"],axis=1)

Evaluate Imputation

In [8]:
features_with_nan=['wind_speed', 'power', 'rotor_speed', 'generator_speed', 'temp_environment', 'temp_hydraulic_oil', 'temp_gear_bearing', 'blade_angle_avg']

In [9]:
def evaluate_imputation(data,data_pred,data_full,column,print_result=False):
    """
        Evaluate the imputaton accuracy of a specific column
        
        returns ||y_pred-y_true||_2/ (num_nan_values) for the column specified

        :param data: raw training data
        :param column (str): column name 
    """
    num_nan_values=int(data.loc[:,[column]].isnull().sum())
    sum_of_squares=float(np.sum((data_pred.loc[:,[column]]-data_full.loc[:,[column]])**2))
    average_error=np.sqrt(sum_of_squares)/num_nan_values

    if print_result:
        print("--- Column ", column, " imputation--- with ",num_nan_values ," num_nan_values: ")
        print("(Average || ||_2) absolute error: ",average_error)
    return average_error,num_nan_values

def evaluate_imputation_single_column(data_pred,data_true,column):
    """
    calculates r2 score of specific imputed column

    @param column (str): the column to evaluate
    """
    missing_mask=data[column].isna()
    r2=r2_score(
        y_true=np.array(data_true[column][missing_mask]), #only those columns with nan values
        y_pred=np.array(data_pred[column][missing_mask])
    )
    return r2

def evaluate_impuation(data_pred,data_true):
    print("R2 score of imputation")
    for column in features_with_nan:
        r2=evaluate_imputation_single_column(data_pred,data_full,column)
        print(r2, column)

Example: how to use evaluation

In [10]:
column="wind_speed"
data_pred=data.ffill()

print(evaluate_imputation_single_column(data_pred,data_full,column))


0.7229440668866738


In [11]:
data_pred=data.ffill()
evaluate_impuation(data_pred,data_full)



R2 score of imputation
0.7229440668866738 wind_speed
0.7394281474530568 power
0.5387601939311892 rotor_speed
0.5724850955915846 generator_speed
0.9706766580278108 temp_environment
0.9792879186765434 temp_hydraulic_oil
0.8906588997410945 temp_gear_bearing
0.24649833536166954 blade_angle_avg


Impute single feature

In [12]:
def get_data_without_nan(data,feature_to_be_imputed):
    """
        only returns the rows where the specific column ("feature_to_be_imputed") is not NaN
        @feature_to_be_imputed (str): name of column


    """
    data_without_nan =data[data[feature_to_be_imputed].notnull()] #dataframe. only rows s.t. feature_imputed is not nan
    return data_without_nan

def create_X_y_for_single_feature_imputation(data_without_nan,feature_to_be_imputed):
    """
        Creats X, and y. Only uses the columns without NaN values in this feature

        @data (DataFrame): the data dataframe
        @param feature_to_be_imputed (str): name of column to be imputed ^= y

        @return X,y 
    """

    y=data_without_nan[feature_to_be_imputed]

    feat_temp=['turbine_id', 'wind_speed', 'power', 'nacelle_direction',
    'wind_direction' ,'rotor_speed', 'generator_speed' ,'temp_environment'
    ,'temp_hydraulic_oil' ,'temp_gear_bearing' ,'cosphi' ,'blade_angle_avg',
    'hydraulic_pressure', 'park_id']
    X=data_without_nan[feat_temp]
    X=X.drop([feature_to_be_imputed],axis=1)

    return X,y

#How to use it
if False:
    feature_to_be_imputed="wind_speed"
    X,y=create_X_y_for_single_feature_imputation(data,feature_to_be_imputed)

    print_infos('X.head()','y.head()')


In [28]:
"""
    Predict one feature using others. D
"""
action=1
match action:
    case 1:
        features_to_be_imputed=features_with_nan
    case 2:
        features_to_be_imputed=["temp_hydraulic_oil"]

model=LGBMRegressor()

for feature_to_be_imputed in features_to_be_imputed:
    print("-- Starting with ", feature_to_be_imputed)

    #Get DataFram where there is no NaN values in this column
    data_without_nan=get_data_without_nan(data,feature_to_be_imputed)

    X,y=create_X_y_for_single_feature_imputation(data_full,feature_to_be_imputed)


    action=2
    match action:
        #Evaluate using single train test split
        case 1:
            X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.5)
            #print_types("X_train","X_test","y_train","y_test") #print shapes

            model.fit(X_train,y_train)
            y_pred=model.predict(X_test)
            score=r2_score(y_test,y_pred)
            print(score,feature_to_be_imputed)
        #Evaluate using CV
        case 2:
            kfold = KFold(n_splits=8, shuffle=True)
            score_dict=cross_validate(model,X,y,cv=kfold,scoring="r2")
            scores=score_dict["test_score"]
            print(np.mean(scores),feature_to_be_imputed,scores)
   

-- Starting with  wind_speed
0.989544594398575 wind_speed [0.98938594 0.9896638  0.98947846 0.98938253 0.98947286 0.98947192
 0.98978693 0.9897143 ]
-- Starting with  power
0.9950065196247153 power [0.99502275 0.9950746  0.99485842 0.99500359 0.99494273 0.99508168
 0.99502771 0.99504068]
-- Starting with  rotor_speed
0.9997682916616242 rotor_speed [0.99977017 0.99977193 0.9997573  0.99977915 0.99975635 0.99976446
 0.99977795 0.99976903]
-- Starting with  generator_speed
0.9996675422162713 generator_speed [0.99966839 0.99966724 0.99966497 0.9996655  0.99966926 0.99966739
 0.99966794 0.99966964]
-- Starting with  temp_environment
0.6340898079998434 temp_environment [0.63580387 0.63505672 0.63325894 0.63244797 0.63286099 0.63582767
 0.63367803 0.63378427]
-- Starting with  temp_hydraulic_oil
0.9470137324468012 temp_hydraulic_oil [0.94781703 0.94710144 0.94694142 0.94747793 0.9463428  0.94730556
 0.94701167 0.94611202]
-- Starting with  temp_gear_bearing
0.940280830353346 temp_gear_bearing

In [14]:


model=LGBMRegressor()
#scoreMult_regr(model,X,y,train_size=0.5,ntimes=1,printAllScores=False,batchDisplay=True)

#print(type(X.head()))
#print(X.head().dtypes)
#print(y.head().dtypes)

X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.7)


model.fit(X_train,y_train)
y_pred=model.predict(X_test)
score=r2_score(y_test,y_train)
print(score)

#CV(model, X,y,nsplits=3,ntimes=1,printIntRes=True,printRes=True,n_jobs=-1)

ValueError: Found input variables with inconsistent numbers of samples: [410158, 957032]

In [None]:
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we