In [9]:
import pandas as pd

In [10]:
# Read the steel-industry energy dataset from the CSV that sits next to this notebook
file_path = 'Steel_industry_data.csv'
data = pd.read_csv(file_path)

# Peek at the first five rows, then list every column name, to get a feel for the schema
print(data.head(5))
print("\n", list(data.columns))

               date  Usage_kWh  Lagging_Current_Reactive.Power_kVarh  \
0  01/01/2018 00:15       3.17                                  2.95   
1  01/01/2018 00:30       4.00                                  4.46   
2  01/01/2018 00:45       3.24                                  3.28   
3  01/01/2018 01:00       3.31                                  3.56   
4  01/01/2018 01:15       3.82                                  4.50   

   Leading_Current_Reactive_Power_kVarh  CO2(tCO2)  \
0                                   0.0        0.0   
1                                   0.0        0.0   
2                                   0.0        0.0   
3                                   0.0        0.0   
4                                   0.0        0.0   

   Lagging_Current_Power_Factor  Leading_Current_Power_Factor   NSM  \
0                         73.21                         100.0   900   
1                         66.77                         100.0  1800   
2                         70.

Attribute Information:

Date Continuous - time data taken on the first of the month

Usage_kWh - Industry Energy Consumption Continuous kWh

Lagging Current - reactive power Continuous kVarh

Leading Current - reactive power Continuous kVarh

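CO2 - Continuous ppm (note: the corresponding column is named `CO2(tCO2)`, which suggests tonnes of CO2 rather than ppm)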

NSM - Number of Seconds from midnight Continuous S

Week status (`WeekStatus`) - Categorical: Weekend (0) or Weekday (1)

Day of week - Categorical: Sunday, Monday, ..., Saturday

Load Type - Categorical Light Load, Medium Load, Maximum Load



Dependent Variable (Target): Usage_kWh

Independent Variables (Features):

Lagging_Current_Reactive.Power_kVarh

Leading_Current_Reactive_Power_kVarh

CO2(tCO2)

Lagging_Current_Power_Factor

Leading_Current_Power_Factor

NSM

WeekStatus (Encoded)

Day_of_week (Encoded)

Load_Type (Encoded)

In [11]:
# Row count of the loaded frame, i.e. the number of readings in the dataset
total_records = len(data)
total_records

35040

In [13]:
# Convert the 'date' column from text to a proper datetime dtype.
#
# The format is given explicitly instead of letting pandas guess it: the first
# value ("01/01/2018 00:15") is ambiguous between day-first and month-first,
# and a wrong guess either silently swaps day and month on some rows or raises
# on days greater than 12, depending on the pandas version.
# NOTE(review): the timestamps are assumed to be day-first (DD/MM/YYYY HH:MM);
# confirm against the raw CSV. If the assumption is wrong this line fails
# loudly with a ValueError rather than producing shifted dates.
data['date'] = pd.to_datetime(data['date'], format='%d/%m/%Y %H:%M')

# Verify the conversion by checking the data types again
data.dtypes

date                                    datetime64[ns]
Usage_kWh                                      float64
Lagging_Current_Reactive.Power_kVarh           float64
Leading_Current_Reactive_Power_kVarh           float64
CO2(tCO2)                                      float64
Lagging_Current_Power_Factor                   float64
Leading_Current_Power_Factor                   float64
NSM                                              int64
WeekStatus                                      object
Day_of_week                                     object
Load_Type                                       object
dtype: object

In [15]:
# Group every column of `data` by the kind of information it carries.

# Columns with exactly two distinct values are treated as binary.
binary_cols = [name for name in data.columns if data[name].nunique() == 2]

# Hand-labelled groups, based on prior knowledge of the dataset rather than inferred.
nominal_cols = ['Day_of_week']
categorical_cols = ['WeekStatus', 'Load_Type']

# Float/int columns count as numerical unless they were already flagged as binary.
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
remaining_numerical = [name for name in numerical_cols if name not in binary_cols]

# Anything pandas stores with a datetime dtype.
datetime_cols = data.select_dtypes(include=['datetime']).columns.tolist()

categorization = dict(
    Binary=binary_cols,
    Nominal=nominal_cols,
    Categorical=categorical_cols,
    Numerical=remaining_numerical,
    Datetime=datetime_cols,
)

# One table column per group; wrapping each list in a Series lets groups of
# different lengths sit side by side (shorter ones are padded with NaN).
categorization_df = pd.DataFrame(
    {label: pd.Series(members) for label, members in categorization.items()}
)

categorization_df

Unnamed: 0,Binary,Nominal,Categorical,Numerical,Datetime
0,WeekStatus,Day_of_week,WeekStatus,Usage_kWh,date
1,,,Load_Type,Lagging_Current_Reactive.Power_kVarh,
2,,,,Leading_Current_Reactive_Power_kVarh,
3,,,,CO2(tCO2),
4,,,,Lagging_Current_Power_Factor,
5,,,,Leading_Current_Power_Factor,
6,,,,NSM,


In [21]:
# Flatten the categorization dictionary into (feature, category) pairs so that
# each feature sits on its own row next to the group it was assigned to.
# A feature listed under several groups yields one row per group.
features_and_types = [
    (feature, category)
    for category, features in categorization.items()
    for feature in features
    if pd.notnull(feature)  # defensive: skip any missing entries in a group
]

# Build the table from the list created above. The previous version referenced
# `features_and_types_corrected`, a name no cell defines, so it raised
# NameError on a fresh kernel.
features_types_df_no_nan = pd.DataFrame(
    features_and_types, columns=['Feature Name', 'Data Type Categorized']
)

features_types_df_no_nan


Unnamed: 0,Feature Name,Data Type Categorized
0,WeekStatus,Binary
1,Day_of_week,Nominal
2,WeekStatus,Categorical
3,Load_Type,Categorical
4,Usage_kWh,Numerical
5,Lagging_Current_Reactive.Power_kVarh,Numerical
6,Leading_Current_Reactive_Power_kVarh,Numerical
7,CO2(tCO2),Numerical
8,Lagging_Current_Power_Factor,Numerical
9,Leading_Current_Power_Factor,Numerical


In [22]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the columns
# that DataFrame.describe() summarises by default.
summary_statistics = data.describe()
summary_statistics

Unnamed: 0,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM
count,35040.0,35040.0,35040.0,35040.0,35040.0,35040.0,35040.0
mean,27.386892,13.035384,3.870949,0.011524,80.578056,84.36787,42750.0
std,33.44438,16.306,7.424463,0.016151,18.921322,30.456535,24940.534317
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.2,2.3,0.0,0.0,63.32,99.7,21375.0
50%,4.57,5.0,0.0,0.0,87.96,100.0,42750.0
75%,51.2375,22.64,2.09,0.02,99.0225,100.0,64125.0
max,157.18,96.91,27.76,0.07,100.0,100.0,85500.0


In [None]:
from sklearn.feature_selection import VarianceThreshold

# Feature matrix: every numeric column of `data` except the target 'Usage_kWh'.
# Defined here so the cell runs on a fresh kernel (nothing earlier creates `X`).
# Text-valued categorical columns and the datetime column are left out because
# VarianceThreshold only accepts numeric input; encode them first if they
# should take part in the selection.
X = data.drop(columns=['Usage_kWh']).select_dtypes(include='number')

# Drop features whose variance falls below 0.8 * (1 - 0.8) = 0.16.
# NOTE(review): this cut-off is the variance of a Bernoulli variable with
# p = 0.8; the features here are continuous and unscaled, so confirm the
# threshold is meaningful for them.
selector = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_reduced = selector.fit_transform(X)

# Boolean mask over X.columns: True where the feature passed the threshold.
kept_mask = selector.get_support()

# Features selected / removed, both in the original column order so the result
# is deterministic (a set difference would give an arbitrary order).
features_selected = X.columns[kept_mask]
features_removed_variance = X.columns[~kept_mask].tolist()


In [None]:
import numpy as np

# Absolute pairwise correlation between the features that survived selection
corr_matrix = X[features_selected].corr().abs()

# Keep only the strict upper triangle so every feature pair is looked at once
# and the diagonal (self-correlation of 1.0) is ignored. The mask is built with
# the builtin `bool`: the `np.bool` alias was removed in NumPy 1.24 and raises
# AttributeError there.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Features whose correlation with an earlier column is greater than 0.8
highly_correlated_features = [
    column for column in upper.columns if (upper[column] > 0.8).any()
]

# You might decide to remove these or further investigate which ones to keep


In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows so the model is fitted on a training split only.
# `X` is the feature matrix used in the variance-threshold step and the target
# is 'Usage_kWh'. The split is seeded so the cell gives the same result on
# every run.
# NOTE(review): this is a shuffled split; the rows are time-ordered readings,
# so switch to a chronological split if the model is later used for forecasting.
X_train, X_test, y_train, y_test = train_test_split(
    X, data['Usage_kWh'], test_size=0.2, random_state=42
)

# Fit a model (using RandomForest for demonstration); seeded for reproducible
# feature importances.
model = RandomForestRegressor(random_state=42)
model.fit(X_train[features_selected], y_train)

# Get feature importances (one value per column of the training matrix)
importances = model.feature_importances_

# Positions of the features, most important first
features_importance_sorted = np.argsort(importances)[::-1]

# Print the feature importances
print("Feature ranking:")
for rank, idx in enumerate(features_importance_sorted, start=1):
    print(f"{rank}. feature {features_selected[idx]} ({importances[idx]})")

# Based on this, you might decide to remove features with very low importance
