# Outage Duration Prediction

**Name(s)**: Neil Sharma, Xiang Ding

**Website Link**: (your website link)

## Code

In [2]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer
from sklearn.impute import SimpleImputer

# Model selection
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, HuberRegressor, QuantileRegressor

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import plotly.express as px
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'

### Framing the Problem

Prediction Problem Type: This problem type is regression, as we are trying to predict a continuous quantity, in this case how long an outage occurs.

Response Variable: Our response variable will be, in minutes, how long an outage occurs for. This value can be seen in the dataset as 'OUTAGE.DURATION'

Metric: We will use mean squared error (MSE) loss as our metric for evaluation. We decided on MSE due to its sensitivity to outliers and its ability to be easily understood as it maintains the same units as our response variable, 'OUTAGE.DURATION'.

In [13]:
##################################################
#     DATA CLEANING CODE FROM PROJECT 3          #
##################################################
df = pd.read_excel('outage.xlsx', skiprows = 5)

df = df.set_index('OBS')
df = df.iloc[1: , :]
df = df[df.columns[1:]]

df['OUTAGE.START.DATE'] = pd.to_datetime(df['OUTAGE.START.DATE'])
df['OUTAGE.START.DATE'] = df['OUTAGE.START.DATE'].dt.date

df['OUTAGE.RESTORATION.DATE'] = pd.to_datetime(df['OUTAGE.RESTORATION.DATE'])
df['OUTAGE.RESTORATION.DATE'] = df['OUTAGE.RESTORATION.DATE'].dt.date
df['CUSTOMERS.AFFECTED_MISSING'] = df['CUSTOMERS.AFFECTED'].isna().astype(int)


df = df[["YEAR", "MONTH", 'U.S._STATE', 'NERC.REGION', 'CLIMATE.REGION', 'ANOMALY.LEVEL', 'CLIMATE.CATEGORY',
         'CAUSE.CATEGORY', 'CAUSE.CATEGORY.DETAIL', 'OUTAGE.DURATION', 'DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'PI.UTIL.OFUSA',
         'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.TIME', 'POPULATION']]
df = df.dropna(subset=['ANOMALY.LEVEL', 'CLIMATE.CATEGORY', 'OUTAGE.DURATION', 'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.TIME', 'CUSTOMERS.AFFECTED'])


def categorize_time(time_str):
    if pd.isna(time_str):
        return 'Unknown'  
    time = pd.to_datetime(time_str, format='%I:%M:%S %p', errors='coerce')
    if time.hour < 12:
        return 'Morning'
    else:
        return 'Afternoon/Evening'

# Apply the function to create new columns
df['OUTAGE.START.CATEGORY'] = df['OUTAGE.START.TIME'].apply(categorize_time)
df['OUTAGE.END.CATEGORY'] = df['OUTAGE.RESTORATION.TIME'].apply(categorize_time)


pd.set_option('display.max_columns', None)
df

Unnamed: 0_level_0,YEAR,MONTH,U.S._STATE,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL,CLIMATE.CATEGORY,CAUSE.CATEGORY,CAUSE.CATEGORY.DETAIL,OUTAGE.DURATION,DEMAND.LOSS.MW,CUSTOMERS.AFFECTED,PI.UTIL.OFUSA,OUTAGE.START.TIME,OUTAGE.RESTORATION.TIME,POPULATION,OUTAGE.START.CATEGORY,OUTAGE.END.CATEGORY
OBS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1.0,2011.0,7.0,Minnesota,MRO,East North Central,-0.3,normal,severe weather,,3060,,70000.0,2.2,17:00:00,20:00:00,5348119.0,Afternoon/Evening,Afternoon/Evening
3.0,2010.0,10.0,Minnesota,MRO,East North Central,-1.5,cold,severe weather,heavy wind,3000,,70000.0,2.1,20:00:00,22:00:00,5310903.0,Afternoon/Evening,Afternoon/Evening
4.0,2012.0,6.0,Minnesota,MRO,East North Central,-0.1,normal,severe weather,thunderstorm,2550,,68200.0,2.2,04:30:00,23:00:00,5380443.0,Afternoon/Evening,Afternoon/Evening
5.0,2015.0,7.0,Minnesota,MRO,East North Central,1.2,warm,severe weather,,1740,250,250000.0,2.2,02:00:00,07:00:00,5489594.0,Afternoon/Evening,Afternoon/Evening
6.0,2010.0,11.0,Minnesota,MRO,East North Central,-1.4,cold,severe weather,winter storm,1860,,60000.0,2.1,15:00:00,22:00:00,5310903.0,Afternoon/Evening,Afternoon/Evening
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1523.0,2004.0,6.0,Idaho,WECC,Northwest,0.3,normal,system operability disruption,,95,157,35000.0,0.3,17:35:00,19:10:00,1391802.0,Afternoon/Evening,Afternoon/Evening
1524.0,2011.0,1.0,Idaho,WECC,Northwest,-1.3,cold,intentional attack,vandalism,360,0,0.0,0.4,07:00:00,13:00:00,1584134.0,Afternoon/Evening,Afternoon/Evening
1525.0,2003.0,6.0,Idaho,WECC,Northwest,-0.1,normal,public appeal,,1548,0,0.0,0.3,15:12:00,17:00:00,1363380.0,Afternoon/Evening,Afternoon/Evening
1527.0,2016.0,3.0,Idaho,WECC,Northwest,1.6,warm,intentional attack,sabotage,0,0,0.0,0.4,00:00:00,00:00:00,1680026.0,Afternoon/Evening,Afternoon/Evening


### Baseline Model

In [5]:
X = df[['ANOMALY.LEVEL','CAUSE.CATEGORY']]
y = df['OUTAGE.DURATION']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Preprocessing for numerical data: no transformation needed
# Preprocessing for categorical data: OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['CAUSE.CATEGORY']),
        ('standard', StandardScaler(), ['ANOMALY.LEVEL'])
    ])

# Create a pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 9402680.912145877


In [6]:
trace0 = go.Scatter(
    x=np.arange(len(y_test)),
    y=y_test,
    mode='markers',
    name='Actual Values'
)
trace1 = go.Scatter(
    x=np.arange(len(y_pred)),
    y=y_pred,
    mode='markers',
    name='Predicted Values'
)

# Create layout
layout = go.Layout(
    title='Actual vs Predicted Values',
    xaxis=dict(title='Index'),
    yaxis=dict(title='Outage Duration')
)

# Create figure and add traces
fig = go.Figure(data=[trace0, trace1], layout=layout)

# Show plot
fig.show()

### Final Model

In [16]:
def extract_hour(time_val):
    if pd.isna(time_val):
        return np.nan
    return time_val.hour

df['OUTAGE.START.HOUR'] = df['OUTAGE.START.TIME'].apply(extract_hour)
df['OUTAGE.RESTORATION.HOUR'] = df['OUTAGE.RESTORATION.TIME'].apply(extract_hour)

print(df.shape)

features = ['NERC.REGION', 'CLIMATE.REGION', 'ANOMALY.LEVEL', 'CLIMATE.CATEGORY', 'CAUSE.CATEGORY',
            'CUSTOMERS.AFFECTED', 'POPULATION', 'U.S._STATE', 'CAUSE.CATEGORY.DETAIL',
            'PI.UTIL.OFUSA']  # Add other relevant features
target = 'OUTAGE.DURATION'

X = df[features]
y = df[target]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Preprocessing
numeric_features = ['ANOMALY.LEVEL', 'CUSTOMERS.AFFECTED', 'POPULATION', 'PI.UTIL.OFUSA']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('quantile', QuantileTransformer(n_quantiles=min(675, len(X_train)), output_distribution='uniform'))
])

categorical_features = ['NERC.REGION', 'CLIMATE.REGION', 'CLIMATE.CATEGORY', 'CAUSE.CATEGORY', 'U.S._STATE', 'CAUSE.CATEGORY.DETAIL']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Model - Gradient Boosting Regressor
model = GradientBoostingRegressor(loss='squared_error', min_impurity_decrease=0.01)

# Pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Hyperparameter Tuning
param_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error')

# Fit the model
grid_search.fit(X_train, y_train)

# Post-process predictions: Set negative values to 0
y_pred = grid_search.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print("Best parameters:", grid_search.best_params_)
print("Best score (CV):", -grid_search.best_score_)
print("Test Mean Absolute Error:", mae)

(1056, 20)
Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 100}
Best score (CV): 2103.949307279464
Test Mean Absolute Error: 1811.5526018572634


In [17]:
trace0 = go.Scatter(
    x=np.arange(len(y_test)),
    y=y_test,
    mode='markers',
    name='Actual Values'
)
trace1 = go.Scatter(
    x=np.arange(len(y_pred)),
    y=y_pred,
    mode='markers',
    name='Predicted Values'
)

# Create layout
layout = go.Layout(
    title='Actual vs Predicted Values',
    xaxis=dict(title='Index'),
    yaxis=dict(title='Outage Duration')
)

# Create figure and add traces
fig = go.Figure(data=[trace0, trace1], layout=layout)

# Show plot
fig.show()

In [22]:
from sklearn.metrics import confusion_matrix

def classify_predictions(y_true, y_pred, threshold=240):
    return np.where(np.abs(np.array(y_true) - np.array(y_pred)) <= threshold, 1, 0)

# Assuming y_test and y_pred are your actual and predicted values
# Ensure y_test and y_pred are numpy arrays
y_test_array = np.array(y_test)
y_pred_array = np.array(y_pred)

# Apply the function to get 'positive' (1) and 'negative' (0) predictions
y_classified = classify_predictions(y_test_array, y_pred_array)

# Create a binary array for y_test to match the 'positive' criteria
y_actual_class = np.ones_like(y_test_array)

y_classified, y_actual_class

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

### Fairness Analysis

In [None]:
# TODO