In [1]:
import re

import pandas as pd

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVR

In [2]:
# Read the raw incident export from the working directory into a DataFrame.
data_source = pd.read_csv(filepath_or_buffer='incident.csv')

In [3]:
# Preview the first five rows to sanity-check the load.
data_source.head(n=5)

Unnamed: 0,number,opened_at,state,assignment_group,assigned_to,short_description,description,contact_type,reopen_count,resolved_at,close_code,close_notes,closed_at
0,INC0331268,2023-03-11 21:49:01,Closed,NY DB,Nirali Patel,Error on WiFi Router - Laptop Issues - 401,User reported Error on WiFi Router - Laptop Is...,Self-service,3,2023-11-12 00:18:49,Resolved by request,Resolution Notes for Error on WiFi Router - La...,2023-12-20 12:51:27
1,INC0331267,2023-10-07 12:14:11,Closed,Business Application Registration Approval Group,Amelia Caputo,Issue on User Account - Security Incidents - 407,User reported Issue on User Account - Security...,Self-service,7,2024-02-09 17:31:12,Resolved by change,Resolution Notes for Issue on User Account - S...,2024-02-11 13:24:25
2,INC0331266,2022-09-20 19:32:41,Closed,Catalog Request Approvers for Sales,Vivian Brzostowski,Issue on Printer - WiFi Problems - 848,User reported Issue on Printer - WiFi Problems...,Chat,6,2022-09-25 10:00:00,Workaround provided,Resolution Notes for Issue on Printer - WiFi P...,2023-07-18 19:22:36
3,INC0331265,2023-11-20 18:34:40,Closed,San Diego Tech Lounge,Mitch Schattner,Malfunction on Mobile Device - Laptop Issues -...,User reported Malfunction on Mobile Device - L...,Chat,1,2024-01-30 04:55:21,Resolved by caller,Resolution Notes for Malfunction on Mobile Dev...,2024-02-04 23:35:20
4,INC0331264,2023-11-10 22:43:08,Closed,San Diego Tech Lounge,Isaac Zackery,Issue on Database Server - Software Bugs - 44,User reported Issue on Database Server - Softw...,Email,6,2023-11-18 16:33:32,Resolved by change,Resolution Notes for Issue on Database Server ...,2023-11-29 11:09:17


In [4]:
# 1. Data structure: overall dimensions first, then per-column dtypes and
#    non-null counts.
dataset_shape = data_source.shape
print("Dataset Shape:", dataset_shape)
print("\nColumn Data Types and Description:")
data_source.info()

Dataset Shape: (150000, 13)

Column Data Types and Description:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   number             150000 non-null  object
 1   opened_at          150000 non-null  object
 2   state              150000 non-null  object
 3   assignment_group   150000 non-null  object
 4   assigned_to        150000 non-null  object
 5   short_description  150000 non-null  object
 6   description        150000 non-null  object
 7   contact_type       150000 non-null  object
 8   reopen_count       150000 non-null  int64 
 9   resolved_at        150000 non-null  object
 10  close_code         150000 non-null  object
 11  close_notes        150000 non-null  object
 12  closed_at          150000 non-null  object
dtypes: int64(1), object(12)
memory usage: 14.9+ MB


In [5]:
# 2. Summary statistics for every column; include='all' covers the object
#    columns as well as the numeric one.
print("\nSummary Statistics:")
summary_stats = data_source.describe(include='all')
summary_stats


Summary Statistics:


Unnamed: 0,number,opened_at,state,assignment_group,assigned_to,short_description,description,contact_type,reopen_count,resolved_at,close_code,close_notes,closed_at
count,150000,150000,150000,150000,150000,150000,150000,150000,150000.0,150000,150000,150000,150000
unique,150000,149792,2,48,627,125200,146100,6,,149592,10,144491,148850
top,INC0331268,2023-03-22 19:40:42,Closed,Service Desk,Model Manager,Error on Firewall - Software Bugs - 776,User reported Issue on User Account - Software...,Self-service,,2024-02-02 13:06:54,Resolved by caller,Resolution Notes for Error on Database Server ...,2024-03-09 06:29:10
freq,1,2,149984,6145,296,6,4,42854,,3,15188,4,3
mean,,,,,,,,,4.507373,,,,
std,,,,,,,,,2.872279,,,,
min,,,,,,,,,0.0,,,,
25%,,,,,,,,,2.0,,,,
50%,,,,,,,,,5.0,,,,
75%,,,,,,,,,7.0,,,,


In [6]:
# Parse the two timestamp columns used for the resolution-time target from
# text into datetime64 values.
for timestamp_column in ('opened_at', 'resolved_at'):
    data_source[timestamp_column] = pd.to_datetime(data_source[timestamp_column])

In [7]:
# Target variable: elapsed time between opening and resolving an incident,
# expressed in days (seconds divided by the number of seconds in a day).
SECONDS_PER_DAY = 3600 * 24
resolution_delta = data_source['resolved_at'] - data_source['opened_at']
data_source['time_to_resolution'] = resolution_delta.dt.total_seconds() / SECONDS_PER_DAY

In [8]:
# Restrict the analysis to incidents whose state is 'Closed'.
is_closed = data_source['state'] == 'Closed'
closed_incidents = data_source[is_closed]

In [9]:
# Removing the outliers with the 1.5 * IQR rule: keep only rows whose
# time_to_resolution lies strictly between Q1 - 1.5*IQR and Q3 + 1.5*IQR.
# NOTE: the quartiles are computed from the current closed_incidents, so
# re-running this cell trims the already-trimmed data again.
Q1 = closed_incidents['time_to_resolution'].quantile(0.25)
Q3 = closed_incidents['time_to_resolution'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
within_bounds = (
    (closed_incidents['time_to_resolution'] > lower_bound)
    & (closed_incidents['time_to_resolution'] < upper_bound)
)

# .copy() yields an independent DataFrame, so the later cells that add or
# overwrite columns on closed_incidents do not write into a filtered slice
# (which raises SettingWithCopyWarning).
closed_incidents = closed_incidents[within_bounds].copy()

In [10]:
def extract_error_code(string):
    """Return the first standalone run of digits found in ``string``.

    A "standalone" run is one delimited by word boundaries, so digits glued
    to letters (as in ``"abc123"``) are not matched.

    Parameters
    ----------
    string : str
        Text to search, e.g. an incident short description.

    Returns
    -------
    str or int
        The matched digits as a string, or the integer ``0`` when no
        standalone number is present or ``string`` is not a ``str`` (for
        example ``None`` or a NaN standing in for a missing value).
    """
    # re.search raises TypeError on non-string input; treat such values the
    # same as "no code found" so a missing description cannot abort an apply().
    if not isinstance(string, str):
        return 0
    match = re.search(r'\b\d+\b', string)
    return match.group(0) if match else 0

In [11]:
# Feature engineering: pull the numeric code out of each short description
# and store it as an integer column.
raw_error_codes = closed_incidents['short_description'].apply(extract_error_code)
closed_incidents['error_code'] = raw_error_codes.astype(int)

In [12]:
# Model inputs: the three encoded text columns plus the parsed error code.
features = [
    'assignment_group',
    'assigned_to',
    'contact_type',
    'error_code',
]
# Regression target, kept as a one-element list for DataFrame column selection.
labels = ['time_to_resolution']


In [13]:

# LabelEncoder maps each distinct value it is fitted on to an integer code.
label_encoder = LabelEncoder()

In [14]:
# Encode each text column as integer codes. A separate encoder is fitted and
# kept per column so the codes can later be mapped back to the original text
# (or applied to new incidents); refitting one shared encoder would retain
# only the classes_ of the last column processed.
label_encoders = {}
for feature in ['assignment_group', 'assigned_to', 'contact_type']:
    label_encoders[feature] = LabelEncoder()
    closed_incidents[feature] = label_encoders[feature].fit_transform(closed_incidents[feature])

# Keep the original name bound to the encoder fitted last, as it was before.
label_encoder = label_encoders[feature]

In [15]:
# Hold out 20% of the rows for final evaluation; the fixed seed makes the
# split repeatable.
feature_matrix = closed_incidents[features]
target_vector = closed_incidents[labels].values.ravel()
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    target_vector,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Hyper-parameter candidates for each model family's grid search.
svr_param_grid = dict(
    C=[0.1, 1, 10, 100],
    epsilon=[0.1, 0.01, 0.001],
)
gb_param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.05, 0.1, 0.2],
)
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
)

In [17]:
# Tune LinearSVR over svr_param_grid with 5-fold cross-validation, scored by
# negative mean squared error (higher, i.e. closer to zero, is better).
# random_state seeds the solver's random number generator so repeated runs
# give the same fitted models and scores; 42 matches the train/test split.
# NOTE(review): the inputs are unscaled integer codes and LinearSVR is
# sensitive to feature scale — consider standardising them; confirm impact.
svr_grid_search = GridSearchCV(
    LinearSVR(dual=True, random_state=42),
    svr_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
svr_grid_search.fit(X_train, y_train)
svr_best_model = svr_grid_search.best_estimator_
svr_cv_results = svr_grid_search.cv_results_
# One value per parameter combination: the score on the training folds,
# averaged over the 5 splits.
mean_train_scores_svr = svr_cv_results['mean_train_score']
print("Mean training scores for Linear SVR:")
print(mean_train_scores_svr)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Mean training scores for Linear SVR:
[-22620.34387355 -19619.64962257 -21800.06977283 -28476.51578693
 -38684.12500615 -31673.44542237 -34112.80275253 -44165.63822799
 -35340.10490693 -51587.35392637 -51342.30895447 -98690.46484895]




In [18]:
# Tune GradientBoostingRegressor over gb_param_grid with 5-fold
# cross-validation, scored by negative mean squared error.
# random_state seeds the randomness inside the boosted trees so repeated runs
# give the same fitted models and scores; 42 matches the train/test split.
gb_grid_search = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    gb_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
gb_grid_search.fit(X_train, y_train)
gb_best_model = gb_grid_search.best_estimator_
gb_cv_results = gb_grid_search.cv_results_
# One value per parameter combination: the score on the training folds,
# averaged over the 5 splits.
mean_train_scores_gb = gb_cv_results['mean_train_score']
print("Mean training scores for Gradient Boosting Regressor:")
print(mean_train_scores_gb)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Mean training scores for Gradient Boosting Regressor:
[-18568.13474211 -18545.10608672 -18523.09736029 -18544.36030181
 -18501.44192475 -18462.25781859 -18500.28847352 -18424.38188706
 -18356.31492614]


In [19]:
# Random Forest grid search — currently disabled (left commented out), so
# rf_best_model is never created in this notebook.
# rf_grid_search = GridSearchCV(RandomForestRegressor(), rf_param_grid, cv=5, scoring='neg_mean_squared_error')
# rf_grid_search.fit(X_train, y_train)
# rf_best_model = rf_grid_search.best_estimator_

In [20]:
# Predict on the held-out test features with each tuned model.
svr_y_pred, gb_y_pred = (
    fitted_model.predict(X_test)
    for fitted_model in (svr_best_model, gb_best_model)
)
# rf_y_pred = rf_best_model.predict(X_test)


In [21]:
# Test-set error of the tuned Linear SVR model.
mse_svr = mean_squared_error(y_true=y_test, y_pred=svr_y_pred)
mae_svr = mean_absolute_error(y_true=y_test, y_pred=svr_y_pred)

for caption, value in (
    ("Linear SVR Mean Squared Error:", mse_svr),
    ("Linear SVR Mean Absolute Error:", mae_svr),
):
    print(caption, value)

Linear SVR Mean Squared Error: 24302.069176781475
Linear SVR Mean Absolute Error: 113.51486422643849


In [25]:
# Test-set error of the tuned Gradient Boosting model.
mse_gb, mae_gb = (
    metric(y_test, gb_y_pred)
    for metric in (mean_squared_error, mean_absolute_error)
)

print("Gradient Boosting Regressor Mean Squared Error:", mse_gb)
print("Gradient Boosting Regressor Mean Absolute Error:", mae_gb)

Gradient Boosting Regressor Mean Squared Error: 19055.995151951884
Gradient Boosting Regressor Mean Absolute Error: 113.9344848395556


In [28]:
# Baseline figures for comparison with the models above.
# NOTE(review): no linear regression model is fitted in this notebook, so
# these numbers are hard-coded and will not change on re-run — confirm their
# source.
print(
"""
Linear Regression Model Values: 
Mean Squared Error: 26394.24716318561
Mean Absolute Error: 137.6327382981697
""")


Liner Regression Model Values: 
Mean Squared Error: 26394.24716318561
Mean Absolute Error: 137.6327382981697



In [None]:
# Random Forest evaluation — disabled together with the Random Forest grid
# search, which would have to run first to provide rf_y_pred.
# NOTE(review): r2_score is not imported in this notebook's import cell; add
# it to the sklearn.metrics import before re-enabling these lines.
# mse_rf = mean_squared_error(y_test, rf_y_pred)
# r2_rf = r2_score(y_test, rf_y_pred)
# print("Random Forest Regressor Mean Squared Error:", mse_rf)
# print("Random Forest Regressor R-squared score:", r2_rf)