# Jira - Resolution Time Prediction Model

## Problem Statement
*Predict the resolution time (in hours) of Jira issues using metadata, text, and temporal features.*


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read file
fdf=pd.read_csv("GFG_FINAL.csv")

  fdf=pd.read_csv("GFG_FINAL.csv")


In [4]:
# Droping the Columns which the Columns consist of more than 95% of Null Value
missing_ratio=fdf.isnull().mean()
df=fdf.loc[:, missing_ratio < 1]

df=fdf.loc[:, missing_ratio < 0.95]

In [5]:
# Columns Selection
cols = [
    "Summary",
    "Description",
    "Comment", "Comment.1", "Comment.2",
    "Comment.3", "Comment.4", "Comment.5",
    "Issue Type",
    "Priority",
    "Status",
    "Project key",
    "Project name",
    "Project type",
    "Created",
    "Resolved"
]


df = df[cols].copy()

# Keep only rows where resolution time can be computed
df = df.dropna(subset=["Created", "Resolved"])

print("Shape:", df.shape)

Shape: (16660, 16)


In [6]:
# Changing the type to datetime for certain columns
df["Created"] = pd.to_datetime(df["Created"], errors="coerce")
df["Resolved"] = pd.to_datetime(df["Resolved"], errors="coerce")

# Drop invalid datetime rows (safety)
df = df.dropna(subset=["Created", "Resolved"])

  df["Created"] = pd.to_datetime(df["Created"], errors="coerce")
  df["Resolved"] = pd.to_datetime(df["Resolved"], errors="coerce")


In [7]:
# Creating the Label Column
df["resolution_time_hours"] = (
    df["Resolved"] - df["Created"]
).dt.total_seconds() / 3600

In [8]:
df = df[df["resolution_time_hours"] > 0]

In [9]:
# Merging all Comments Columns
comment_cols = [
    "Comment", "Comment.1", "Comment.2",
    "Comment.3", "Comment.4", "Comment.5"
]

df["all_comments"] = (
    df[comment_cols]
    .fillna("")
    .astype(str)
    .agg(" ".join, axis=1)
)

# Drop individual comment columns
df.drop(columns=comment_cols, inplace=True)

In [10]:
# Creating more time based Columns for deeper clarity
df["created_hour"] = df["Created"].dt.hour
df["created_dayofweek"] = df["Created"].dt.dayofweek
df["is_weekend"] = df["created_dayofweek"].isin([5, 6]).astype(int)

In [11]:
# Null value handling
df['Priority']=df['Priority'].fillna('Unknown').astype(str)

In [12]:
# Sorting the df as per Created timeline
df = df.sort_values("Created")

In [13]:
# Splitting the df
split_idx = int(len(df) * 0.8)

train_df = df.iloc[:split_idx]
test_df  = df.iloc[split_idx:]

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (13249, 15)
Test shape: (3313, 15)


In [14]:
# Splitting the Label
y_train = train_df["resolution_time_hours"]
y_test  = test_df["resolution_time_hours"]

In [15]:
# Dropping the unneccesary columns and splitting the Feature columns
drop_cols = [
    "resolution_time_hours",
    "Created",  # already used for splitting
    "Resolved"
]

X_train = train_df.drop(columns=drop_cols)
X_test  = test_df.drop(columns=drop_cols)

In [16]:
# Segregating the feature columns

text_cols = ["Summary", "Description", "all_comments"]

cat_cols = [
    "Issue Type",
    "Priority",
    "Status",
    "Project key",
    "Project name",
    "Project type"
]

num_cols = [
    "created_hour",
    "created_dayofweek",
    "is_weekend"
]

In [17]:
# Combining the text on text columns

X_train["text_combined"] = (
    X_train[text_cols].fillna("").agg(" ".join, axis=1)
)
X_test["text_combined"] = (
    X_test[text_cols].fillna("").agg(" ".join, axis=1)
)

In [18]:
# Performing the TF-IDF on text columns

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_train_tfidf = tfidf.fit_transform(X_train["text_combined"])
X_test_tfidf  = tfidf.transform(X_test["text_combined"])

In [19]:
# Performing the OneHotEncoding on Catgorical columns

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

X_train_cat = ohe.fit_transform(X_train[cat_cols])
X_test_cat = ohe.transform(X_test[cat_cols])

In [20]:
# hstacking the numarical columns (optional)
from scipy.sparse import hstack

X_train_num = X_train[num_cols].values
X_test_num  = X_test[num_cols].values

In [21]:
# Combining Final X_train and X_test

X_train_final = hstack([X_train_tfidf, X_train_cat, X_train_num])
X_test_final  = hstack([X_test_tfidf, X_test_cat, X_test_num])

print("Final train shape:", X_train_final.shape)
print("Final test shape:", X_test_final.shape)

Final train shape: (13249, 10014)
Final test shape: (3313, 10014)


### Linear Baseline: Ridge Regression

In [36]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error

baseline = Ridge(alpha=1.0)

baseline.fit(X_train_final, y_train)

y_pred_base = baseline.predict(X_test_final)

mae = mean_absolute_error(y_test, y_pred_base)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_base))

print("Baseline MAE:", mae)
print("Baseline RMSE:", rmse)

Baseline MAE: 10723.825069870922
Baseline RMSE: 12367.781687514444


### LightGBM (raw target)

In [26]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [27]:
lgb_model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

lgb_model.fit(
    X_train_final,
    y_train
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.206156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42641
[LightGBM] [Info] Number of data points in the train set: 13249, number of used features: 10010
[LightGBM] [Info] Start training from score 5524.899628


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,300
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [28]:
y_pred_lgb = lgb_model.predict(X_test_final)



In [30]:
lgb_mae = mean_absolute_error(y_test, y_pred_lgb)
lgb_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))

print("LightGBM MAE:", lgb_mae)
print("LightGBM RMSE:", lgb_rmse)

LightGBM MAE: 4251.804980784057
LightGBM RMSE: 4742.646173056067


### LightGBM (log target)

In [31]:
# Because resolution time is highly skewed, a log transformation is applied to stabilize variance.

y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

In [32]:
lgb_log = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

lgb_log.fit(X_train_final, y_train_log)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.180230 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42641
[LightGBM] [Info] Number of data points in the train set: 13249, number of used features: 10010
[LightGBM] [Info] Start training from score 7.699139


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,300
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [33]:
y_pred_log = lgb_log.predict(X_test_final)
y_pred = np.expm1(y_pred_log)



In [34]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae_log = mean_absolute_error(y_test, y_pred)
rmse_log = np.sqrt(mean_squared_error(y_test, y_pred))

print("Log-LightGBM MAE:", mae_log)
print("Log-LightGBM RMSE:", rmse_log)

Log-LightGBM MAE: 3534.8435393083046
Log-LightGBM RMSE: 3970.074069615527


### Random Forest regressor

In [35]:
## Additional Model Comparison: Random Forest
## To verify that performance gains are not specific to LightGBM, a Random Forest regressor is trained using the same features and log-transformed target.


from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train_final, y_train_log)

y_pred_log_rf = rf.predict(X_test_final)
y_pred_rf = np.expm1(y_pred_log_rf)

rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print("Random Forest MAE:", rf_mae)
print("Random Forest RMSE:", rf_rmse)


Random Forest MAE: 4817.254496075236
Random Forest RMSE: 5365.43946154648


## Model Performance Comparison

| Model | Target | MAE (hours) | RMSE (hours) |
|-----|------|------------|--------------|
| Ridge Regression | Raw | ~10,723 | ~12,367 |
| LightGBM | Raw | ~4,251 | ~4,742 |
| LightGBM | Log | ~3,534 | ~3,970 |
| Random Forest | Log | ~4,817 | ~5,365 |
