In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import joblib

In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.4


In [4]:
# Load the project dataset. The path is a raw string so that the Windows
# backslashes are taken literally instead of being parsed as escape sequences
# (`\C`, `\p`, `\d` are invalid escapes and trigger a SyntaxWarning on recent Pythons).
data = pd.read_csv(r"D:\Cost_Benefit_AI_App\pythonProject\data\Civil Engineering Global Project Dataset.csv")

In [None]:
# Drop unnecessary columns
# Reassign instead of mutating in place, and ignore a missing column, so that
# re-running this cell is harmless (the in-place version raises KeyError the
# second time because 'Project ID' is already gone).
data = data.drop(columns=['Project ID'], errors='ignore')

In [6]:
# Show the first and last five rows as plain text, one frame after the other.
print(data.head(), data.tail(), sep="\n")

   Certificates  Years of Experience  age  Time Arrival Strafe  Project Cost  \
0             0                    5   45                 5.26      141144.0   
1            12                    7   34                 2.66     1005257.0   
2             0                    4   21                 5.39       51424.0   
3             1                    3   52                 0.99      186154.0   
4             1                    2   46                 1.41      135718.0   

   Project Proximity  Violation Risk Index  Company PCAB Score  \
0             111.56                  0.63                   2   
1             131.46                  0.31                   3   
2             100.31                  0.52                   3   
3             123.69                  0.67                   3   
4             101.17                  0.66                   2   

   Weekly Overtime Hours  Salary Bracket  is_good  
0                    5.0               2        1  
1                 

In [9]:
# Encode categorical column if necessary
# Any non-numeric dtype (object, pandas string, category) needs encoding before
# scaling; checking only for 'object' would let string/category columns through.
if not pd.api.types.is_numeric_dtype(data['Salary Bracket']):
    le = LabelEncoder()
    # Replace each distinct label with an integer code.
    data['Salary Bracket'] = le.fit_transform(data['Salary Bracket'])

In [11]:
# Features and Target
# 'is_good' is the label to predict; every remaining column is a model input.
y = data['is_good']
X = data.drop(columns=['is_good'])
features = X.columns

In [12]:
# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [14]:
# Train XGBoost Classifier
# `use_label_encoder` is dropped: the installed XGBoost reports it as an unused
# parameter, so passing it only produces a warning.
# random_state is fixed so repeated runs build the same model.
model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)

In [15]:
# Predict
# Predicted class label for every row of the held-out test set.
y_pred = model.predict(X_test)
# Echo the predictions (the output is abbreviated for long arrays).
print(y_pred)

[0 1 0 ... 1 1 1]


In [16]:
# Evaluation
# NOTE: MAE, MSE and R² are regression metrics. With 0/1 labels and 0/1
# predictions every error has magnitude 1, so MAE and MSE both equal
# 1 - accuracy; read the classification report for per-class quality.
print("\nEvaluation Metrics:")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("R²:", r2_score),
):
    print(metric_label, metric_fn(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Evaluation Metrics:
Accuracy: 0.79743
MAE: 0.20257
MSE: 0.20257
R²: 0.18971981331144538

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.80      0.80     99952
           1       0.80      0.80      0.80    100048

    accuracy                           0.80    200000
   macro avg       0.80      0.80      0.80    200000
weighted avg       0.80      0.80      0.80    200000



In [18]:
# Save model and scaler
# Both artifacts are needed at inference time: inputs must be scaled with the
# same fitted scaler before being passed to the classifier.
model_path = r"D:\Cost_Benefit_AI_App\pythonProject\models\xgb_classifier.pkl"
scaler_path = r"D:\Cost_Benefit_AI_App\pythonProject\models\scaler.pkl"
joblib.dump(model, model_path)
joblib.dump(scaler, scaler_path)

['D:\\Cost_Benefit_AI_App\\pythonProject\\models\\scaler.pkl']

In [19]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

In [21]:
# Confusion matrix visualization
# Rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
class_ticks = [0, 1]

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.tight_layout()
# Write the figure to disk, then close it so it is not rendered inline.
fig.savefig(r"D:\Cost_Benefit_AI_App\pythonProject\models\confusion_matrix.png")
plt.close(fig)

In [22]:
# Visualization for classification report
# Per-class precision, recall and F1 drawn as grouped bars (one group per class).
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred, average=None, labels=[0, 1])

labels = ['Class 0', 'Class 1']
x = np.arange(len(labels))
bar_width = 0.25

fig, ax = plt.subplots(figsize=(8, 5))
# Each series is shifted by one bar width so the three bars sit side by side.
for offset, scores, series_name in (
    (-bar_width, precision, 'Precision'),
    (0, recall, 'Recall'),
    (bar_width, f1, 'F1 Score'),
):
    ax.bar(x + offset, scores, width=bar_width, label=series_name)

ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
ax.set_title("Classification Report Metrics")
ax.legend()
fig.tight_layout()
# Write the figure to disk, then close it so it is not rendered inline.
fig.savefig(r"D:\Cost_Benefit_AI_App\pythonProject\models\classification_matrix.png")
plt.close(fig)