# Training Phase

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the labelled training workbook into a DataFrame.
data = pd.read_excel('train.xlsx')

# Feature: the 'Entity Title' column. Target: the 'Category' column.
X, y = data['Entity Title'], data['Category']

# Map each category value to an integer id. The fitted encoder is kept in
# `label_encoder` so integer predictions can be decoded back to names.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Learn the TF-IDF vocabulary and weights from the training titles only, then
# project the validation titles with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_valid_vectorized = vectorizer.transform(X_valid)

# Baseline classifier: XGBoost with its default hyperparameters.
xgb_model = XGBClassifier()
xgb_model.fit(X=X_train_vectorized, y=y_train)

# Score the held-out split: share of validation rows labelled correctly.
y_pred_valid = xgb_model.predict(X_valid_vectorized)
accuracy = accuracy_score(y_true=y_valid, y_pred=y_pred_valid)
print(f"Validation Accuracy: {accuracy}")


Validation Accuracy: 0.8053097345132744


In [None]:
# Decode the integer labels back into category names, for both the ground
# truth and the model output on the validation split.
actual_categories_valid = label_encoder.inverse_transform(y_valid)
predicted_categories_valid = label_encoder.inverse_transform(y_pred_valid)

# Give the validation titles a fresh 0..n-1 index so they line up row-for-row
# with the two decoded label arrays when the frame is built.
entity_titles_valid = X_valid.reset_index(drop=True)

# One row per validation example: title, predicted label, true label.
validation_results = pd.DataFrame(
    {
        'Entity Title': entity_titles_valid,
        'Predicted Category': predicted_categories_valid,
        'Actual Category': actual_categories_valid,
    }
)

# Write the table to disk, then continue from the copy read back from the file.
validation_results.to_excel('validation_results.xlsx', index=False)
validation_results = pd.read_excel('validation_results.xlsx')

# Keep only the rows whose predicted label disagrees with the actual label.
wrong_predictions = validation_results.loc[
    validation_results['Predicted Category'].ne(validation_results['Actual Category'])
]

# Export the misclassified rows for manual review.
wrong_predictions.to_excel('wrong_predictions.xlsx', index=False)


155 out of 750 validation predictions are wrong.
28 of those 155 are in BM.

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Re-read the persisted validation table so this cell works from the file on
# disk rather than from in-memory variables.
results_df = pd.read_excel('validation_results.xlsx')

# Pull out the two label columns being compared.
actual_categories = results_df['Actual Category']
predicted_categories = results_df['Predicted Category']

# Both metrics take (y_true, y_pred). In the confusion matrix, rows are the
# actual classes and columns are the predicted classes.
conf_matrix = confusion_matrix(y_true=actual_categories, y_pred=predicted_categories)
class_report = classification_report(y_true=actual_categories, y_pred=predicted_categories)

# Print each result under its own heading.
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")



Confusion Matrix:
[[  1   0   1   0   0   0]
 [  0  29   2   0   0   5]
 [  0   0 333   2   2  37]
 [  0   1  15  61   0   6]
 [  0   0  17   0  24   3]
 [  0   3  56   4   1 147]]

Classification Report:
                       precision    recall  f1-score   support

      Anti Corruption       1.00      0.50      0.67         2
               ESG/ET       0.88      0.81      0.84        36
           Functional       0.79      0.89      0.83       374
                  HSE       0.91      0.73      0.81        83
           Leadership       0.89      0.55      0.68        44
Technical/Engineering       0.74      0.70      0.72       211

             accuracy                           0.79       750
            macro avg       0.87      0.70      0.76       750
         weighted avg       0.80      0.79      0.79       750



# Inference Phase



In [None]:
import joblib

# Persist every fitted artifact that inference needs, not just the classifier:
# the model only accepts vectors produced by this exact TF-IDF vectorizer, and
# its integer outputs only map back to category names through this label
# encoder. Saving the model alone leaves it unusable from a fresh kernel.
joblib.dump(xgb_model, 'xgb_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

# Reload the artifacts the way a new session would.
# NOTE: joblib files are pickles; only load files from a trusted source.
loaded_model = joblib.load('xgb_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')
loaded_label_encoder = joblib.load('label_encoder.pkl')


# Prediction on New Data

In [None]:
# Read the second workbook, whose titles are to be categorised.
new_data = pd.read_excel('test.xlsx')

# Vectorise the titles with the already-fitted TF-IDF vectorizer (transform
# only, no refit) and run them through the reloaded model.
X_new = new_data['Entity Title']
X_new_vectorized = vectorizer.transform(X_new)
predictions = loaded_model.predict(X_new_vectorized)

# Turn the integer class ids back into category names and attach them as a
# new column next to the input rows.
predicted_categories = label_encoder.inverse_transform(predictions)
new_data = new_data.assign(**{'Predicted Category': predicted_categories})

# Write the input rows plus their predicted category to a new workbook.
new_data.to_excel('predicted_categories.xlsx', index=False)


In [None]:
# Shell escape: list the contents of the current working directory.
!ls

predicted_categories.xlsx  test.xlsx   validation_results.xlsx	xgb_model.pkl
sample_data		   train.xlsx  wrong_predictions.xlsx


# Fine Tune

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Read the workbook used for the tuning experiment.
data = pd.read_excel('learning.xlsx')

# Feature: the 'Entity Title' column. Target: the 'Category' column.
X, y = data['Entity Title'], data['Category']

# Fit a fresh label encoder on this dataset's categories and encode the target.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Fit TF-IDF on the training titles only, then project the validation titles
# with the same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_valid_vectorized = vectorizer.transform(X_valid)

# Base estimator handed to the search; GridSearchCV clones it for every
# candidate, so this instance itself is never fitted.
xgb_model = XGBClassifier()

# Candidate values for each hyperparameter: 3 x 3 x 3 = 27 combinations.
param_grid = {
    'learning_rate': [0.1, 0.3, 0.5],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300]
}

# Exhaustive 3-fold cross-validated search, scored by accuracy. refit=True is
# the default and is spelled out because the code below relies on it: once the
# search finishes, the best combination is retrained on the full training
# matrix and exposed as `best_estimator_`.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    refit=True,
)
grid_search.fit(X_train_vectorized, y_train)

# Report the winning combination.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Reuse the estimator the search already refitted with the best parameters.
# Building XGBClassifier(**best_params) and fitting it again would repeat that
# exact training run on the same data for no benefit.
best_xgb_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out validation split.
y_pred_valid = best_xgb_model.predict(X_valid_vectorized)
accuracy = accuracy_score(y_valid, y_pred_valid)
print(f"Validation Accuracy with Best Hyperparameters: {accuracy}")

# Decode the integer labels back into category names for the true and the
# predicted classes of the validation split.
actual_categories_valid = label_encoder.inverse_transform(y_valid)
predicted_categories_valid = label_encoder.inverse_transform(y_pred_valid)

# Re-index the validation titles from 0 so they line up row-for-row with the
# decoded label arrays.
entity_titles_valid = X_valid.reset_index(drop=True)

# One row per validation example: title, predicted label, true label.
validation_results = pd.DataFrame(
    {
        'Entity Title': entity_titles_valid,
        'Predicted Category': predicted_categories_valid,
        'Actual Category': actual_categories_valid,
    }
)

# Write the tuned model's validation table to its own workbook.
validation_results.to_excel('validation_results_with_tuning.xlsx', index=False)


Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
Validation Accuracy with Best Hyperparameters: 0.7287128712871287


In [None]:
import pandas as pd
# Read back the predictions workbook.
# NOTE(review): the comparison below needs an 'Actual Category' column in this
# file - presumably carried over from the input workbook; confirm, or point
# this at a validation-results file instead.
validation_results = pd.read_excel('predicted_categories.xlsx')

# Keep only the rows whose predicted label disagrees with the actual label.
wrong_predictions = validation_results.loc[
    validation_results['Predicted Category'].ne(validation_results['Actual Category'])
]

# Export the misclassified rows. This replaces any existing
# 'wrong_predictions.xlsx' in the working directory.
wrong_predictions.to_excel('wrong_predictions.xlsx', index=False)
