In [None]:
!pip install joblib
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
import regex as re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, roc_curve, auc, recall_score
import joblib

# Read the training spreadsheet; the code below uses its 'address', 'city',
# 'province', 'API' and 'Result' columns.
training_data = pd.read_excel('/content/Training_data .xlsx')

# Join the three location parts into one "address, city, province" string per row.
# Each part is cast to str first, so missing values become the text 'nan'.
street, city, province = (
    training_data[column].astype(str) for column in ('address', 'city', 'province')
)
training_data['full_address'] = street.str.cat([city, province], sep=', ')

# Keep a handle on the API column for the feature extraction further down
API_Match = training_data['API']

# Whole-word abbreviation expansions applied to every address, in this order.
# Keys are regular expressions, values are the replacement text.
#
# The previous table also listed variants such as r'\bH(?![oO])\b'. Because the
# negative lookahead is immediately followed by \b, the next character is
# already required to be a non-word character, so the lookahead can never
# fail: those patterns matched exactly what the plain r'\bH\b' forms match and
# were no-ops by the time they ran. They have been removed.
replacements = {
    r'\bH\b': 'House',
    r'\bh\b': 'house',
    r'\bst\b': 'Street',
    r'\bSt\b': 'Street',
    r'\bST\b': 'Street',
}


def replace_words(text):
    """Expand stand-alone address abbreviations in *text*.

    Every pattern in ``replacements`` is applied in turn with ``re.sub``.

    Args:
        text: The address to normalise. Non-string values (e.g. ``None`` or
            numbers) are passed through untouched.

    Returns:
        The string with abbreviations expanded, or *text* itself when it is
        not a ``str``.
    """
    if not isinstance(text, str):
        return text
    for pattern, replacement in replacements.items():
        text = re.sub(pattern, replacement, text)
    return text

# Normalise every full address by running it through replace_words
training_data['updated_address'] = training_data['full_address'].map(replace_words)

# Define feature extraction functions
# Exact-case spellings looked for by extract_feature_1 (substring match)
_ADDRESS_KEYWORDS = (
    'Sector', 'sector', 'SECTOR',
    'Block', 'BLOCK', 'block',
    'House', 'HOUSE', 'house',
    'Street', 'STREET', 'street',
    'FLAT', 'flat', 'Flat',
)


def extract_feature_1(text):
    """Return True when *text* contains any of the address keywords.

    The check is a plain substring test against the title-case, lower-case
    and upper-case spellings of Sector, Block, House, Street and Flat; other
    mixed-case spellings are not recognised.
    """
    return any(keyword in text for keyword in _ADDRESS_KEYWORDS)

def extract_feature_2(text, api_value=None):
    """Return whether the API match flag reads ``True``.

    The previous implementation evaluated ``'True' in API_Match``. On a
    pandas Series the ``in`` operator tests the *index labels*, not the
    values, so with a default integer index the result was always ``False``.

    Args:
        text: The address string. It is accepted so the function can be used
            with ``Series.apply`` but it is not inspected.
        api_value: Optional API flag for the row being processed. When given,
            the result reflects that single value, which makes the feature
            row-specific.

    Returns:
        ``True`` if the flag, converted to text and stripped, equals
        ``'True'``. When *api_value* is omitted the check falls back to the
        module-level ``API_Match`` column and answers "does any row read
        'True'?", which is the same for every row.
    """
    if api_value is not None:
        return str(api_value).strip() == 'True'
    # Compare against the column's values (as text, so both booleans and
    # strings work) instead of its index.
    return bool((API_Match.astype(str).str.strip() == 'True').any())

# A house/number/street keyword (or abbreviation) followed by a space and a digit
_KEYWORD_THEN_DIGIT = re.compile(
    r'(House|house|HOUSE|H|no|NO|No|number|Number|n|N|Street|St|st|street|STREET) \d'
)


def extract_feature_3(text):
    """Return True when a keyword in *text* is directly followed by " <digit>".

    The pattern is not anchored to word boundaries, so the single-letter
    alternatives also match at the end of longer words (e.g. the "n" in
    "Main 5").
    """
    return bool(_KEYWORD_THEN_DIGIT.search(text))

# Extract features and add them as columns

# Featured_1: address mentions a sector/block/house/street/flat keyword
training_data['Featured_1'] = training_data['updated_address'].apply(extract_feature_1)

# Featured_2: this row's API flag reads 'True'.
# It used to be computed by applying a function of the address text that
# evaluated `'True' in API_Match`; that expression tests the Series index and
# ignores the row, so every row received the same value. Comparing the column
# itself gives a per-row flag. astype(str) lets both boolean and string
# encodings of the flag compare equal to 'True'.
training_data['Featured_2'] = API_Match.astype(str).str.strip() == 'True'

# Featured_3: a house/number/street keyword is followed by " <digit>"
training_data['Featured_3'] = training_data['updated_address'].apply(extract_feature_3)

# Turn the textual 'Result' labels into integer class ids
label_encoder = LabelEncoder()
training_data['Result'] = label_encoder.fit_transform(training_data['Result'])

# Drop the columns that are not used as model inputs
training_data.drop(
    columns=['order_id', 'province', 'city', 'address', 'status', 'updated_address', 'full_address'],
    inplace=True,
)

# Target (y) is the encoded 'Result'; every remaining column is a feature (X)
y = training_data['Result']
X = training_data.drop(columns='Result')

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Initialize the XGBoost model.
# NOTE: learning_rate, n_estimators, max_depth, subsample and colsample_bytree
# all appear in param_grid below, so the values given here are replaced for
# every candidate the grid search fits; only random_state carries through.
xgb_model = xgb.XGBClassifier(
    learning_rate=0.01,
    n_estimators=500,
    max_depth=300,
    subsample=0.85,
    colsample_bytree=0.85,
    random_state=42
)

# Define the hyperparameters to search (2^5 = 32 combinations)
param_grid = {
    'max_depth': [3, 6],
    'n_estimators': [100, 300],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8]
}

# Create a GridSearchCV object: 5-fold cross-validation, ranked by accuracy
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=5)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and estimator.
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# retrained on the whole of X_train with the winning parameters; fitting it
# again would repeat that work without changing the model.
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Predict class labels for the held-out test set
y_pred_encoded = best_estimator.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_encoded)

# Calculate precision (per-class precision weighted by class support)
precision = precision_score(y_test, y_pred_encoded, average='weighted')

# Calculate recall (per-class recall weighted by class support).
# Support-weighted recall equals overall accuracy, so the two printed values
# coincide.
recall = recall_score(y_test, y_pred_encoded, average='weighted')

# Calculate ROC curve and AUC from the predicted probability of class 1.
# Taking column 1 assumes a two-class problem.
y_pred_prob = best_estimator.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

# Print the metrics. The last value was previously labelled "R1-score" even
# though it is the recall computed above.
print("Accuracy:", accuracy)
print("Precision:", precision)
print("ROC AUC:", roc_auc)
print("Recall:", recall)

# Save the trained model to a file.
# A single joblib.dump is enough; the file was previously written twice (once
# by path and once more through an explicitly opened handle to the same path).
model_filename = 'Address_Validation_Model_Updated.joblib'  # Use .joblib extension
joblib.dump(best_estimator, model_filename)

print("Best Parameters:", best_params)
# Report the file that was actually written rather than a hard-coded name
print(f"Training Completed and Model Saved as {model_filename}")



Accuracy: 0.921
Precision: 0.916548097763976
ROC AUC: 0.8687372847678191
R1-score: 0.921
Best Parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
Training Completed and Model Saved as Address_Validator_Model.joblib
