In [None]:
from google.colab import drive
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Mount Google Drive at '/content/drive' using the `drive` helper imported
# from google.colab, so that files kept in Drive can be opened by path.
# Only required when the notebook works with files stored in Google Drive.
drive.mount('/content/drive')

This data-validation step runs as soon as the input data file has been read, to make sure that the feature data contains no missing values, no infinite values, and no values of the wrong type.
It is performed before any model initialization.

In [None]:
# Location of the input Excel workbook.
# NOTE(review): this path sits at the filesystem root, not under the
# '/content/drive' mount point used by drive.mount() earlier in this
# notebook — confirm that this is where the file actually lives.
file_path = '/dataset.xlsx' # Path to the Excel file in Google Drive
# Read the Excel file into a DataFrame (pandas default read options);
# `df` is the raw input from which the feature matrix is selected below.
df = pd.read_excel(file_path)

# Numerical features: monthly transaction aggregates, listed per transaction kind.
numerical_columns = [
    # Cash transactions
    'avg_monthly_cash_transaction_amount',
    'avg_monthly_cash_transaction_count',
    'sum_monthly_cash_transaction_amount',
    'max_monthly_cash_transaction_amount',
    # Cross-border transactions
    'avg_monthly_cross_border_transaction_amount',
    'avg_monthly_cross_border_transaction_count',
    'sum_monthly_cross_border_transaction_amount',
    'max_monthly_cross_border_transaction_amount',
]

# Encoded categorical features.
categorical_columns = [
    'market_area',
    'industry_group',
    'turnover_bucket',
    'company_size_M',
]

# Full feature list: numerical columns first, then the categorical ones.
all_features = [*numerical_columns, *categorical_columns]

# Prepare the feature matrix: one column per entry of `all_features`.
# Raises KeyError if the input data lacks any of the expected columns.
X = df[all_features]

# Data validation and preprocessing
print("Data validation steps:")
print(f"Shape of data: {X.shape}")

# Report the dtypes exactly as they were read from the file.
print("\nData types of columns:")
print(X.dtypes)

# Convert every column to float64 *before* the numeric checks below.
# np.isinf and DataFrame.mean fail on object-dtype columns, so the type
# validation has to come first; a value that cannot be interpreted as a
# number makes astype raise here instead of slipping through.
X = X.astype(float)

print("\nChecking for NaN values:")
nan_check = X.isna().sum()
print(nan_check)

# Check for infinite values
inf_check = np.isinf(X).sum()
print("\nChecking for infinite values:")
print(inf_check)

# Per-column means used for imputation, computed over finite values only.
# Taking the mean after infinities have been swapped for ~1.8e308 would
# give absurdly large fill values, or overflow straight back to inf.
# NOTE(review): the encoded categorical columns are mean-imputed as well,
# which can produce codes that match no category — confirm this is intended.
column_means = X.replace([np.inf, -np.inf], np.nan).mean()

# Replace infinite values with the largest finite float64 of the SAME sign
# (+inf -> +max, -inf -> -max), so negative infinities stay negative.
float64_max = np.finfo(np.float64).max
X = X.replace(np.inf, float64_max).replace(-np.inf, -float64_max)

# Handle any remaining NaN values (if they exist)
X = X.fillna(column_means)

# Verify data is clean. A column with no finite values at all has a NaN
# mean, stays NaN after fillna, and is caught here.
if X.isna().any().any() or np.isinf(X).any().any():
    raise ValueError("Data still contains NaN or infinite values after preprocessing")