<a href="https://colab.research.google.com/github/vibhuverma17/COACH/blob/main/Base_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install umap-learn hdbscan gower kmodes XGBoost

In [None]:
# ===============================
# Standard Libraries
# ===============================
import time
import numpy as np
import pandas as pd
import warnings
import ast

# ===============================
# Visualization Libraries
# ===============================
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# ===============================
# Scikit-learn Components
# ===============================
# Data Splitting and Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Models
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    GradientBoostingClassifier, GradientBoostingRegressor,
    IsolationForest
)
from sklearn.linear_model import LinearRegression, LogisticRegression

# Metrics
from sklearn.metrics import (
    f1_score, roc_auc_score, accuracy_score, recall_score, precision_score,
    mean_absolute_error, mean_squared_error, r2_score,
    mean_absolute_percentage_error, roc_curve
)

# Pairwise distances
from sklearn.metrics.pairwise import pairwise_distances

# ===============================
# External Libraries
# ===============================
import xgboost as xgb  # XGBoost library
from scipy.stats import ks_2samp, entropy  # Statistical tests
from kmodes.kprototypes import KPrototypes  # Clustering
import umap  # Dimensionality reduction
import hdbscan  # Density-based clustering
import gower  # Gower similarity for mixed data types

# ===============================
# Configure Warnings
# ===============================
warnings.filterwarnings("ignore")

#### DIM REDUCTION AND GRID SEARCH

In [None]:
class StratifiedSamplingUMAP:
    """Embed a dataset with UMAP and draw a grid-stratified sample from the embedding.

    Parameters:
    - n_neighbors, min_dist, n_components: forwarded to ``umap.UMAP``
    - n_grids: number of grid edges per embedding axis used for stratification
    - sample_percentage: fraction of points to sample, between 0 and 1
    """

    def __init__(self, n_neighbors=15, min_dist=0.1, n_components=2, n_grids=10, sample_percentage=0.1):
        self.n_neighbors = n_neighbors
        self.min_dist = min_dist
        self.n_components = n_components
        self.n_grids = n_grids
        self.sample_percentage = sample_percentage
        self.sample_indices = None  # Filled in by stratified_sampling()

    def check_categorical(self, data):
        """Return True when ``data`` contains at least one object-dtype column."""
        return data.select_dtypes(include=['object']).shape[1] > 0

    def fit_transform(self, data):
        """
        Fit UMAP on ``data`` and return the low-dimensional embedding.

        Object columns are one-hot encoded and stacked in front of the remaining
        columns; the 'dice' metric is used in that case, 'euclidean' otherwise.
        """
        if self.check_categorical(data):
            print("Categorical variables detected. Using 'dice' metric for UMAP.")
            categorical_data = data.select_dtypes(include=['object'])
            encoder = OneHotEncoder(sparse_output=False)
            categorical_encoded = encoder.fit_transform(categorical_data)
            continuous_data = data.select_dtypes(exclude=['object'])
            combined_data = np.hstack([categorical_encoded, continuous_data])
            # NOTE(review): 'dice' is a binary-data metric but is applied here to the
            # continuous columns as well - confirm this is the intended distance.
            metric = 'dice'
        else:
            print("No categorical variables detected. Using 'euclidean' metric for UMAP.")
            combined_data = data
            metric = 'euclidean'

        umap_model = umap.UMAP(n_neighbors=self.n_neighbors, min_dist=self.min_dist,
                               n_components=self.n_components, metric=metric)
        return umap_model.fit_transform(combined_data)

    def stratified_sampling(self, embedding):
        """
        Draw a sample whose spatial distribution follows the embedding's grid occupancy.

        The first two axes (three when ``n_components == 3``) are cut into a regular
        grid. Each occupied cell contributes a share of the sample proportional to
        its population (largest-remainder rounding), so the total sample size is
        exactly ``int(len(embedding) * sample_percentage)``.

        Parameters:
        - embedding: 2-D array of embedded points (one row per point)

        Returns:
        - The sampled rows of ``embedding``; their row positions are stored in
          ``self.sample_indices`` in ascending order.

        Raises:
        - ValueError: if ``sample_percentage`` is outside [0, 1]
        """
        if not 0 <= self.sample_percentage <= 1:
            raise ValueError("sample_percentage must be between 0 and 1.")

        embedding = np.asarray(embedding)
        n_points = len(embedding)
        n_target = int(n_points * self.sample_percentage)
        if n_target == 0:
            # Nothing to draw (empty embedding or a percentage that rounds to zero)
            self.sample_indices = np.array([], dtype=int)
            return embedding[self.sample_indices]

        # Assign every point a single flat grid-cell id
        n_dims = 3 if self.n_components == 3 else 2
        cell_ids = np.zeros(n_points, dtype=np.int64)
        for dim in range(n_dims):
            coords = embedding[:, dim]
            edges = np.linspace(coords.min(), coords.max(), self.n_grids)
            bin_idx = np.clip(np.digitize(coords, edges) - 1, 0, self.n_grids - 1)
            cell_ids = cell_ids * self.n_grids + bin_idx

        _, inverse, counts = np.unique(cell_ids, return_inverse=True, return_counts=True)

        # Proportional allocation in integer arithmetic; the leftover seats go to
        # the cells with the largest remainders so the quotas sum to n_target.
        scaled = counts.astype(np.int64) * n_target
        quotas = scaled // n_points
        leftover = n_target - int(quotas.sum())
        if leftover > 0:
            remainders = scaled % n_points
            top_up = np.argsort(-remainders, kind="stable")[:leftover]
            quotas[top_up] += 1

        # Group point positions by cell, then sample each cell without replacement
        order = np.argsort(inverse, kind="stable")
        members_per_cell = np.split(order, np.cumsum(counts)[:-1])
        chosen = [
            np.random.choice(members, size=int(quota), replace=False)
            for members, quota in zip(members_per_cell, quotas)
            if quota > 0
        ]
        sampled_indices = np.sort(np.concatenate(chosen))

        # Store the sampled indices for later retrieval
        self.sample_indices = sampled_indices

        return embedding[sampled_indices]

    def get_sample_indices(self):
        """
        Return the indices of the sampled data points in the original dataset.

        Raises:
        - ValueError: if stratified_sampling() has not been run yet
        """
        if self.sample_indices is None:
            raise ValueError("No samples have been selected. Please run stratified_sampling first.")
        return self.sample_indices

    def plot(self, embedding):
        """Scatter-plot a 2-D or 3-D embedding; other dimensionalities are not drawn."""
        if self.n_components == 2:
            plt.scatter(embedding[:, 0], embedding[:, 1], c='blue', marker='o')
            plt.title('UMAP Projection (2D)')
            plt.xlabel('UMAP 1')
            plt.ylabel('UMAP 2')
            plt.show()

        elif self.n_components == 3:
            fig = plt.figure()
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(embedding[:, 0], embedding[:, 1], embedding[:, 2], c='blue', marker='o')
            ax.set_title('UMAP Projection (3D)')
            ax.set_xlabel('UMAP 1')
            ax.set_ylabel('UMAP 2')
            ax.set_zlabel('UMAP 3')
            plt.show()



In [None]:
# Demo: embed a synthetic mixed-type dataset with UMAP and sample from the embedding
np.random.seed(42)

# Dataset dimensions
num_samples = 10000
num_continuous_features = 990
num_categorical_features = 10

# Gaussian noise for the continuous block, three-level labels for the categorical block
continuous_data = np.random.randn(num_samples, num_continuous_features)
categorical_data = np.random.choice(['A', 'B', 'C'], size=(num_samples, num_categorical_features))

# Wrap both arrays in labelled DataFrames (cont_1..cont_N, cat_1..cat_M)
continuous_df = pd.DataFrame(
    continuous_data,
    columns=[f"cont_{idx}" for idx in range(1, num_continuous_features + 1)],
)
categorical_df = pd.DataFrame(
    categorical_data,
    columns=[f"cat_{idx}" for idx in range(1, num_categorical_features + 1)],
)

# Side-by-side join gives the full feature table
data = pd.concat([continuous_df, categorical_df], axis=1)

# Build the sampler: 3-D embedding, 10 grid edges per axis, 10% sample
stratified_sampler = StratifiedSamplingUMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=3,
    n_grids=10,
    sample_percentage=0.1,
)

# Embed, then sample from the embedding
embedding = stratified_sampler.fit_transform(data)
sampled_embedding = stratified_sampler.stratified_sampling(embedding)

# Row positions of the sampled points in the original table
sample_indices = stratified_sampler.get_sample_indices()
print("Sampled Data:", len(sample_indices),"Data:",len(data))

# Pull the matching rows out of the original table
sampled_data = data.iloc[sample_indices]

# Visualise the sampled part of the embedding
stratified_sampler.plot(sampled_embedding)

In [None]:
# Plot the full UMAP embedding (every point, not only the sampled ones)
stratified_sampler.plot(embedding)

##### HDBSCAN - CLUSTERING BASED SAMPLING

In [None]:
class ClusterSampler:
    """Cluster a dataset with HDBSCAN and sample a fixed share of every cluster."""

    def __init__(self, data, sampling_percent=10, **hdbscan_params):
        """
        Initialize the ClusterSampler class.

        Parameters:
        - data: DataFrame containing the dataset (left unmodified; a converted copy is stored)
        - sampling_percent: Percentage of points to sample from each cluster (0-100)
        - **hdbscan_params: Additional parameters to pass to HDBSCAN
        """
        self.data = self._convert_data_types(data)
        self.sampling_percent = sampling_percent
        self.is_categorical = self._detect_categorical(data)
        self.cluster_labels = None
        self.hdbscan_params = hdbscan_params

    def _convert_data_types(self, data):
        """
        Return a copy of ``data`` with numeric columns as float64 and
        object/category columns as object.

        The conversion goes through ``astype`` so the caller's DataFrame is never
        mutated, and 'number' is used so float32 / small-int columns are converted too.
        """
        numeric_cols = data.select_dtypes(include=['number']).columns
        categorical_cols = data.select_dtypes(include=['object', 'category']).columns

        dtype_map = {col: np.float64 for col in numeric_cols}
        dtype_map.update({col: 'object' for col in categorical_cols})
        return data.astype(dtype_map)

    def _detect_categorical(self, data):
        """Detect if the dataset contains categorical features."""
        return data.select_dtypes(include=['object', 'category']).shape[1] > 0

    def _compute_distance_matrix(self):
        """Compute the pairwise distance matrix (Gower for mixed data, Euclidean otherwise)."""
        if self.is_categorical:
            gower_matrix = gower.gower_matrix(self.data)
            # HDBSCAN's precomputed mode expects double precision
            return gower_matrix.astype(np.float64)
        else:
            return pairwise_distances(self.data, metric='euclidean')

    def fit_clusters(self):
        """
        Fit HDBSCAN on the dataset with the appropriate distance metric.

        Mixed data is clustered on a precomputed Gower matrix. Purely numeric data
        is clustered on the raw features with the Euclidean metric; passing a
        distance matrix together with metric='euclidean' would make HDBSCAN treat
        the distances themselves as feature vectors.
        """
        if self.is_categorical:
            clusterer = hdbscan.HDBSCAN(metric='precomputed', **self.hdbscan_params)
            self.cluster_labels = clusterer.fit_predict(self._compute_distance_matrix())
        else:
            clusterer = hdbscan.HDBSCAN(metric='euclidean', **self.hdbscan_params)
            self.cluster_labels = clusterer.fit_predict(self.data)

    def sample_points(self):
        """
        Sample a subset of points from each cluster, treating noise points
        (their own label) as a separate cluster.

        Returns:
        - List of index labels of ``self.data`` for the sampled rows

        Raises:
        - ValueError: if fit_clusters() has not been called
        """
        if self.cluster_labels is None:
            raise ValueError("Clusters have not been computed. Call fit_clusters() first.")

        labels = np.asarray(self.cluster_labels)
        fraction = self.sampling_percent / 100
        sampled_indices = []

        for cluster_label in np.unique(labels):
            member_positions = np.flatnonzero(labels == cluster_label)
            sample_size = max(1, int(len(member_positions) * fraction))

            # Avoid sampling more points than available (only possible above 100%)
            if len(member_positions) < sample_size:
                print(f"Cluster {cluster_label} has only {len(member_positions)} points, sampling {len(member_positions)}.")
                sample_size = len(member_positions)

            picked = np.random.choice(member_positions, sample_size, replace=False)
            # Translate row positions back to index labels for use with .loc
            sampled_indices.extend(self.data.index[picked])

        return sampled_indices

    def run(self):
        """Execute the full sampling pipeline: cluster, then sample."""
        self.fit_clusters()
        return self.sample_points()

In [None]:
# Reproducible synthetic data for the cluster-sampling demo
np.random.seed(42)

# 1000 rows: 990 Gaussian columns plus 10 three-level categorical columns
num_samples = 1000
num_continuous_features = 990
num_categorical_features = 10
continuous_data = np.random.randn(num_samples, num_continuous_features)
categorical_data = np.random.choice(['A', 'B', 'C'], size=(num_samples, num_categorical_features))

# Wrap both arrays in labelled DataFrames (cont_1..cont_N, cat_1..cat_M)
continuous_df = pd.DataFrame(
    continuous_data,
    columns=[f"cont_{idx}" for idx in range(1, num_continuous_features + 1)],
)
categorical_df = pd.DataFrame(
    categorical_data,
    columns=[f"cat_{idx}" for idx in range(1, num_categorical_features + 1)],
)

# Side-by-side join gives the full feature table
data = pd.concat([continuous_df, categorical_df], axis=1)

# Cluster with HDBSCAN and keep 10% of each cluster
sampler = ClusterSampler(
    data,
    sampling_percent=10,
    min_cluster_size=5,
    min_samples=1,
)
sampled_indices = sampler.run()
sampled_data = data.loc[sampled_indices]

In [None]:
# The cluster-sampling cell stores its result in `sampled_indices`; `sample_indices`
# would still hold the result of an earlier sampler.
print("Sampled Data:", len(sampled_indices),"Data:",len(data))

##### ISOLATION FOREST AND KS STATISTIC SAMPLER

In [None]:
class AnomalySampler:
    """Pick a random sample whose Isolation Forest score distribution matches the full data."""

    def __init__(self, X, sample_weight=None):
        """
        Initialize the AnomalySampler class.

        The data is one-hot encoded and scored with an Isolation Forest right away.

        Parameters:
        - X: DataFrame containing the dataset
        - sample_weight: Optional sample weights for the Isolation Forest
        """
        self.original_data = X
        self.X = self._one_hot_encode(X)
        self.sample_weight = sample_weight
        self.preds = self.isolation_forest(self.X, sample_weight)

    def _one_hot_encode(self, data):
        """
        Apply one-hot encoding to categorical columns.

        Parameters:
        - data: DataFrame containing the dataset

        Returns:
        - One-hot encoded DataFrame
        """
        return pd.get_dummies(data, drop_first=True)  # drop_first to avoid multicollinearity, if relevant

    @staticmethod
    def isolation_forest(X, sample_weight=None):
        """
        Fits an Isolation Forest to the dataset and assigns an anomaly score to each sample.

        Parameters:
        - X: DataFrame or array-like containing the dataset
        - sample_weight: Optional sample weights for the Isolation Forest

        Returns:
        - preds: Anomaly scores for each sample
        """
        clf = IsolationForest().fit(X, sample_weight=sample_weight)
        preds = clf.score_samples(X)
        return preds

    @staticmethod
    def get_5_percent(num):
        """Calculate 5% of a given number (rounded to the nearest integer)."""
        return round(5 / 100 * num)

    def get_5_percent_splits(self, length):
        """
        Splits a given length into 5% intervals.

        For short inputs 5% rounds to 0, which would be an invalid step, so the
        step is never allowed to drop below 1.
        """
        step = max(1, self.get_5_percent(length))
        return np.arange(step, length, step)

    def find_sample_indices(self, p_threshold=0.95, max_tries_per_size=20):
        """
        Finds a sample by comparing the distribution of anomaly scores between the sample
        and the original distribution using the KS-test. Starts with a 5% sample and, after
        ``max_tries_per_size`` unsuccessful random draws, moves up by 5% increments until a
        sample with p-value > ``p_threshold`` is found or the sizes are exhausted.

        Parameters:
        - p_threshold: KS-test p-value a sample must exceed to be accepted
        - max_tries_per_size: Number of random draws attempted at each sample size

        Returns:
        - Array of row positions representing the sample in the original dataset.
          If no draw is accepted, the last attempted draw is returned; if the dataset
          is too small to form any split, all row positions are returned.

        Raises:
        - ValueError: if ``max_tries_per_size`` is smaller than 1
        """
        if max_tries_per_size < 1:
            raise ValueError("max_tries_per_size must be at least 1.")

        n_scores = self.preds.size
        size_splits = self.get_5_percent_splits(len(self.X))

        # Fallback when there is no candidate size at all (0 or 1 rows)
        sample_indices = np.arange(n_scores)

        for sample_size in size_splits:
            for _ in range(max_tries_per_size):
                sample_indices = np.random.choice(n_scores, size=sample_size, replace=False)
                sample = self.preds[sample_indices]

                # Accept the draw when the KS test cannot tell the two distributions apart
                if ks_2samp(self.preds, sample).pvalue > p_threshold:
                    return sample_indices

        # If no suitable sample is found, return the last attempted sample indices
        return sample_indices



In [None]:
# Reproducible synthetic data for the anomaly-score sampling demo
np.random.seed(42)

# 1000 rows: 990 Gaussian columns plus 10 three-level categorical columns
num_samples = 1000
num_continuous_features = 990
num_categorical_features = 10
continuous_data = np.random.randn(num_samples, num_continuous_features)
categorical_data = np.random.choice(['A', 'B', 'C'], size=(num_samples, num_categorical_features))

# Wrap both arrays in labelled DataFrames (cont_1..cont_N, cat_1..cat_M)
continuous_df = pd.DataFrame(
    continuous_data,
    columns=[f"cont_{idx}" for idx in range(1, num_continuous_features + 1)],
)
categorical_df = pd.DataFrame(
    categorical_data,
    columns=[f"cat_{idx}" for idx in range(1, num_categorical_features + 1)],
)

# Side-by-side join gives the full feature table
data = pd.concat([continuous_df, categorical_df], axis=1)

# Score the rows, search for a matching sample, then pull those rows by position
sampler = AnomalySampler(data)
sample_indices = sampler.find_sample_indices()
sampled_data = data.iloc[sample_indices]

In [None]:
# Report how many rows were sampled versus the size of the full dataset
print("Sampled Data:", len(sample_indices),"Data:",len(data))

#### ENTROPY SAMPLER

In [None]:
class EntropySampler:
    """Rank rows by an entropy-weighted score and keep the top-scoring share."""

    def __init__(self, data, sampling_percent=10, bins=10):
        """
        Set up the sampler.

        Parameters:
        - data: DataFrame holding the dataset
        - sampling_percent: Share of rows to keep, as a percentage (0-100)
        - bins: Bin specification passed to the histogram of numeric columns
        """
        self.data = data
        self.sampling_percent = sampling_percent
        self.bins = bins
        self.entropy_scores = None

    def _calculate_entropy(self):
        """
        Compute one score per row and store it in ``self.entropy_scores``.

        Each column contributes ``weight * per_row`` where ``weight`` is the
        entropy of the column's value distribution. For numeric columns
        ``per_row`` is the 1-based histogram bin index of the value; for other
        columns it is the relative frequency of the row's category.
        NOTE(review): the two branches weight rows on different scales (bin index
        vs. frequency) - confirm this asymmetry is intended.
        """
        totals = np.zeros(len(self.data))

        for name in self.data.columns:
            column = self.data[name]

            if pd.api.types.is_numeric_dtype(column):
                # One histogram call provides both the counts and the bin edges
                bin_counts, bin_edges = np.histogram(column, bins=self.bins)
                weight = entropy(bin_counts + 1e-10)  # offset keeps empty bins away from log(0)
                per_row = np.digitize(column, bins=bin_edges)
            else:
                weight = entropy(column.value_counts().values + 1e-10)
                per_row = column.map(column.value_counts(normalize=True)).values

            totals += per_row * weight

        self.entropy_scores = totals

    def sample_indices(self):
        """
        Return the row positions with the highest scores.

        Scores are computed on demand if they are not available yet. At least
        one position is requested regardless of ``sampling_percent``.

        Returns:
        - Array of row positions, ordered by ascending score
        """
        if self.entropy_scores is None:
            self._calculate_entropy()

        fraction = self.sampling_percent / 100
        num_samples = max(1, int(len(self.data) * fraction))

        ranked = np.argsort(self.entropy_scores)
        return ranked[-num_samples:]

    def run(self):
        """
        Recompute the scores and return the sampled row positions.

        Returns:
        - Array of row positions for the sampled points
        """
        self._calculate_entropy()
        return self.sample_indices()

In [None]:
# Reproducible synthetic data for the entropy-sampling demo
np.random.seed(42)

# 1000 rows: 990 Gaussian columns plus 10 three-level categorical columns
num_samples = 1000
num_continuous_features = 990
num_categorical_features = 10
continuous_data = np.random.randn(num_samples, num_continuous_features)
categorical_data = np.random.choice(['A', 'B', 'C'], size=(num_samples, num_categorical_features))

# Wrap both arrays in labelled DataFrames (cont_1..cont_N, cat_1..cat_M)
continuous_df = pd.DataFrame(
    continuous_data,
    columns=[f"cont_{idx}" for idx in range(1, num_continuous_features + 1)],
)
categorical_df = pd.DataFrame(
    categorical_data,
    columns=[f"cat_{idx}" for idx in range(1, num_categorical_features + 1)],
)

# Side-by-side join gives the full feature table
data = pd.concat([continuous_df, categorical_df], axis=1)

# Score the rows, take the top share, then pull those rows by position
sampler = EntropySampler(data)
sample_indices = sampler.run()
sampled_data = data.iloc[sample_indices]

In [None]:
# Report how many rows were sampled versus the size of the full dataset
print("Sampled Data:", len(sample_indices),"Data:",len(data))

### DISTANCE BASED SAMPLER

In [None]:
class DistanceBasedSampler:
    """Cluster mixed data with k-prototypes and sample the points farthest from any cluster centre."""

    def __init__(self, data, k=5, sampling_percent=10):
        """
        Initialize the DistanceBasedSampler class for distance-based sampling with mixed data.

        Parameters:
        - data: DataFrame containing the dataset
        - k: Number of clusters to fit
        - sampling_percent: Percentage of points to sample (0-100)
        """
        self.data = data
        self.k = k
        self.sampling_percent = sampling_percent
        self.cluster_labels = None

    def _prepare_data(self):
        """
        Prepare the data by converting categorical columns to string types for k-prototypes.
        """
        self.data = self.data.apply(lambda col: col.astype(str) if col.dtype == 'object' else col)
        return self.data

    def fit_clusters(self):
        """
        Fit the k-prototypes model to the data to create clusters.

        Returns:
        - Array of cluster labels, one per row (also stored in ``self.cluster_labels``)
        """
        self.data = self._prepare_data()
        data_array = self.data.values
        # k-prototypes needs the column positions of the categorical features
        categorical_indices = [i for i, col in enumerate(self.data.columns) if self.data.dtypes[col] == 'object']

        # Apply K-Prototypes clustering
        kproto = KPrototypes(n_clusters=self.k, init='Cao', random_state=42)
        clusters = kproto.fit_predict(data_array, categorical=categorical_indices)

        # Store the cluster labels
        self.cluster_labels = clusters

        return clusters

    def sample_indices(self):
        """
        Sample the points that lie farthest from their nearest cluster centre.

        Each cluster's centre is its prototype: the mean of the numeric columns and
        the most frequent value of every object column among the cluster's members.
        A point's distance to a centre is the Euclidean distance over the numeric
        columns plus the number of mismatching categorical values, and the score of
        a point is the minimum of that distance over all centres.

        Returns:
        - Array of row positions of the sampled data points (ascending distance order).
        """
        if self.cluster_labels is None:
            self.fit_clusters()

        labels = np.asarray(self.cluster_labels)
        numeric_values = self.data.select_dtypes(include=[np.number]).to_numpy(dtype=float)
        categorical_frame = self.data.select_dtypes(include=['object'])
        categorical_values = categorical_frame.to_numpy(dtype=object)
        has_categorical = categorical_frame.shape[1] > 0

        # Running minimum of the distance to every cluster centre seen so far
        distances = np.full(len(self.data), np.inf)

        for label in np.unique(labels):
            members = np.flatnonzero(labels == label)

            numeric_center = numeric_values[members].mean(axis=0)
            center_distance = np.linalg.norm(numeric_values - numeric_center, axis=1)

            if has_categorical:
                # First row of mode() holds one most-frequent value per column
                modes = categorical_frame.iloc[members].mode().iloc[0].to_numpy(dtype=object)
                center_distance = center_distance + (categorical_values != modes).sum(axis=1)

            distances = np.minimum(distances, center_distance)

        # Determine the number of points to sample based on the sampling percentage
        num_samples = max(1, int(len(self.data) * (self.sampling_percent / 100)))

        # Get the positions of the points with maximum distance from the centers
        sampled_indices = np.argsort(distances)[-num_samples:]

        return sampled_indices

    def _calculate_distance(self, point1, point2):
        """
        Calculate a distance between two data points, accounting for both categorical and numerical features.

        Uses Euclidean distance for continuous features and a matching distance for categorical features.
        """
        numeric_cols = self.data.select_dtypes(include=[np.number]).columns
        categorical_cols = self.data.select_dtypes(include=['object']).columns

        # Calculate numeric distance (Euclidean)
        numeric_distance = np.linalg.norm(point1[numeric_cols] - point2[numeric_cols])

        # Calculate categorical distance (number of mismatching values)
        categorical_distance = sum(point1[col] != point2[col] for col in categorical_cols)

        # Combine both distances
        return numeric_distance + categorical_distance

    def run(self):
        """
        Execute the distance-based sampling process (cluster, then sample).

        Returns:
        - Array of row positions for sampled data points.
        """
        self.fit_clusters()
        return self.sample_indices()


In [None]:
# Reproducible synthetic data for the distance-based sampling demo
np.random.seed(42)

# 1000 rows: 990 Gaussian columns plus 10 three-level categorical columns
num_samples = 1000
num_continuous_features = 990
num_categorical_features = 10
continuous_data = np.random.randn(num_samples, num_continuous_features)
categorical_data = np.random.choice(['A', 'B', 'C'], size=(num_samples, num_categorical_features))

# Wrap both arrays in labelled DataFrames (cont_1..cont_N, cat_1..cat_M)
continuous_df = pd.DataFrame(
    continuous_data,
    columns=[f"cont_{idx}" for idx in range(1, num_continuous_features + 1)],
)
categorical_df = pd.DataFrame(
    categorical_data,
    columns=[f"cat_{idx}" for idx in range(1, num_categorical_features + 1)],
)

# Side-by-side join gives the full feature table
data = pd.concat([continuous_df, categorical_df], axis=1)

# Five clusters, keep 10% of the rows
sampler = DistanceBasedSampler(data, k=5, sampling_percent=10)

# Row positions of the sampled points
sampled_indices = sampler.run()

# Pull the matching rows out of the original table
sampled_data = data.iloc[sampled_indices]

In [None]:
# The distance-based sampling cell stores its result in `sampled_indices`;
# `sample_indices` would still hold the result of an earlier sampler.
print("Sampled Data:", len(sampled_indices),"Data:",len(data))

#### RANDOM SAMPLING

In [None]:
class RandomSampler:
    """Uniform random row sampler used as a baseline."""

    def __init__(self, data, sampling_percent=10, random_state=42):
        """
        Initialize the RandomSampler class.

        Parameters:
        - data: DataFrame containing the dataset
        - sampling_percent: Percentage of rows to sample (0-100)
        - random_state: Seed passed to ``DataFrame.sample`` (default 42 keeps the
          previous fixed-seed behaviour)
        """
        self.data = data
        self.sampling_percent = sampling_percent
        self.random_state = random_state

    def sample_indices(self):
        """
        Randomly sample indices of rows from the dataset.

        Returns:
        - A list of sampled index labels (empty when the dataset has no rows;
          otherwise at least one label).

        Raises:
        - ValueError: if ``sampling_percent`` is outside [0, 100]
        """
        if not (0 <= self.sampling_percent <= 100):
            raise ValueError("sampling_percent must be between 0 and 100.")

        # DataFrame.sample cannot draw from an empty frame
        if len(self.data) == 0:
            return []

        # Calculate the number of samples to draw, ensuring at least one row
        sample_size = max(1, int(len(self.data) * (self.sampling_percent / 100)))

        # Perform random sampling without replacement and return indices
        return self.data.sample(n=sample_size, random_state=self.random_state).index.tolist()

In [None]:
# Reproducible synthetic data for the random-sampling demo
np.random.seed(42)

# 1000 rows: 990 Gaussian columns plus 10 three-level categorical columns
num_samples = 1000
num_continuous_features = 990
num_categorical_features = 10
continuous_data = np.random.randn(num_samples, num_continuous_features)
categorical_data = np.random.choice(['A', 'B', 'C'], size=(num_samples, num_categorical_features))

# Wrap both arrays in labelled DataFrames (cont_1..cont_N, cat_1..cat_M)
continuous_df = pd.DataFrame(
    continuous_data,
    columns=[f"cont_{idx}" for idx in range(1, num_continuous_features + 1)],
)
categorical_df = pd.DataFrame(
    categorical_data,
    columns=[f"cat_{idx}" for idx in range(1, num_categorical_features + 1)],
)

# Side-by-side join gives the full feature table
data = pd.concat([continuous_df, categorical_df], axis=1)

# Draw 10% of the rows at random
sampler = RandomSampler(data=data, sampling_percent=10)
sampled_indices = sampler.sample_indices()

In [None]:
# The random-sampling cell stores its result in `sampled_indices`;
# `sample_indices` would still hold the result of an earlier sampler.
print("Sampled Data:", len(sampled_indices),"Data:",len(data))

### MACHINE LEARNING MODELS

In [None]:
!mkdir Data

#### READING DATA

In [None]:
# Dataset 1: claims table, target column 'loss'
train = pd.read_csv('/content/Data/All Claims.csv')

print("Training Data:")

# The 'id' column is removed before the features are built
train.drop(columns='id', inplace=True)

# Features are everything except the target
X = train.drop(columns='loss')
y = train['loss']

# Text columns are switched to the pandas 'category' dtype
categorical_features = list(X.select_dtypes(include='object').columns)
for feature in categorical_features:
    X[feature] = X[feature].astype('category')

X.head()

In [None]:
# Hold out 20% of the rows as the test set (fixed seed for repeatability)
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Dataset 2: TUANDROMD table, target column 'Label'
train = pd.read_csv('/content/Data/TUANDROMD.csv')

print("Training Data:")

# Keep only rows that have a label, then make column names identifier-safe
train = train[train['Label'].notnull()]
train.columns = train.columns.str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)

# Features are everything except the target
X = train.drop(columns='Label')
y = train['Label']

# Text columns are switched to the pandas 'category' dtype
categorical_features = list(X.select_dtypes(include='object').columns)
for feature in categorical_features:
    X[feature] = X[feature].astype('category')

X.head()

In [None]:
# Hold out 20% of the rows as the test set, preserving the class balance of y
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Dataset 3: Android permissions table, target column 'Result'
train = pd.read_csv('/content/Data/Andriod Permissions.csv')

print("Training Data:")

# Keep only rows that have a target value, then make column names identifier-safe
train = train[train['Result'].notnull()]
train.columns = train.columns.str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)

# Features are everything except the target
X = train.drop(columns='Result')
y = train['Result']

# Text columns are switched to the pandas 'category' dtype
categorical_features = list(X.select_dtypes(include='object').columns)
for feature in categorical_features:
    X[feature] = X[feature].astype('category')

X.head()

In [None]:
# Hold out 20% of the rows as the test set, preserving the class balance of y
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Dataset 4: student-success table (semicolon separated), target column 'Target'
train = pd.read_csv('/content/Data/Student Success.csv', delimiter=';')

# Drop rows without a target, sanitise the column names, and exclude the
# 'Enrolled' class so two target values remain
train = train[train['Target'].notnull()]
train.columns = train.columns.str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
train = train[train['Target'] != 'Enrolled']

# Turn the remaining target strings into integer codes
label_encoder = LabelEncoder()
train['Target'] = label_encoder.fit_transform(train['Target'])

# Features are everything except the target
X = train.drop(columns='Target')
y = train['Target']

# Text columns are switched to the pandas 'category' dtype
categorical_features = list(X.select_dtypes(include='object').columns)
for feature in categorical_features:
    X[feature] = X[feature].astype('category')

X.head()

In [None]:
# Hold out 20% of the rows as the test set, preserving the class balance of y
X_train4, X_test4, y_train4, y_test4 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Dataset 5: phishing-URL table, target column 'label'
train = pd.read_csv('/content/Data/Phishing URL.csv')

# Leave out the free-text identifier columns when they are present
train = train.loc[:, ~train.columns.isin(['FILENAME', 'URL', 'Domain', 'Title'])]

# Drop rows without a target and make column names identifier-safe
train = train[train['label'].notnull()]
train.columns = train.columns.str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)

# Only the first 10,000 rows are used
train = train.head(10000)

# Features are everything except the target
X = train.drop(columns='label')
y = train['label']

# Text columns are switched to the pandas 'category' dtype
categorical_features = list(X.select_dtypes(include='object').columns)
for feature in categorical_features:
    X[feature] = X[feature].astype('category')

X.head()

In [None]:
# Hold out 20% of the rows as the test set, preserving the class balance of y
X_train5, X_test5, y_train5, y_test5 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Dataset 6: Parkinson's UPDRS table, target column 'motor_UPDRS'
train = pd.read_csv('/content/Data/parkinsons_updrs.data')

print("All Data:")

# Remove the subject id, the test time and the second UPDRS score
train.drop(columns='subject#', inplace=True)
train.drop(columns='test_time', inplace=True)
train.drop(columns='total_UPDRS', inplace=True)

# Features are everything except the target; y is kept as a one-column DataFrame
X = train.drop(columns='motor_UPDRS')
y = train[['motor_UPDRS']]

# Infinite feature values are turned into NaN
X = X.replace([np.inf, -np.inf], np.nan)

X.head()

In [None]:
# Hold out 20% of the rows as the test set (fixed seed for repeatability)
X_train6, X_test6, y_train6, y_test6 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Dataset 7: credit-card transactions, target column 'Class'
train = pd.read_csv('/content/Data/creditcard.csv')

print("Training Data:")

print(train.shape)

# Split the data into features (X) and target (y). 'Time' is excluded from the
# features here; the former stand-alone `train.drop(columns=['Time'], axis=1)`
# call discarded its result and therefore had no effect, so it was removed.
X = train.drop(columns=['Time','Class'])
y = train['Class']

categorical_features = X.select_dtypes(include=['object']).columns.tolist()
# Convert categorical features to category dtype
for feature in categorical_features:
    X[feature] = X[feature].astype('category')

X.head()

In [None]:
# Hold out 20% of the rows as the test set (fixed seed for repeatability)
X_train7, X_test7, y_train7, y_test7 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Custom transformer to drop highly correlated features
class DropHighCorrelation(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.6):
        """
        Custom transformer to drop highly correlated features.

        Parameters:
        - threshold: Absolute correlation above which the later of two columns is dropped.
        """
        self.threshold = threshold
        self.to_drop = None  # Set by fit(): list of column labels to remove

    def fit(self, X, y=None):
        """
        Identify features to drop based on the correlation threshold.

        Only numeric columns take part in the correlation; non-numeric columns
        (e.g. category dtype) are ignored here and therefore never dropped.
        For every pair above the threshold the column that appears later is dropped.

        Parameters:
        - X: DataFrame or array-like of features
        - y: Ignored; present for pipeline compatibility

        Returns:
        - self
        """
        # numeric_only=True avoids the error newer pandas raises on non-numeric columns
        corr_matrix = pd.DataFrame(X).corr(numeric_only=True).abs()

        # Keep the strict upper triangle so every pair is inspected exactly once
        upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
        upper_tri = corr_matrix.where(upper_mask)

        exceeds = (upper_tri > self.threshold).any(axis=0)
        self.to_drop = exceeds[exceeds].index.tolist()

        print(f"DropHighCorrelation: {len(self.to_drop)} columns will be dropped due to correlation (threshold={self.threshold}).")
        return self

    def transform(self, X):
        """
        Drop the identified features from the dataset.

        Returns:
        - DataFrame without the columns selected in fit() (missing columns are ignored)

        Raises:
        - ValueError: if fit() has not been called
        """
        if self.to_drop is None:
            raise ValueError("DropHighCorrelation has not been fitted. Call fit() first.")
        return pd.DataFrame(X).drop(columns=self.to_drop, errors='ignore')

# Main ModelTrainer class
class ModelTrainer:
    """Train a fixed set of models on several datasets and tabulate their metrics.

    For every dataset, each candidate model is wrapped in a pipeline
    (scaling / one-hot encoding -> DropHighCorrelation -> model), fitted on
    the training split and scored on both the training and the test split.
    """

    def __init__(self, datasets_X, datasets_y, task_type, dataset_names=None, *, cutoff_source='test'):
        """
        Initializes the ModelTrainer with datasets, explicitly provided task type, and dataset names.

        Parameters:
        - datasets_X: List of [X_train, X_test] for different datasets
        - datasets_y: List of [y_train, y_test] for different datasets
        - task_type: A string explicitly specifying the task type, either 'regression' or 'classification'.
        - dataset_names: List of names for the datasets to be used as index in the results DataFrame.
          When omitted (or empty) the datasets are labelled 'Dataset 1', 'Dataset 2', ...
        - cutoff_source: Which split the classification cutoff is tuned on, 'test' or 'train'.
          'test' (the default) keeps the historical behaviour.
          NOTE: with 'test' the threshold is chosen using the test labels, so the
          thresholded test metrics (F1, accuracy, recall, precision) are optimistic;
          use 'train' for a cutoff that does not see the test labels.

        Raises:
        - ValueError: for an unknown task_type or cutoff_source, when datasets_X and
          datasets_y differ in length, or when fewer names than datasets are given.
        """
        if task_type not in ['classification', 'regression']:
            raise ValueError("task_type must be either 'classification' or 'regression'")
        if cutoff_source not in ['test', 'train']:
            raise ValueError("cutoff_source must be either 'test' or 'train'")

        # zip() would silently ignore the surplus entries of the longer list.
        if len(datasets_X) != len(datasets_y):
            raise ValueError("datasets_X and datasets_y must contain the same number of datasets")

        # Fail before any model is trained rather than with an IndexError
        # part-way through train_models().
        if dataset_names and len(dataset_names) < len(datasets_X):
            raise ValueError("dataset_names must provide a name for every dataset")

        self.datasets_X = datasets_X
        self.datasets_y = datasets_y
        self.task_type = task_type
        self.dataset_names = dataset_names
        self.cutoff_source = cutoff_source

    def _select_model(self):
        """Return a dict mapping model name to an unfitted estimator for the task type."""
        if self.task_type == 'classification':
            return {
                'RandomForest': RandomForestClassifier(random_state=42),
                'GradientBoosting': GradientBoostingClassifier(random_state=42),
                'LogisticRegression': LogisticRegression(),
                'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
            }
        elif self.task_type == 'regression':
            return {
                'RandomForest': RandomForestRegressor(random_state=42),
                'GradientBoosting': GradientBoostingRegressor(random_state=42),
                'LinearRegression': LinearRegression(),
                'XGBoost': xgb.XGBRegressor(random_state=42)
            }

    def _create_pipeline(self, model, X):
        """
        Create a preprocessing and modeling pipeline.

        Parameters:
        - model: Estimator placed as the final pipeline step.
        - X: DataFrame whose column dtypes decide how each column is preprocessed.

        Returns the (unfitted) Pipeline.
        """
        # Identify categorical and numerical features.
        categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
        # Select every numeric dtype plus booleans. Restricting this to
        # int64/float64 meant columns such as int32, float32, uint8 or bool
        # matched neither list and were silently dropped by the
        # ColumnTransformer (its default is to drop unlisted columns).
        numerical_features = X.select_dtypes(include=['number', 'bool']).columns.tolist()

        # Preprocessing for numerical features: Standard scaling
        numerical_transformer = StandardScaler()

        # Preprocessing for categorical features: One-hot encoding
        categorical_transformer = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)

        # Combine preprocessors in a column transformer
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        # Define a pipeline with preprocessing, correlation dropping, and the specified model
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),  # First preprocess
            ('drop_high_corr', DropHighCorrelation(threshold=0.6)),  # Then drop highly correlated features
            ('model', model)  # Finally, apply the model
        ])
        return pipeline

    def _get_best_cutoff(self, y_true, y_pred_proba):
        """Return the ROC threshold that maximises Youden's J statistic (TPR - FPR)."""
        fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
        youden_index = tpr - fpr
        best_cutoff = thresholds[np.argmax(youden_index)]
        return best_cutoff

    def _train_and_evaluate(self, model, X_train, X_test, y_train, y_test):
        """
        Train the model and evaluate it on both the training and test datasets.

        Returns a dict of metrics. For classification: F1, AUC, accuracy, recall
        and precision on both splits plus the chosen cutoff. For regression:
        MSE, MAPE and R2 on both splits. The fit duration is included in both cases.
        """
        start_time = time.time()
        model.fit(X_train, y_train)
        training_time = time.time() - start_time

        if self.task_type == 'classification':
            # Score with the probability of the second class when available,
            # otherwise fall back to the predicted labels.
            if hasattr(model, 'predict_proba'):
                y_pred_proba_train = model.predict_proba(X_train)[:, 1]
                y_pred_proba_test = model.predict_proba(X_test)[:, 1]
            else:
                y_pred_proba_train = model.predict(X_train)
                y_pred_proba_test = model.predict(X_test)

            # The same cutoff is applied to both splits; cutoff_source decides
            # which split it is tuned on (see __init__ for the trade-off).
            if self.cutoff_source == 'train':
                best_cutoff = self._get_best_cutoff(y_train, y_pred_proba_train)
            else:
                best_cutoff = self._get_best_cutoff(y_test, y_pred_proba_test)
            y_pred_class_test = (y_pred_proba_test >= best_cutoff).astype(int)
            y_pred_class_train = (y_pred_proba_train >= best_cutoff).astype(int)

            metrics = {
                'Train F1': f1_score(y_train, y_pred_class_train),
                'Test F1': f1_score(y_test, y_pred_class_test),
                'Train AUC': roc_auc_score(y_train, y_pred_proba_train),
                'Test AUC': roc_auc_score(y_test, y_pred_proba_test),
                'Train Accuracy': accuracy_score(y_train, y_pred_class_train),
                'Test Accuracy': accuracy_score(y_test, y_pred_class_test),
                'Train Recall': recall_score(y_train, y_pred_class_train),
                'Test Recall': recall_score(y_test, y_pred_class_test),
                'Train Precision': precision_score(y_train, y_pred_class_train),
                'Test Precision': precision_score(y_test, y_pred_class_test),
                'Best Cutoff': best_cutoff,
                'Training Time (seconds)': training_time
            }
        else:
            y_pred_train = model.predict(X_train)
            y_pred_test = model.predict(X_test)

            metrics = {
                'Train MSE': mean_squared_error(y_train, y_pred_train),
                'Test MSE': mean_squared_error(y_test, y_pred_test),
                'Train MAPE': mean_absolute_percentage_error(y_train, y_pred_train),
                'Test MAPE': mean_absolute_percentage_error(y_test, y_pred_test),
                'Train R2': r2_score(y_train, y_pred_train),
                'Test R2': r2_score(y_test, y_pred_test),
                'Training Time (seconds)': training_time
            }

        return metrics

    def train_models(self):
        """
        Train and evaluate models on multiple datasets and return a DataFrame of results.

        The result has one row per (dataset, model) pair, is indexed by the
        dataset name and carries the model name in the 'Model' column. With no
        datasets an empty DataFrame of that layout is returned.
        """
        dataset_pairs = list(zip(self.datasets_X, self.datasets_y))
        if not dataset_pairs:
            # set_index('Dataset') would fail on a frame built from no rows.
            return pd.DataFrame(columns=['Dataset', 'Model']).set_index('Dataset')

        models = self._select_model()
        results = []

        for idx, (X_data, y_data) in enumerate(dataset_pairs):
            X_train, X_test = X_data
            y_train, y_test = y_data

            for model_name, model in models.items():
                pipeline = self._create_pipeline(model, X_train)
                metrics = self._train_and_evaluate(pipeline, X_train, X_test, y_train, y_test)
                metrics['Dataset'] = self.dataset_names[idx] if self.dataset_names else f'Dataset {idx+1}'
                metrics['Model'] = model_name
                results.append(metrics)

        return pd.DataFrame(results).set_index('Dataset')

In [None]:
# Benchmark the regression models on the previously prepared train/test splits
trainer = ModelTrainer(
    task_type='regression',
    dataset_names=['All Claims', 'Parkinsons', 'Credit Card Fraud'],
    datasets_X=[
        [X_train1, X_test1],
        [X_train6, X_test6],
        [X_train7, X_test7],
    ],
    datasets_y=[
        [y_train1, y_test1],
        [y_train6, y_test6],
        [y_train7, y_test7],
    ],
)

results = trainer.train_models()

In [None]:
# Keep the index when saving: train_models() stores the dataset name in the
# index ('Dataset'), so index=False wrote rows that could no longer be
# attributed to a dataset.
results.to_csv('regression.csv')

In [None]:
# Display the metrics table returned by trainer.train_models() as the cell output
results

In [None]:
# Benchmark the classification models on the previously prepared train/test splits
trainer = ModelTrainer(
    task_type='classification',
    dataset_names=['Tuandromd', 'Andriod Permissions', 'Student Success', 'Phishing URL'],
    datasets_X=[
        [X_train2, X_test2],
        [X_train3, X_test3],
        [X_train4, X_test4],
        [X_train5, X_test5],
    ],
    datasets_y=[
        [y_train2, y_test2],
        [y_train3, y_test3],
        [y_train4, y_test4],
        [y_train5, y_test5],
    ],
)

results = trainer.train_models()

In [None]:
# Keep the index when saving: train_models() stores the dataset name in the
# index ('Dataset'), so index=False wrote rows that could no longer be
# attributed to a dataset.
results.to_csv('classification.csv')

In [None]:
# Display the metrics table returned by trainer.train_models() as the cell output
results