<a href="https://colab.research.google.com/github/vibhuverma17/MLBASEDSAMPLING/blob/main/Base_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install umap-learn hdbscan gower kmodes XGBoost

In [None]:
# ===============================
# Standard Libraries
# ===============================
import time
import numpy as np
import pandas as pd
import warnings
import ast

# ===============================
# Visualization Libraries
# ===============================
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# ===============================
# Scikit-learn Components
# ===============================
# Data Splitting and Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Models
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    GradientBoostingClassifier, GradientBoostingRegressor,
    IsolationForest
)
from sklearn.linear_model import LinearRegression, LogisticRegression

# Metrics
from sklearn.metrics import (
    f1_score, roc_auc_score, accuracy_score, recall_score, precision_score,
    mean_absolute_error, mean_squared_error, r2_score,
    mean_absolute_percentage_error, roc_curve
)

# Pairwise distances
from sklearn.metrics.pairwise import pairwise_distances

# ===============================
# External Libraries
# ===============================
import xgboost as xgb  # XGBoost library
from scipy.stats import ks_2samp, entropy  # Statistical tests
from kmodes.kprototypes import KPrototypes  # Clustering
import umap  # Dimensionality reduction
import hdbscan  # Density-based clustering
import gower  # Gower similarity for mixed data types

# ===============================
# Configure Warnings
# ===============================
warnings.filterwarnings("ignore")

In [None]:
# !mkdir Data

#### READING DATA

In [None]:
# Load the All Claims data; 'loss' is the prediction target.
train = pd.read_csv('/content/Data/All Claims.csv')

print("Training Data:")

# 'id' is removed before the features and the target are separated.
train = train.drop(columns=['id'])

y = train['loss']
X = train.drop(columns=['loss'])

# Cast every object-dtype feature to pandas' category dtype in one pass.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
X = X.astype({name: 'category' for name in categorical_features})

X.head()

In [None]:
# Hold out 20% of the All Claims rows as the test split (fixed seed).
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load the TUANDROMD data; 'Label' is the prediction target.
train = pd.read_csv('/content/Data/TUANDROMD.csv')

print("Training Data:")

# Keep only labelled rows, then replace every character outside
# [a-zA-Z0-9_] in the column names with an underscore.
train = train[train['Label'].notnull()]
train.columns = train.columns.str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)

y = train['Label']
X = train.drop(columns=['Label'])

# Cast every object-dtype feature to pandas' category dtype in one pass.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
X = X.astype({name: 'category' for name in categorical_features})

X.head()

In [None]:
# Hold out 20% of the TUANDROMD rows, stratified on the label (fixed seed).
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Load the Android Permissions data; 'Result' is the prediction target.
train = pd.read_csv('/content/Data/Andriod Permissions.csv')

print("Training Data:")

# Keep only labelled rows, then replace every character outside
# [a-zA-Z0-9_] in the column names with an underscore.
train = train[train['Result'].notnull()]
train.columns = train.columns.str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)

y = train['Result']
X = train.drop(columns=['Result'])

# Cast every object-dtype feature to pandas' category dtype in one pass.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
X = X.astype({name: 'category' for name in categorical_features})

X.head()

In [None]:
# Hold out 20% of the Android Permissions rows, stratified on the label (fixed seed).
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Read the semicolon-separated Student Success file; 'Target' is the label.
train = pd.read_csv('/content/Data/Student Success.csv', sep=';')

# Keep labelled rows, sanitise the column names (anything outside
# [a-zA-Z0-9_] becomes '_'), then discard the 'Enrolled' class.
train = train[train['Target'].notnull()]
train.columns = train.columns.str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
train = train[train['Target'] != 'Enrolled']

# Replace the remaining class names with LabelEncoder's integer codes.
label_encoder = LabelEncoder()
train = train.assign(Target=label_encoder.fit_transform(train['Target']))

y = train['Target']
X = train.drop(columns=['Target'])

# Cast every object-dtype feature to pandas' category dtype in one pass.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
X = X.astype({name: 'category' for name in categorical_features})

X.head()

In [None]:
# Hold out 20% of the Student Success rows, stratified on the label (fixed seed).
X_train4, X_test4, y_train4, y_test4 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Load the Phishing URL data; 'label' is the prediction target.
train = pd.read_csv('/content/Data/Phishing URL.csv')

# Discard these four columns wherever they are present.
train = train.drop(columns=['FILENAME', 'URL', 'Domain', 'Title'], errors='ignore')

# Keep labelled rows and replace every character outside [a-zA-Z0-9_]
# in the column names with an underscore.
train = train[train['label'].notnull()]
train.columns = train.columns.str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)

# Only the first 10000 remaining rows are used.
train = train.head(10000)

y = train['label']
X = train.drop(columns=['label'])

# Cast every object-dtype feature to pandas' category dtype in one pass.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
X = X.astype({name: 'category' for name in categorical_features})

X.head()

In [None]:
# Hold out 20% of the Phishing URL rows, stratified on the label (fixed seed).
X_train5, X_test5, y_train5, y_test5 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Load the Parkinsons telemonitoring data; 'motor_UPDRS' is the target.
train = pd.read_csv('/content/Data/parkinsons_updrs.data')

print("All Data:")

# Remove the three columns that are not used as features in a single call.
train = train.drop(columns=['subject#', 'test_time', 'total_UPDRS'])

# The target is kept as a one-column DataFrame (double brackets), not a Series.
y = train[['motor_UPDRS']]

# Features: everything except the target, with +/-inf turned into NaN.
X = train.drop(columns=['motor_UPDRS']).replace([np.inf, -np.inf], np.nan)

X.head()

In [None]:
# Hold out 20% of the Parkinsons rows as the test split (fixed seed).
X_train6, X_test6, y_train6, y_test6 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load the credit card data; 'Class' is the target column.
train = pd.read_csv('/content/Data/creditcard.csv')

print("Training Data:")
print(train.shape)

# Split the data into features (X) and target (y). 'Time' is excluded from
# the features here; the previous stand-alone `train.drop(columns=['Time'])`
# discarded its result, so it only made a throwaway copy and was removed.
X = train.drop(columns=['Time', 'Class'])
y = train['Class']

# Convert object-dtype features to pandas' category dtype.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
for feature in categorical_features:
    X[feature] = X[feature].astype('category')

# Preview the feature frame as the cell output.
X.head()

In [None]:
# Hold out 20% of the credit card rows as the test split (fixed seed).
X_train7, X_test7, y_train7, y_test7 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Custom transformer to drop highly correlated features
class DropHighCorrelation(BaseEstimator, TransformerMixin):
    """
    Transformer that removes features strongly correlated with an earlier one.

    During ``fit`` a column is marked for removal when its absolute
    correlation with any column positioned before it exceeds ``threshold``;
    ``transform`` then drops the marked columns.
    """

    def __init__(self, threshold=0.6):
        """
        Store the configuration; nothing is computed until ``fit``.

        Parameters:
        - threshold: Absolute correlation above which the later column of a pair is dropped.
        """
        self.to_drop = None  # populated by fit()
        self.threshold = threshold

    def fit(self, X, y=None):
        """
        Work out which columns to drop and store them in ``self.to_drop``.

        Parameters:
        - X: Data convertible to a DataFrame.
        - y: Unused; accepted so the transformer can sit inside a Pipeline.

        Returns:
        - self
        """
        frame = pd.DataFrame(X)
        abs_corr = frame.corr().abs()

        # Keep only the entries strictly above the diagonal so each pair of
        # columns is looked at once and a column is never compared to itself.
        strictly_upper = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
        pairwise = abs_corr.where(strictly_upper)

        flagged = []
        for name in pairwise.columns:
            if any(pairwise[name] > self.threshold):
                flagged.append(name)
        self.to_drop = flagged

        print(f"DropHighCorrelation: {len(self.to_drop)} columns will be dropped due to correlation (threshold={self.threshold}).")
        return self

    def transform(self, X):
        """
        Return ``X`` as a DataFrame without the columns selected in ``fit``.

        Columns listed in ``self.to_drop`` that are absent from ``X`` are ignored.
        """
        frame = pd.DataFrame(X)
        return frame.drop(columns=self.to_drop, errors='ignore')

# Main ModelTrainer class
class ModelTrainer:
    """
    Train a fixed set of baseline models on several datasets and collect metrics.

    Every model is wrapped in the same pipeline (scaling / one-hot encoding,
    removal of highly correlated features, estimator) and evaluated on the
    train and test split of each dataset.
    """

    def __init__(self, datasets_X, datasets_y, task_type, dataset_names=None):
        """
        Initializes the ModelTrainer with datasets, explicitly provided task type, and dataset names.

        Parameters:
        - datasets_X: List of [X_train, X_test] for different datasets
        - datasets_y: List of [y_train, y_test] for different datasets
        - task_type: A string explicitly specifying the task type, either 'regression' or 'classification'.
        - dataset_names: List of names for the datasets to be used as index in the results DataFrame.
          When None or empty, names of the form 'Dataset <n>' are generated.

        Raises:
        - ValueError: if task_type is not supported, if datasets_X and datasets_y differ
          in length, or if dataset_names is given but has fewer entries than datasets.
        """
        if task_type not in ['classification', 'regression']:
            raise ValueError("task_type must be either 'classification' or 'regression'")

        # train_models pairs the two lists with zip(), which would otherwise
        # silently ignore the surplus entries of the longer list.
        if len(datasets_X) != len(datasets_y):
            raise ValueError(
                "datasets_X and datasets_y must have the same length "
                f"(got {len(datasets_X)} and {len(datasets_y)})"
            )

        # Fail before any model is trained rather than with an IndexError
        # part-way through the run.
        if dataset_names and len(dataset_names) < len(datasets_X):
            raise ValueError(
                "dataset_names must provide a name for every dataset "
                f"(got {len(dataset_names)} names for {len(datasets_X)} datasets)"
            )

        self.datasets_X = datasets_X
        self.datasets_y = datasets_y
        self.task_type = task_type
        self.dataset_names = dataset_names

    def _select_model(self):
        """Return a dict mapping model name to an unfitted estimator for the task type."""
        if self.task_type == 'classification':
            return {
                'RandomForest': RandomForestClassifier(random_state=42),
                'GradientBoosting': GradientBoostingClassifier(random_state=42),
                'LogisticRegression': LogisticRegression(),
                'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
            }
        elif self.task_type == 'regression':
            return {
                'RandomForest': RandomForestRegressor(random_state=42),
                'GradientBoosting': GradientBoostingRegressor(random_state=42),
                'LinearRegression': LinearRegression(),
                'XGBoost': xgb.XGBRegressor(random_state=42)
            }

    def _create_pipeline(self, model, X):
        """
        Create a preprocessing and modeling pipeline.

        Parameters:
        - model: Estimator placed at the end of the pipeline.
        - X: DataFrame whose column dtypes decide which columns are scaled
          and which are one-hot encoded.

        Returns:
        - An unfitted sklearn Pipeline.
        """
        # Identify categorical and numerical features. 'number' covers every
        # numeric dtype (int32, float32, uint8, ...), not just int64/float64;
        # columns matched by neither selector are left out by the
        # ColumnTransformer (its default remainder is 'drop').
        categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
        numerical_features = X.select_dtypes(include='number').columns.tolist()

        # Preprocessing for numerical features: Standard scaling
        numerical_transformer = StandardScaler()

        # Preprocessing for categorical features: One-hot encoding
        categorical_transformer = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)

        # Combine preprocessors in a column transformer
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        # Define a pipeline with preprocessing, correlation dropping, and the specified model
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),  # First preprocess
            ('drop_high_corr', DropHighCorrelation(threshold=0.6)),  # Then drop highly correlated features
            ('model', model)  # Finally, apply the model
        ])
        return pipeline

    def _get_best_cutoff(self, y_true, y_pred_proba):
        """
        Use Youden's J statistic (TPR - FPR) to pick a classification cutoff.

        Parameters:
        - y_true: True binary labels.
        - y_pred_proba: Scores for the positive class.

        Returns:
        - The threshold from the ROC curve that maximises TPR - FPR.
        """
        fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
        youden_index = tpr - fpr
        # roc_curve puts an artificial threshold above every score at index 0
        # (the "predict nothing as positive" point, J == 0). Skip it so the
        # returned cutoff is always one of the actual scores.
        best_idx = np.argmax(youden_index[1:]) + 1
        return thresholds[best_idx]

    def _train_and_evaluate(self, model, X_train, X_test, y_train, y_test):
        """
        Train the model and evaluate it on both the training and test datasets.

        For classification the decision cutoff is chosen on the training
        predictions only and then applied unchanged to the test predictions,
        so no test labels are used to tune the reported test metrics.

        Returns:
        - A dict of metric name -> value, including the training time in seconds.
        """
        start_time = time.time()
        model.fit(X_train, y_train)
        training_time = time.time() - start_time

        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        if self.task_type == 'classification':
            # Fall back to the hard predictions when no probabilities are available.
            y_pred_proba_test = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else y_pred_test
            y_pred_proba_train = model.predict_proba(X_train)[:, 1] if hasattr(model, 'predict_proba') else y_pred_train

            # Tune the cutoff on the training split; selecting it with y_test
            # would leak test information into every thresholded test metric.
            best_cutoff = self._get_best_cutoff(y_train, y_pred_proba_train)
            y_pred_class_test = (y_pred_proba_test >= best_cutoff).astype(int)
            y_pred_class_train = (y_pred_proba_train >= best_cutoff).astype(int)

            metrics = {
                'Train F1': f1_score(y_train, y_pred_class_train),
                'Test F1': f1_score(y_test, y_pred_class_test),
                'Train AUC': roc_auc_score(y_train, y_pred_proba_train),
                'Test AUC': roc_auc_score(y_test, y_pred_proba_test),
                'Train Accuracy': accuracy_score(y_train, y_pred_class_train),
                'Test Accuracy': accuracy_score(y_test, y_pred_class_test),
                'Train Recall': recall_score(y_train, y_pred_class_train),
                'Test Recall': recall_score(y_test, y_pred_class_test),
                'Train Precision': precision_score(y_train, y_pred_class_train),
                'Test Precision': precision_score(y_test, y_pred_class_test),
                'Best Cutoff': best_cutoff,
                'Training Time (seconds)': training_time
            }
        else:
            metrics = {
                'Train MSE': mean_squared_error(y_train, y_pred_train),
                'Test MSE': mean_squared_error(y_test, y_pred_test),
                'Train MAPE': mean_absolute_percentage_error(y_train, y_pred_train),
                'Test MAPE': mean_absolute_percentage_error(y_test, y_pred_test),
                'Train R2': r2_score(y_train, y_pred_train),
                'Test R2': r2_score(y_test, y_pred_test),
                'Training Time (seconds)': training_time
            }

        return metrics

    def train_models(self):
        """
        Train and evaluate models on multiple datasets and return a DataFrame of results.

        Returns:
        - A DataFrame indexed by 'Dataset' with one row per (dataset, model)
          pair; an empty DataFrame with that index name when there are no datasets.
        """
        dataset_pairs = list(zip(self.datasets_X, self.datasets_y))
        if not dataset_pairs:
            # set_index('Dataset') would raise KeyError on an empty frame.
            return pd.DataFrame(index=pd.Index([], name='Dataset'))

        models = self._select_model()
        results = []

        for idx, (X_data, y_data) in enumerate(dataset_pairs):
            X_train, X_test = X_data
            y_train, y_test = y_data

            for model_name, model in models.items():
                pipeline = self._create_pipeline(model, X_train)
                metrics = self._train_and_evaluate(pipeline, X_train, X_test, y_train, y_test)
                metrics['Dataset'] = self.dataset_names[idx] if self.dataset_names else f'Dataset {idx+1}'
                metrics['Model'] = model_name
                results.append(metrics)

        return pd.DataFrame(results).set_index('Dataset')

In [None]:
# Benchmark the regression model set on splits 1, 6 and 7 created above.
# NOTE(review): split 7 comes from the credit card cell whose target is the
# 'Class' column, yet it is evaluated here as a regression task - confirm
# this is intended.
trainer = ModelTrainer(
    task_type='regression',
    dataset_names=['All Claims', 'Parkinsons', 'Credit Card Fraud'],
    datasets_X=[
        [X_train1, X_test1],
        [X_train6, X_test6],
        [X_train7, X_test7],
    ],
    datasets_y=[
        [y_train1, y_test1],
        [y_train6, y_test6],
        [y_train7, y_test7],
    ],
)

results = trainer.train_models()

In [None]:
# train_models() returns a frame indexed by 'Dataset'; the index is written
# (pandas' default) so every CSV row stays attributable to its dataset.
# With index=False the dataset names were dropped from the file.
results.to_csv('regression.csv')

In [None]:
# Show the `results` DataFrame as the cell output.
results

In [None]:
# Benchmark the classification model set on splits 2-5 created above.
trainer = ModelTrainer(
    task_type='classification',
    dataset_names=['Tuandromd', 'Andriod Permissions', 'Student Success', 'Phishing URL'],
    datasets_X=[
        [X_train2, X_test2],
        [X_train3, X_test3],
        [X_train4, X_test4],
        [X_train5, X_test5],
    ],
    datasets_y=[
        [y_train2, y_test2],
        [y_train3, y_test3],
        [y_train4, y_test4],
        [y_train5, y_test5],
    ],
)

results = trainer.train_models()

In [None]:
# train_models() returns a frame indexed by 'Dataset'; the index is written
# (pandas' default) so every CSV row stays attributable to its dataset.
# With index=False the dataset names were dropped from the file.
results.to_csv('classification.csv')

In [None]:
# Show the `results` DataFrame as the cell output.
results