### Functions for Modeling<a class="anchor" id="functions"></a>

In [4]:
def select_features(df, columns=None):
    """
    Returns a subset of a DataFrame's columns.

    Parameters:
    - df (DataFrame): The source DataFrame.
    - columns (list or None): Names of the columns to keep. When None, no
      selection is made.

    Returns:
    - DataFrame: `df` itself (the same object, not a copy) when `columns` is
      None, otherwise `df[columns]`.
    """
    # Nothing requested: hand the input straight back
    if columns is None:
        return df

    # Otherwise keep only the requested columns, in the requested order
    return df[columns]

In [5]:
def preprocess_binary_responses(df):
    """
    Converts binary text labels in a DataFrame into 0/1 codes.

    Parameters:
    - df (DataFrame): The input DataFrame. It must contain a
      'depression_status' column.

    Returns:
    DataFrame: A new DataFrame in which every 'Yes'/'No' cell is 1/0 and
    'depression_status' holds 1 for 'depressed' and 0 for 'not_depressed'.
    """
    # Replace every 'Yes'/'No' cell, in any column, with 1/0.
    # replace() returns a new frame, so the caller's DataFrame is not modified.
    encoded = df.replace({'Yes': 1, 'No': 0})

    # Encode the target column; values missing from the mapping become NaN
    status_codes = {'depressed': 1, 'not_depressed': 0}
    encoded['depression_status'] = encoded['depression_status'].map(status_codes)

    return encoded

In [6]:
def one_hot_encode_object_columns(df):
    """
    One-hot encodes every object-dtype column of a DataFrame except 'depression_status'.

    Parameters:
    - df (DataFrame): The input DataFrame containing categorical columns.

    Returns:
    DataFrame: The input with each encoded column replaced by indicator
    columns. A summary of the result is also printed via `DataFrame.info()`.
    """
    # The target column is never encoded
    target = ['depression_status']

    # Names of the object-dtype columns, minus the target
    categorical = df.select_dtypes(include=['object']).columns.difference(target)

    # Expand each of those columns into indicator columns prefixed with its own name
    encoded = pd.get_dummies(df, columns=categorical, prefix=categorical)

    # Print a summary of the encoded frame
    encoded.info()

    return encoded

In [7]:
def normalized_data(df):
    """
    Rescales each column of a DataFrame to the [0, 1] range (min-max normalization).

    Parameters:
    - df (DataFrame): The input DataFrame containing numerical columns.

    Returns:
    DataFrame: DataFrame of the same shape where each column has been mapped
    with (value - column minimum) / (column maximum - column minimum).
    A constant column has a zero range and therefore comes out as NaN.
    """
    # Reduce along axis 0 explicitly. np.min(df) / np.max(df) forward axis=None
    # to pandas, which on pandas >= 2.0 reduces over the whole frame and returns
    # a single scalar, so every column would be scaled by the global min and max.
    col_min = df.min(axis=0)
    col_range = df.max(axis=0) - col_min

    return (df - col_min) / col_range

In [8]:
def std_data(df):
    """
    Standardizes every column of a DataFrame except 'depression_status'.

    Parameters:
    - df (DataFrame): The input DataFrame. It must contain a
      'depression_status' column; all other columns are passed to
      StandardScaler.

    Returns:
    DataFrame: The scaled feature columns, carrying the original row index,
    followed by the unchanged 'depression_status' column.
    """
    target_name = 'depression_status'

    # Everything except the target gets scaled
    feature_frame = df.drop(target_name, axis=1)

    # Fit the scaler on these features and transform them in one step
    scaled_array = StandardScaler().fit_transform(feature_frame)

    # Rebuild a DataFrame with the original column names and row labels
    result = pd.DataFrame(scaled_array, columns=feature_frame.columns, index=df.index)

    # Re-attach the untouched target as the last column
    result[target_name] = df[target_name]

    return result

In [9]:
def split_data(df, target_column='depression_status', test_size=0.3, random_state=42):
    """
    Separates features from the target and splits both into training and test sets.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - target_column (str, optional): Name of the column to predict. Default is 'depression_status'.
    - test_size (float, optional): Value forwarded to train_test_split as `test_size`. Default is 0.3.
    - random_state (int, optional): Value forwarded to train_test_split as `random_state`. Default is 42.

    Returns:
    tuple: X_train, X_test, y_train, y_test, where
      - X_train (DataFrame): Training features.
      - X_test (DataFrame): Test features.
      - y_train (Series): Training target.
      - y_test (Series): Test target.
    """
    # Features are every column but the target; the target is that single column
    features = df.drop(target_column, axis=1)
    target = df[target_column]

    # train_test_split yields (X_train, X_test, y_train, y_test) in that order
    parts = train_test_split(features, target, test_size=test_size, random_state=random_state)

    return tuple(parts)

In [10]:
def oversample_data(X_train, y_train, strategy='minority', random_state=42):
    """
    Resamples the training set with RandomOverSampler and prints the class counts.

    Parameters:
    - X_train (DataFrame): Training features.
    - y_train (Series): Training target.
    - strategy (str or float, optional): Value forwarded to RandomOverSampler as `sampling_strategy`. Default is 'minority'.
    - random_state (int, optional): Seed forwarded to RandomOverSampler. Default is 42.

    Returns:
    tuple: X_train_oversampled, y_train_oversampled, where
      - X_train_oversampled (DataFrame): Oversampled training features.
      - y_train_oversampled (Series): Oversampled training target.
    """
    # Build the sampler and resample features and target together
    sampler = RandomOverSampler(sampling_strategy=strategy, random_state=random_state)
    resampled = sampler.fit_resample(X_train, y_train)
    X_train_oversampled, y_train_oversampled = resampled

    # Report the class distribution before and after resampling
    counts_before = sorted(Counter(y_train).items())
    print(f"Before oversampling: {counts_before}")
    counts_after = sorted(Counter(y_train_oversampled).items())
    print(f"After oversampling: {counts_after}")

    return X_train_oversampled, y_train_oversampled

In [11]:
def undersample_data(X_train, y_train):
    """
    Undersamples the training set with Near-Miss, then cleans the result with Tomek Links.

    The two samplers are chained: Tomek Links is applied to the output of
    Near-Miss, and that chained result is what is returned. Both samplers
    are used with their default settings.

    Parameters:
    - X_train (DataFrame): Training features.
    - y_train (Series): Training target.

    Returns:
    tuple: X_train_combined, y_train_combined, where
      - X_train_combined (DataFrame): Undersampled training features.
      - y_train_combined (Series): Undersampled training target.
    """

    # Step 1: Near-Miss to undersample the majority class
    near_miss = NearMiss()
    X_train_nm, y_train_nm = near_miss.fit_resample(X_train, y_train)

    # Step 2: Tomek Links on the Near-Miss output to drop samples that sit
    # right next to a sample of the other class.
    # (Previously this result was discarded and SMOTETomek, which over-samples
    # with SMOTE before cleaning, was run on the original data instead.)
    tomek_links = TomekLinks()
    X_train_combined, y_train_combined = tomek_links.fit_resample(X_train_nm, y_train_nm)

    print(f"Before undersampling: {sorted(Counter(y_train).items())}")
    print(f"After undersampling: {sorted(Counter(y_train_combined).items())}")

    return X_train_combined, y_train_combined

In [12]:
def random_search(model, X_train, y_train, param_distributions, n_iter=10, cv=5,
                  X_test=None, y_test=None, random_state=None):
    """
    Performs a random search for hyperparameters for a given model.

    Parameters:
    - model: The machine learning model to be optimized.
    - X_train (DataFrame): Training features.
    - y_train (Series): Training target.
    - param_distributions (dict): Dictionary with parameter names as keys and distributions or lists of parameters to sample from.
    - n_iter (int, optional): Number of parameter settings that are sampled. Default is 10.
    - cv (int, optional): Number of cross-validation folds. Default is 5.
    - X_test (DataFrame or None, optional): Test features. When given together with `y_test`,
      the score of the best estimator on this set is printed. Default is None.
    - y_test (Series or None, optional): Test target. Default is None.
    - random_state (int or None, optional): Seed forwarded to RandomizedSearchCV so the sampled
      parameter settings can be reproduced. Default is None.

    Returns:
    BestEstimator: Best estimator found during the random search.
    """

    # Performs a random search for hyperparameters for a given model
    search = RandomizedSearchCV(model, param_distributions, n_iter=n_iter, cv=cv,
                                random_state=random_state)
    search.fit(X_train, y_train)

    print(f"Best hyperparameters: {search.best_params_}")

    # The test set is an explicit argument rather than a notebook global, so the
    # function also works when no X_test / y_test variables exist.
    if X_test is not None and y_test is not None:
        print(f"Accuracy of the best model: {search.best_estimator_.score(X_test, y_test):.2f}")
    else:
        # No test set supplied: report the cross-validation score of the best candidate
        print(f"Cross-validation score of the best model: {search.best_score_:.2f}")

    return search.best_estimator_

In [13]:
def grid_search(model, X_train, y_train, param_distributions, cv=5, scoring='accuracy',
                X_test=None, y_test=None):
    """
    Performs a grid search for hyperparameters for a given model.

    Parameters:
    - model: The machine learning model to be optimized.
    - X_train (DataFrame): Training features.
    - y_train (Series): Training target.
    - param_distributions (dict): Dictionary with parameter names as keys and lists of parameters to search.
    - cv (int, optional): Number of cross-validation folds. Default is 5.
    - scoring (str, optional): Scoring metric for model evaluation. Default is 'accuracy'.
    - X_test (DataFrame or None, optional): Test features. When given together with `y_test`,
      the score of the best estimator on this set is printed. Default is None.
    - y_test (Series or None, optional): Test target. Default is None.

    Returns:
    BestEstimator: Best estimator found during the grid search.
    """

    # Performs an exhaustive grid search for hyperparameters for a given model
    search = GridSearchCV(model, param_distributions, cv=cv, scoring=scoring)
    search.fit(X_train, y_train)

    print(f"Best hyperparameters: {search.best_params_}")

    # The test set is an explicit argument rather than a notebook global, so the
    # function also works when no X_test / y_test variables exist.
    if X_test is not None and y_test is not None:
        print(f"Accuracy of the best model: {search.best_estimator_.score(X_test, y_test):.2f}")
    else:
        # No test set supplied: report the cross-validation score under `scoring`
        print(f"Cross-validation score of the best model ({scoring}): {search.best_score_:.2f}")

    return search.best_estimator_

In [7]:
def print_confusion_matrix(model, X_test, y_test):
    """
    Plots the confusion matrix of a model on the test set and prints its four cells.

    The matrix is drawn as a heatmap with true labels on the y-axis and
    predicted labels on the x-axis. The printed counts read cells [0][0],
    [1][1], [0][1] and [1][0], i.e. they assume a two-class problem.

    Parameters:
    - model: The trained machine learning model.
    - X_test: The feature matrix of the test set.
    - y_test: The true labels of the test set.
    """
    # Confusion matrix of the model's predictions against the true labels
    cm = confusion_matrix(y_test, model.predict(X_test))

    # Draw the matrix as an annotated heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()

    # (message template, true-label row, predicted-label column), in print order
    cell_reports = (
        ('True Negatives: {}', 0, 0),
        ('True Positives: {}', 1, 1),
        ('False Positives: {}', 0, 1),
        ('False Negatives: {}', 1, 0),
    )
    for template, true_idx, pred_idx in cell_reports:
        print(template.format(cm[true_idx][pred_idx]))


In [14]:
def print_classification_report(model, X_test, y_test, title=''):
    """
    Prints a titled classification report for a model evaluated on the test set.

    Parameters:
    - model: The trained machine learning model.
    - X_test (DataFrame): Test features.
    - y_test (Series): Test target.
    - title (str, optional): Text shown in the report header. Default is an empty string.
    """
    # Header: the title framed by two horizontal rules
    rule = "_"*60
    print(rule)
    print(f"\nCLASSIFICATION REPORT FOR: {title}")
    print(rule)

    # Body: the report for the model's predictions
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, zero_division=1)
    print(report)
    

In [15]:
def plot_precision_recall_curve(model, X_test, y_test, title=''):
    """
    Plots the precision-recall curve for a given model on the test set and prints its area.

    The curve is built from continuous scores when the model offers them
    (`predict_proba`, then `decision_function`); hard class predictions are
    used only as a last resort, since they give a curve with a single
    operating point.

    Parameters:
    - model: The trained machine learning model.
    - X_test (DataFrame): Test features.
    - y_test (Series): Test target.
    - title (str, optional): Title for the precision-recall curve. Default is an empty string.
    """

    # Pick the most informative score the model can provide
    if hasattr(model, "predict_proba"):
        # NOTE(review): assumes a binary problem whose positive class is column 1 - confirm
        scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_test)
    else:
        scores = model.predict(X_test)

    precision, recall, _ = precision_recall_curve(y_test, scores)

    # Area under the PR curve as a step-wise sum (average precision).
    # recall is returned in decreasing order, so the differences are negative.
    # (Previously the accuracy was printed under the "AUC-PR" label.)
    auc_pr = -np.sum(np.diff(recall) * precision[:-1])
    print(f"AUC-PR: {auc_pr:.2f}")

    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='b', lw=2, label='PR Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'PRECISION CURVE FOR: {title}')
    plt.legend(loc='lower left')
    plt.grid(True)
    plt.show()

In [16]:
def analyze_results(model, X_train, X_test, y_test, title=''):
    """
    Runs the standard evaluation outputs for a model: the classification
    report followed by the precision-recall curve.

    Parameters:
    - model: The trained machine learning model.
    - X_train (DataFrame): Training features. Accepted but not used by this function.
    - X_test (DataFrame): Test features.
    - y_test (Series): Test target.
    - title (str, optional): Title passed to both outputs. Default is an empty string.
    """
    # Both reporters share the same call signature, so run them in order
    reporters = (print_classification_report, plot_precision_recall_curve)
    for reporter in reporters:
        reporter(model, X_test, y_test, title=title)

In [1]:
def lime_explanation(model, X_test, y_test, start_index=0, end_index=None, X_train=None):
    """
    Generates and displays LIME explanations for a machine learning model.

    Parameters:
    - model: The trained machine learning model. It must expose `predict_proba`.
    - X_test (DataFrame): Test features.
    - y_test (Series): Test target.
    - start_index (int, optional): Starting index for the data subset. Default is 0.
    - end_index (int or None, optional): Ending index for the data subset. Default is None.
    - X_train (DataFrame or None, optional): Data handed to the LIME explainer as its
      `data` argument. When None, `X_test` is used instead. Default is None.
    """

    # The explainer's data is an explicit argument rather than a notebook
    # global; fall back to the test features when none is supplied.
    background = X_test if X_train is None else X_train

    # Create a LIME explainer
    explainer = LimeTabular(model=model.predict_proba, data=background, random_state=1)

    # Explain the local model for the selected rows (positional slice)
    rows = slice(start_index, end_index)
    lime_local = explainer.explain_local(X_test.iloc[rows],
                                         y_test.iloc[rows],
                                         name='LIME')

    # Show the LIME explanation
    show(lime_local)

In [6]:
def lime_tabular_explainer(model, X_train, instance, num_features, X_test=None):
    """
    Generate and visualize a LIME (Local Interpretable Model-agnostic Explanations) explanation for a tabular dataset.

    Parameters:
    - model: The trained machine learning model to be explained. It must expose `predict_proba`.
    - X_train: The training data used to train the model. It should be a pandas DataFrame.
    - instance: Positional index of the row to explain. The row is taken from `X_test`
      when that is given, otherwise from `X_train`.
    - num_features: Number of features to include in the explanation.
    - X_test (optional): DataFrame to take the explained row from. Default is None.

    Returns:
    None
    """
    # Create the explainer
    explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values,
                                                      class_names=['Not Depressed', 'Depressed'],
                                                      feature_names=X_train.columns.tolist()
                                                      )
    # Choose the specific instance for explanation. The source frame is an
    # explicit argument rather than a notebook global named X_test.
    source = X_train if X_test is None else X_test
    instance_to_explain = source.iloc[instance]
    # Generate the explanation
    explanation = explainer.explain_instance(instance_to_explain.values, model.predict_proba, num_features=num_features)
    # Visualize the explanation
    print(f"Instance: {instance}")
    explanation.show_in_notebook(show_all=True)


In [2]:
def shap_explanation(model, X_test, start_index=0, end_index=None):
    """
    Generates and displays SHAP explanations for a machine learning model.

    Prints the prediction for the first selected row, shows a force plot for
    the selected rows and a summary plot of their SHAP values.

    Parameters:
    - model: The trained machine learning model (passed to shap.TreeExplainer).
    - X_test (DataFrame): Test features.
    - start_index (int, optional): Starting index for the data subset. Default is 0.
    - end_index (int or None, optional): Ending index for the data subset. Default is None.
    """

    # If end_index is None, take all indices from start_index to the end.
    # An explicit None check keeps end_index=0 from being treated as "unset".
    if end_index is None:
        end_index = len(X_test)

    # Select the rows once, positionally, and reuse the same subset everywhere
    subset = X_test.iloc[start_index:end_index]

    # Create a SHAP explainer object
    explainer = shap.TreeExplainer(model)

    # Calculate SHAP values for the selected data
    # NOTE(review): indexing with [1] below assumes shap_values and
    # expected_value hold one entry per class - confirm for the installed shap version
    shap_values = explainer.shap_values(subset)

    # Prediction for the first selected row
    prediction = model.predict(subset)[0]
    print(f"Prediction: {prediction}")

    # Display the SHAP force plot
    if len(subset) == 1:
        # Single row: static matplotlib rendering (the flag must be a boolean)
        shap.force_plot(explainer.expected_value[1], shap_values[1], subset, matplotlib=True)
    else:
        # Several rows: interactive plot
        display(shap.force_plot(explainer.expected_value[1], shap_values[1], subset, matplotlib=False))

    # The summary plot must receive the same rows the SHAP values were computed for
    shap.summary_plot(shap_values, subset)