# Import libraries

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import joblib
import plotly.graph_objects as go

import time
import os

In [None]:
# Mount Google Drive into the runtime's filesystem at /content/drive.
# NOTE(review): google.colab is presumably only available inside a Colab
# session — confirm before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the CSV into the df_95 DataFrame.
# NOTE(review): the path sits directly under /content, not under the
# /content/drive mount point — confirm the file is placed there before running.
df_95 = pd.read_csv('/content/df_95_labeled_on_lift.csv')


In [None]:
 # Switch on pandas' copy-on-write mode for the rest of the session.
 pd.options.mode.copy_on_write = True

# Function - Misclassification Logic Work

In [None]:
# function for masking misclassification (logic)

def misclassification_mask(df, column_to_mask, chunk_size, threshold):
    """
    Assign one binary value to every row of each fixed-size chunk of a DataFrame.

    The rows are walked in consecutive, non-overlapping chunks of `chunk_size`
    rows (the last chunk may be shorter). Every row of a chunk gets 1 when the
    chunk's mean of `column_to_mask` is >= `threshold`, otherwise 0.

    Parameters:
    - df: pandas.DataFrame, the DataFrame to process. It is modified in place:
      a 'mask' column is added (or overwritten).
    - column_to_mask: str, the name of the numeric column to evaluate.
    - chunk_size: int, the number of rows in each chunk; must be positive.
    - threshold: float, the minimum chunk mean for which the mask value is 1.

    Returns:
    - pandas.DataFrame, the result of `df.reset_index()` after the 'mask' column
      has been added, i.e. a new frame in which the previous index is a column.

    Raises:
    - ValueError: if `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Work on the raw array: slicing and summing it is done in native code
    # instead of iterating the Series element by element.
    values = df[column_to_mask].to_numpy()
    n_rows = len(values)
    mask = np.zeros(n_rows, dtype=np.int64)

    # Stepping by chunk_size also visits the trailing partial chunk, so the
    # remainder needs no separate handling.
    for start in range(0, n_rows, chunk_size):
        chunk = values[start:start + chunk_size]
        # A chunk containing NaN has a NaN mean, which compares False -> stays 0.
        if chunk.sum() / len(chunk) >= threshold:
            mask[start:start + chunk_size] = 1

    df['mask'] = mask
    return df.reset_index()

# Example usage
# Assuming df_option3 is your DataFrame and already defined.
# df_option3 = pd.DataFrame(...)
# df_with_mask = misclassification_mask(df_option3, 'predicted', 60, 0.3)
# print(df_with_mask)


In [None]:
# function for defining on-lift identification

def on_lift_event_identification(df, column_to_mask, chunk_size, threshold):
    """
    Flag chunks whose mean meets a threshold and number each contiguous run of
    flagged chunks as an event.

    The rows are walked in consecutive, non-overlapping chunks of `chunk_size`
    rows (the last chunk may be shorter). A chunk is flagged when the mean of
    `column_to_mask` over it is >= `threshold`. Adjacent flagged chunks are
    merged into one event; events are numbered 1, 2, ... in row order and are
    printed together with their count.

    Parameters:
    - df: pandas.DataFrame, the DataFrame to process. It is modified in place.
    - column_to_mask: str, the numeric column based on whose values the mask is applied.
    - chunk_size: int, the number of rows in each chunk; must be positive.
    - threshold: float, the minimum chunk mean for a chunk to be flagged.

    Returns:
    - df: pandas.DataFrame, the same object that was passed in, with two added
      (or overwritten) columns: 'mask' (1 for rows of flagged chunks, else 0)
      and 'event' (the event number of the row, 0 for rows outside any event).

    Raises:
    - ValueError: if `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    values = df[column_to_mask].to_numpy()
    n_rows = len(values)
    mask = np.zeros(n_rows, dtype=np.int64)

    # event number -> half-open (start, end) span of row positions
    continuous_events_dict = {}
    run_start = None
    run_end = None

    # Stepping by chunk_size also visits the trailing partial chunk.
    for start in range(0, n_rows, chunk_size):
        chunk = values[start:start + chunk_size]
        # A NaN mean compares False, so such a chunk is not flagged.
        if not (chunk.sum() / len(chunk) >= threshold):
            continue

        end = start + len(chunk)
        mask[start:end] = 1

        if run_end == start:
            # This chunk starts where the current run stops: extend the run.
            run_end = end
        else:
            if run_start is not None:
                continuous_events_dict[len(continuous_events_dict) + 1] = (run_start, run_end)
            run_start, run_end = start, end

    # Close the run that was still open when the rows ran out.
    if run_start is not None:
        continuous_events_dict[len(continuous_events_dict) + 1] = (run_start, run_end)

    print("Continuous events as dictionary:")
    for key, value in continuous_events_dict.items():
        print(f"{key}: {value}")

    print('number of continuous events:', len(continuous_events_dict))

    # Label rows by position with half-open spans, so an event covers exactly
    # the rows of its flagged chunks: no spill-over into the following row, no
    # lookup of a label past the last row, and no dependence on df's index.
    event = np.zeros(n_rows, dtype=np.int64)
    for label, (start, end) in continuous_events_dict.items():
        event[start:end] = label

    df['mask'] = mask
    df['event'] = event

    return df

# Example usage:
# df_option3 = pd.DataFrame(...) # Assuming df_option3 is your DataFrame
# df_on_lift = on_lift_event_identification(df_option3, 'predicted', 60, 0.3)
# print(df_on_lift)
