In [5]:
import joblib
import pandas as pd

# Report the directory the kernel started in.
import os
cwd = os.getcwd()
print(f"Current working directory: {cwd}")

# Switch to the directory that the following cells load the model and tables from
# (they use bare relative file names). Set the CAPSTONE_MODEL_DIR environment
# variable to run the notebook on another machine; when it is unset, the
# original location is used.
MODEL_OUTPUT_DIR = os.environ.get(
    'CAPSTONE_MODEL_DIR',
    '/Users/ulisesgordillo/Downloads/capstone/model_output/second_model',
)
os.chdir(MODEL_OUTPUT_DIR)  # raises FileNotFoundError if the directory does not exist
cwd = os.getcwd()
print(f"Current working directory: {cwd}")







Current working directory: /Users/ulisesgordillo/Downloads/capstone/model_output/second_model
Current working directory: /Users/ulisesgordillo/Downloads/capstone/model_output/second_model


In [6]:


def load_table_with_csv_fallback(label, stem, datetime_columns=()):
    """Read ``<stem>.parquet``, falling back to ``<stem>.csv`` if that fails.

    Parameters
    ----------
    label : str
        Name used in the error message (e.g. ``'X_test'``).
    stem : str
        File path without extension.
    datetime_columns : iterable of str, optional
        Columns to parse as datetimes when the CSV fallback is used
        (unparseable values become NaT). Columns absent from the file are
        skipped. The Parquet result is returned as read, without parsing.

    Returns
    -------
    pandas.DataFrame or None
        The loaded table, or ``None`` when both formats failed. In that case
        both failure reasons are printed.
    """
    try:
        return pd.read_parquet(f"{stem}.parquet")
    except Exception as parquet_error:
        # Best effort: any Parquet failure (missing file, missing engine,
        # unreadable data) triggers the CSV attempt.
        try:
            table = pd.read_csv(f"{stem}.csv")
        except Exception as csv_error:
            print(f"ERROR loading {label} (tried Parquet and CSV): {csv_error}")
            # Also report the first failure; otherwise a broken Parquet file is
            # hidden behind a "CSV not found" message.
            print(f"  Parquet attempt failed first with: {parquet_error}")
            return None

    for column in datetime_columns:
        if column in table.columns:
            table[column] = pd.to_datetime(table[column], errors='coerce')
    return table


model_lgb = None
X_test = None
final_df = None

# --- Load the Model ---
# NOTE(security): joblib files are pickle-based, so loading one can execute
# arbitrary code. Only load model files that come from a trusted source.
try:
    model_lgb = joblib.load('lightgbm_model.joblib')
except Exception as e:
    print(f"ERROR loading model_lgb: {e}")

# --- Load X_test ---
# NOTE(review): the CSV fallback does not preserve dtypes; confirm the model
# accepts CSV-loaded features before relying on that path.
X_test = load_table_with_csv_fallback('X_test', 'X_test_model_features')

# --- Load final_df ---
final_df = load_table_with_csv_fallback(
    'final_df', 'final_df_for_analysis', datetime_columns=('DateOfReport',)
)

# --- Check if all loaded successfully before proceeding ---
if model_lgb is None or X_test is None or final_df is None:
    print("One or more essential components failed to load. Cannot proceed with analysis.")
elif len(X_test) != len(final_df):
    # Predictions are later attached to final_df by position, so the two
    # tables must describe the same rows.
    print(f"WARNING: X_test has {len(X_test)} rows but final_df has {len(final_df)} rows; "
          "predictions cannot be attached row-by-row.")

In [7]:
# --- Step 1: Configuration for Stratification ---
# Column of the predict_proba output whose probabilities are stratified.
probability_class_idx_for_strata = 0

# Six quantile edges delimit five strata; labels are assigned in ascending
# order of probability, so 'A' is the lowest 5% and 'E' the highest 5%.
strata_config = dict(
    labels=list('ABCDE'),
    quantiles=[0, 0.05, 0.25, 0.75, 0.95, 1.0],
)

# Output column names carry the class index so they stay in sync with it.
proba_col_name = f'probability_class{probability_class_idx_for_strata}'
stratum_col_name = f'stratum_class{probability_class_idx_for_strata}' # This is THE key variable

# --- Step 2: Function to Assign Strata ---
def assign_strata_from_probabilities(
    trained_model, features_for_prediction, df_to_add_strata_to,
    prob_class_idx, strata_definition, new_proba_col_name, new_stratum_col_name
):
    """Score rows with the model and bucket one class probability into quantile strata.

    Parameters
    ----------
    trained_model : object
        Anything exposing ``predict_proba(features)`` that returns a 2-D array
        with one column per class.
    features_for_prediction : pandas.DataFrame
        Feature rows passed to ``predict_proba``.
    df_to_add_strata_to : pandas.DataFrame
        Frame that receives the new columns. It is copied, never modified.
        Probabilities are attached by position, so its rows must be in the
        same order as ``features_for_prediction``.
    prob_class_idx : int
        Column of the ``predict_proba`` output to stratify.
    strata_definition : dict
        ``{'quantiles': [...], 'labels': [...]}`` forwarded to ``pandas.qcut``.
    new_proba_col_name, new_stratum_col_name : str
        Names of the probability and stratum columns to create.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_to_add_strata_to`` with both columns added. If the
        quantile binning fails (for example, duplicate quantile edges leave
        fewer bins than labels), a warning is printed and the stratum column
        is filled with NaN.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows in
        ``df_to_add_strata_to``.
    """
    all_class_probs = trained_model.predict_proba(features_for_prediction)
    selected_probs = all_class_probs[:, prob_class_idx]

    if len(selected_probs) != len(df_to_add_strata_to):
        raise ValueError(
            f"Got {len(selected_probs)} predictions for "
            f"{len(df_to_add_strata_to)} rows; features and target frame must align row-by-row."
        )

    output_df = df_to_add_strata_to.copy()
    output_df[new_proba_col_name] = selected_probs
    try:
        output_df[new_stratum_col_name] = pd.qcut(
            output_df[new_proba_col_name], q=strata_definition['quantiles'],
            labels=strata_definition['labels'], duplicates='drop'
        )
    except Exception as exc:
        # Best-effort fallback: keep the probabilities, report why binning
        # failed, and mark every stratum as missing. float("nan") avoids
        # depending on a numpy import that this cell never makes.
        print(f"WARNING: could not assign strata for '{new_stratum_col_name}': {exc}")
        output_df[new_stratum_col_name] = float("nan")
    return output_df

# --- Step 3: Create the Stratified DataFrame ---
stratified_df = None # Initialize

# The loading cell pre-initialises model_lgb / X_test / final_df to None, so a
# name that merely exists may still hold a failed load. Require real objects.
if ('model_lgb' in locals() and model_lgb is not None and
    'X_test' in locals() and isinstance(X_test, pd.DataFrame) and
    'final_df' in locals() and isinstance(final_df, pd.DataFrame)):

    # Ensure DateOfReport is datetime in final_df if it's going to be used for filtering.
    # is_datetime64_any_dtype also recognises tz-aware and non-nanosecond datetime
    # columns, so those are not re-parsed needlessly.
    if ('DateOfReport' in final_df.columns and
            not pd.api.types.is_datetime64_any_dtype(final_df['DateOfReport'])):
        final_df['DateOfReport'] = pd.to_datetime(final_df['DateOfReport'])
    if 'SupplierName' not in final_df.columns: # Basic check
        print("Warning: 'SupplierName' not found in final_df. Supplier filtering will not work.")

    # Returns a copy of final_df with the probability and stratum columns added.
    stratified_df = assign_strata_from_probabilities(
        model_lgb, X_test, final_df, probability_class_idx_for_strata,
        strata_config, proba_col_name, stratum_col_name
    )
else:
    print("Initial setup skipped: Required variables (model_lgb, X_test, final_df) not found or not of correct type.")

In [8]:
def print_stratum_distribution(df, column_name, config_labels, description="Data"):
    """Print the count and percentage of rows in each stratum of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Data to summarise. ``None`` or an empty frame prints a "No data"
        notice instead.
    column_name : str
        Column holding the stratum label of each row.
    config_labels : iterable
        Labels to report, in display order. Labels absent from the data are
        shown with a count of 0.
    description : str, optional
        Text identifying the subset in the printed header.

    Notes
    -----
    Percentages are relative to rows with a non-missing stratum, whereas the
    total in the header counts every row of ``df``.
    """
    if df is None or df.empty:
        print(f"No data for: {description}")
        return

    column_is_usable = column_name in df.columns and not df[column_name].isnull().all()
    if not column_is_usable:
        print(f"Stratum column '{column_name}' missing or all NaNs for: {description}")
        return

    strata = df[column_name]
    print(f"\n--- Stratum Distribution for: {description} (Total: {df.shape[0]}) ---")
    rows_per_label = strata.value_counts().sort_index()
    share_per_label = strata.value_counts(normalize=True).sort_index() * 100
    for label in config_labels:
        n_rows = rows_per_label.get(label, 0)
        pct = share_per_label.get(label, 0.0)
        print(f"  {label}: {n_rows} ({pct:.2f}%)")

# Manual definition of the filters applied to the stratified data
# Example: (Replace with your actual values)
filter_supplier_name = 'SupplierH'
filter_start_date = pd.to_datetime('2023-10-01')
filter_end_date = pd.to_datetime('2023-10-31')
# --- END OF MANUAL DEFINITION ---

if 'stratified_df' in locals() and stratified_df is not None:
    # A filter applies only when its value is set and its column exists.
    can_filter_supplier = bool(filter_supplier_name) and 'SupplierName' in stratified_df.columns
    can_filter_dates = (bool(filter_start_date and filter_end_date)
                        and 'DateOfReport' in stratified_df.columns)

    if can_filter_supplier:
        supplier_mask = stratified_df['SupplierName'] == filter_supplier_name

    if can_filter_dates:
        # Coerce once, up front, instead of before every filter.
        if not pd.api.types.is_datetime64_any_dtype(stratified_df['DateOfReport']):
            stratified_df['DateOfReport'] = pd.to_datetime(stratified_df['DateOfReport'])
        report_dates = stratified_df['DateOfReport']

        # A date-only end bound means "through the end of that day". Comparing
        # with <= midnight would drop rows stamped later on the end date.
        if filter_end_date == filter_end_date.normalize():
            within_end = report_dates < filter_end_date + pd.Timedelta(days=1)
        else:
            within_end = report_dates <= filter_end_date
        date_mask = (report_dates >= filter_start_date) & within_end
        date_range_text = f"{filter_start_date.strftime('%Y-%m-%d')} to {filter_end_date.strftime('%Y-%m-%d')}"

    # 1. Filter by Supplier
    if can_filter_supplier:
        supplier_filtered_df = stratified_df[supplier_mask]
        print_stratum_distribution(supplier_filtered_df, stratum_col_name, strata_config['labels'],
                                   description=f"Supplier: {filter_supplier_name}")

    # 2. Filter by Date Range
    if can_filter_dates:
        date_filtered_df = stratified_df[date_mask]
        print_stratum_distribution(date_filtered_df, stratum_col_name, strata_config['labels'],
                                   description=f"Date: {date_range_text}")

    # 3. Filter by BOTH Supplier AND Date Range
    if can_filter_supplier and can_filter_dates:
        both_filtered_df = stratified_df[supplier_mask & date_mask]
        print_stratum_distribution(both_filtered_df, stratum_col_name, strata_config['labels'],
                                   description=f"Supplier: {filter_supplier_name} AND Date: {date_range_text}")
else:
    print("`stratified_df` not found or is None. Cannot perform filtering.")


--- Stratum Distribution for: Supplier: SupplierH (Total: 4390) ---
  A: 0 (0.00%)
  B: 0 (0.00%)
  C: 249 (5.67%)
  D: 2285 (52.05%)
  E: 1856 (42.28%)

--- Stratum Distribution for: Date: 2023-10-01 to 2023-10-31 (Total: 7840) ---
  A: 144 (1.84%)
  B: 1662 (21.20%)
  C: 3952 (50.41%)
  D: 1629 (20.78%)
  E: 453 (5.78%)

--- Stratum Distribution for: Supplier: SupplierH AND Date: 2023-10-01 to 2023-10-31 (Total: 269) ---
  A: 0 (0.00%)
  B: 0 (0.00%)
  C: 3 (1.12%)
  D: 130 (48.33%)
  E: 136 (50.56%)
