In [2]:
# Import necessary libraries
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add the path where telecom_analysis.py is located
sys.path.append(os.path.abspath('../scripts'))

# Import functions from telecom_analysis.py
# Import the functions from feature_engineering.py
from eda import load_data
from feature_engineering import create_aggregate_features, extract_temporal_features, one_hot_encode
from feature_engineering import label_encode, handle_missing_values, scale_numerical_features



In [2]:
file_path = '../data/data.csv'
df = load_data(file_path)

In [None]:
df.head(3)

In [4]:
# Apply aggregate features
df = create_aggregate_features(df)

In [None]:
df

In [5]:
# Extract temporal features
df = extract_temporal_features(df)

In [None]:
df

In [None]:
# Handle missing values
df = handle_missing_values(df, strategy='median')





In [7]:
# Apply normalization or standardization
df = scale_numerical_features(df, columns=['Amount', 'TransactionCount'], method='normalize')


In [None]:
# Ensure that all references to columns are in lowercase
df.columns = df.columns.str.lower()
print(df.columns)

######TTT

In [3]:
import scorecardpy as sc
from monotonic_binning.monotonic_woe_binning import Binning

In [4]:
file_path = '../data/data.csv'
df = load_data(file_path)

In [5]:
df.describe()


Unnamed: 0,CountryCode,Amount,Value,PricingStrategy,FraudResult
count,95662.0,95662.0,95662.0,95662.0,95662.0
mean,256.0,6717.846,9900.584,2.255974,0.002018
std,0.0,123306.8,123122.1,0.732924,0.044872
min,256.0,-1000000.0,2.0,0.0,0.0
25%,256.0,-50.0,275.0,2.0,0.0
50%,256.0,1000.0,1000.0,2.0,0.0
75%,256.0,2800.0,5000.0,2.0,0.0
max,256.0,9880000.0,9880000.0,4.0,1.0


In [12]:
df.head(3)

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0


In [6]:
# Step 1: Binning for numeric columns
numeric_cols = ['Amount', 'Value']
target_col = 'FraudResult'  # Target variable

# Perform monotonic binning for numeric variables
bins = sc.woebin(df, y=target_col, x=numeric_cols, method="chimerge")

[INFO] creating woe binning ...


In [10]:
import numpy as np
import pandas as pd

# Perform monotonic binning for numeric variables
bins = sc.woebin(df, y=target_col, x=numeric_cols, method="chimerge")

# Step 2: Fix the binning output to remove list-wrapped or array-like variable names
for var in bins:
    # Ensure all variable names are flattened and converted to string types
    bins[var]['variable'] = bins[var]['variable'].apply(
        lambda x: str(x[0]) if isinstance(x, (list, tuple, np.ndarray)) else str(x)
    )

# Step 3: Apply the binning to the dataset to get WoE values
df_woe = sc.woebin_ply(df, bins)

# Step 4: Display the dataset with WoE transformed values
print("\nData with WoE transformed values:")
print(df_woe)


[INFO] creating woe binning ...
[INFO] converting into woe values ...

Data with WoE transformed values:
              TransactionId  FraudResult    ProviderId CurrencyCode     ProductCategory         BatchId   Amount  TransactionStartTime       CustomerId       AccountId     ProductId  PricingStrategy  CountryCode       SubscriptionId    ChannelId  Value_woe
0       TransactionId_76871            0  ProviderId_6          UGX             airtime   BatchId_36123   1000.0  2018-11-15T02:18:49Z  CustomerId_4406  AccountId_3957  ProductId_10                2          256   SubscriptionId_887  ChannelId_3  -3.986997
1       TransactionId_73770            0  ProviderId_4          UGX  financial_services   BatchId_15642    -20.0  2018-11-15T02:19:08Z  CustomerId_4406  AccountId_4841   ProductId_6                2          256  SubscriptionId_3829  ChannelId_2  -3.986997
2       TransactionId_26203            0  ProviderId_6          UGX             airtime   BatchId_53941    500.0  2018-11-15

In [15]:
df.describe()

Unnamed: 0,CountryCode,Amount,Value,PricingStrategy,FraudResult
count,95662.0,95662.0,95662.0,95662.0,95662.0
mean,256.0,6717.846,9900.584,2.255974,0.002018
std,0.0,123306.8,123122.1,0.732924,0.044872
min,256.0,-1000000.0,2.0,0.0,0.0
25%,256.0,-50.0,275.0,2.0,0.0
50%,256.0,1000.0,1000.0,2.0,0.0
75%,256.0,2800.0,5000.0,2.0,0.0
max,256.0,9880000.0,9880000.0,4.0,1.0


In [22]:
import pandas as pd
import scorecardpy as sc

def woe_num(x, y, df):
    """
    Function to perform WoE binning on numeric variables.
    
    Parameters:
    - x: list of numerical column names.
    - y: target variable (string).
    - df: DataFrame containing the data.
    
    Returns:
    - breaks: A dictionary with breaks (bins) for each numerical variable.
    """
    global breaks
    breaks = {}
    
    for i in x:
        # Perform WoE binning using scorecardpy
        bin_result = sc.woebin(df, y=y, x=[i], method='chimerge')
        
        # Extract the break points from the binning result for the current variable
        breaks[i] = bin_result[i]['breaks'].tolist()

    return breaks


# Example usage

# Define the numerical variables and target
numeric_cols = ['Amount', 'Value', 'PricingStrategy',]
target_col = 'FraudResult'  # Use your actual target variable

# Call the function
breaks = woe_num(numeric_cols, target_col, df)

# Print the breaks for each numerical variable
print(breaks)


[INFO] creating woe binning ...
[INFO] creating woe binning ...
[INFO] creating woe binning ...
{'Amount': ['1500.0', 'inf'], 'Value': ['1500.0', 'inf'], 'PricingStrategy': ['4.0', 'inf']}


In [24]:
import pandas as pd
import scorecardpy as sc

def woe_cat(categorical_cols, target_col, df):
    """
    Function to perform WoE binning on categorical variables.
    
    Parameters:
    - categorical_cols: list of categorical column names.
    - target_col: target variable (string).
    - df: DataFrame containing the data.
    
    Returns:
    - woe_results: A dictionary with WoE values for each categorical variable.
    """
    woe_results = {}  # Dictionary to store WoE values for each variable
    
    for col in categorical_cols:
        # Perform WoE binning using scorecardpy
        bin_result = sc.woebin(df, y=target_col, x=[col])
        
        # Debugging: Print the structure of the bin_result to identify available columns
        print(f"Results for {col}:")
        print(bin_result[col])  # Print the DataFrame for the current variable
        
        # Extract the WoE values from the binning result for the current variable
        # Adjusting based on the observed columns in the bin_result
        if 'woe' in bin_result[col].columns:
            woe_results[col] = bin_result[col][['level', 'woe']].set_index('level').to_dict()['woe']
        else:
            print(f"Warning: 'woe' column not found for {col}. Available columns: {bin_result[col].columns.tolist()}")

    return woe_results


# Define the categorical variables and target
categorical_columns = ['ProductCategory', 'ChannelId', 'ProviderId']  # Add your actual categorical variables here
target_column = 'FraudResult'  # Use your actual target variable

# Assuming 'df' is already defined and contains the necessary data
# Call the function
woe_values = woe_cat(categorical_columns, target_column, df)

# Print the WoE values for each categorical variable
print("WoE values for each categorical variable:")
for col, woe_dict in woe_values.items():
    print(f"{col}: {woe_dict}")


[INFO] creating woe binning ...
Results for ProductCategory:
          variable                                                bin  count  count_distr   good  bad   badprob       woe    bin_iv  total_iv                                             breaks  is_special_values
0  ProductCategory  data_bundles%,%movies%,%ticket%,%other%,%tv%,%...  48312     0.505028  48294   18  0.000373 -1.690824  0.697628  0.948087  data_bundles%,%movies%,%ticket%,%other%,%tv%,%...              False
1  ProductCategory      financial_services%,%utility_bill%,%transport  47350     0.494972  47175  175  0.003696  0.607033  0.250460  0.948087      financial_services%,%utility_bill%,%transport              False


KeyError: "['level'] not in index"

In [30]:
# Import necessary libraries
import pandas as pd
import scorecardpy as sc

# Define the function for WoE binning on categorical variables
def woe_cat(categorical_cols, target_col, df):
    """
    Function to perform WoE binning on categorical variables.
    
    Parameters:
    - categorical_cols: list of categorical column names.
    - target_col: target variable (string).
    - df: DataFrame containing the data.
    
    Returns:
    - woe_results: A dictionary with WoE values for each categorical variable.
    """
    woe_results = {}  # Dictionary to store WoE values for each variable
    
    for col in categorical_cols:
        # Perform WoE binning using scorecardpy
        bin_result = sc.woebin(df, y=target_col, x=[col])
        
        # Check if bin_result is a dictionary and extract the appropriate DataFrame
        if col in bin_result:
            bin_df = bin_result[col]
            print(f"Results for {col}:")
            print(bin_df)  # Print the entire binning result
            
            # Extract WoE values; check for expected column names
            if 'woe' in bin_df.columns and 'bin' in bin_df.columns:
                # Create a dictionary with bins as keys and WoE values as values
                woe_results[col] = bin_df[['bin', 'woe']].set_index('bin').to_dict()['woe']
            else:
                print(f"Warning: Expected 'woe' and 'bin' columns not found for {col}. Available columns: {bin_df.columns.tolist()}")
        else:
            print(f"Error: {col} not found in bin_result. Output: {bin_result}")

    return woe_results


# Define the categorical variables and target
categorical_columns = ['ProductCategory', 'ChannelId', 'ProviderId']  # Add your actual categorical variables here
target_column = 'FraudResult'  # Use your actual target variable

# Call the function
woe_values = woe_cat(categorical_columns, target_column, df)

# Print the WoE values for each categorical variable
print("WoE values for each categorical variable:")
for col, woe_dict in woe_values.items():
    print(f"{col}: {woe_dict}")


[INFO] creating woe binning ...
Results for ProductCategory:
          variable                                                bin  count  count_distr   good  bad   badprob       woe    bin_iv  total_iv                                             breaks  is_special_values
0  ProductCategory  data_bundles%,%movies%,%ticket%,%other%,%tv%,%...  48312     0.505028  48294   18  0.000373 -1.690824  0.697628  0.948087  data_bundles%,%movies%,%ticket%,%other%,%tv%,%...              False
1  ProductCategory      financial_services%,%utility_bill%,%transport  47350     0.494972  47175  175  0.003696  0.607033  0.250460  0.948087      financial_services%,%utility_bill%,%transport              False
[INFO] creating woe binning ...
Results for ChannelId:
    variable                        bin  count  count_distr   good  bad   badprob       woe    bin_iv  total_iv                     breaks  is_special_values
0  ChannelId  ChannelId_5%,%ChannelId_2  38189     0.399208  38184    5  0.000131 -2.73686

In [None]:
# Import necessary libraries
import pandas as pd
import scorecardpy as sc

# Define the function for WoE binning on numerical variables
def woe_numerical(numerical_cols, target_col, df):
    """
    Function to perform WoE binning on numerical variables.
    
    Parameters:
    - numerical_cols: list of numerical column names.
    - target_col: target variable (string).
    - df: DataFrame containing the data.
    
    Returns:
    - woe_results: A dictionary with WoE values for each numerical variable.
    """
    woe_results = {}  # Dictionary to store WoE values for each variable
    
    for col in numerical_cols:
        # Perform WoE binning using scorecardpy
        bin_result = sc.woebin(df, y=target_col, x=[col])
        
        # Check if bin_result contains the expected structure
        if isinstance(bin_result, dict) and col in bin_result:
            # Extract WoE values
            woe_results[col] = bin_result[col][['bin', 'woe']].set_index('bin').to_dict()['woe']
        else:
            print(f"Error: Unexpected output structure for {col}. Output: {bin_result}")
    
    return woe_results

# Define the function for WoE binning on categorical variables
def woe_cat(categorical_cols, target_col, df):
    """
    Function to perform WoE binning on categorical variables.
    
    Parameters:
    - categorical_cols: list of categorical column names.
    - target_col: target variable (string).
    - df: DataFrame containing the data.
    
    Returns:
    - woe_results: A dictionary with WoE values for each categorical variable.
    """
    woe_results = {}  # Dictionary to store WoE values for each variable
    
    for col in categorical_cols:
        # Perform WoE binning using scorecardpy
        bin_result = sc.woebin(df, y=target_col, x=[col])
        
        # Check if bin_result is a dictionary and extract the appropriate DataFrame
        if col in bin_result:
            bin_df = bin_result[col]
            print(f"Results for {col}:")
            print(bin_df)  # Print the entire binning result
            
            # Extract WoE values; check for expected column names
            if 'woe' in bin_df.columns and 'bin' in bin_df.columns:
                # Create a dictionary with bins as keys and WoE values as values
                woe_results[col] = bin_df[['bin', 'woe']].set_index('bin').to_dict()['woe']
            else:
                print(f"Warning: Expected 'woe' and 'bin' columns not found for {col}. Available columns: {bin_df.columns.tolist()}")
        else:
            print(f"Error: {col} not found in bin_result. Output: {bin_result}")

    return woe_results

# Specify your numerical columns and target variable
numerical_columns = ['Amount', 'Value', 'PricingStrategy']  # Add your actual numerical variables here
target_column = 'FraudResult'  # Use your actual target variable

# Assuming 'df' is already defined and contains the necessary data
# Call the function for numerical variables
woe_values_numerical = woe_numerical(numerical_columns, target_column, df)

# Define the categorical variables
categorical_columns = ['ProductCategory', 'ChannelId', 'ProviderId']  # Add your actual categorical variables here

# Call the function for categorical variables
woe_values_categorical = woe_cat(categorical_columns, target_column, df)

# Merging the results
def merge_woe_results(woe_categorical, woe_numerical):
    """
    Function to merge WoE results from categorical and numerical variables.

    Parameters:
    - woe_categorical: Dictionary of WoE values for categorical variables.
    - woe_numerical: Dictionary of WoE values for numerical variables.

    Returns:
    - combined_woe: A DataFrame containing all WoE values.
    """
    # Create empty DataFrame for combined results
    combined_woe = pd.DataFrame()

    # Convert categorical results to DataFrame
    for col, woe_dict in woe_categorical.items():
        temp_df = pd.DataFrame(list(woe_dict.items()), columns=[col, 'woe'])
        temp_df['variable_type'] = 'categorical'
        combined_woe = pd.concat([combined_woe, temp_df], ignore_index=True)

    # Convert numerical results to DataFrame
    for col, woe_dict in woe_numerical.items():
        temp_df = pd.DataFrame(list(woe_dict.items()), columns=['bin', 'woe'])
        temp_df['variable_type'] = 'numerical'
        temp_df['variable'] = col
        combined_woe = pd.concat([combined_woe, temp_df], ignore_index=True)

    return combined_woe

# Merge the WoE results
combined_woe_results = merge_woe_results(woe_values_categorical, woe_values_numerical)

# Print the combined WoE results
print("Combined WoE results:")
print(combined_woe_results)


In [31]:
# Import necessary libraries
import pandas as pd
import scorecardpy as sc

# Define the function for WoE binning on numerical variables
def woe_numerical(numerical_cols, target_col, df):
    """
    Function to perform WoE binning on numerical variables.
    
    Parameters:
    - numerical_cols: list of numerical column names.
    - target_col: target variable (string).
    - df: DataFrame containing the data.
    
    Returns:
    - woe_results: A dictionary with WoE values for each numerical variable.
    """
    woe_results = {}  # Dictionary to store WoE values for each variable
    
    for col in numerical_cols:
        # Perform WoE binning using scorecardpy
        bin_result = sc.woebin(df, y=target_col, x=[col])
        
        # Check if bin_result contains the expected structure
        if isinstance(bin_result, dict) and col in bin_result:
            # Extract WoE values
            woe_results[col] = bin_result[col][['bin', 'woe']].set_index('bin').to_dict()['woe']
        else:
            print(f"Error: Unexpected output structure for {col}. Output: {bin_result}")
    
    return woe_results

# Specify your numerical columns and target variable
numerical_columns = ['Amount', 'Value', 'PricingStrategy']  # Add your actual numerical variables here
target_column = 'FraudResult'  # Use your actual target variable

# Assuming 'df' is already defined and contains the necessary data
# Call the function
woe_values_numerical = woe_numerical(numerical_columns, target_column, df)

# Print the WoE values for each numerical variable
print("WoE values for each numerical variable:")
for col, woe_dict in woe_values_numerical.items():
    print(f"{col}: {woe_dict}")


[INFO] creating woe binning ...
[INFO] creating woe binning ...
[INFO] creating woe binning ...
WoE values for each numerical variable:
Amount: {'[-inf,0.0)': -2.7368672687139024, '[0.0,1500.0)': -3.932397875280846, '[1500.0,inf)': 1.0601412646345112}
Value: {'[-inf,1000.0)': -4.183280830691619, '[1000.0,1500.0)': -3.74253667597523, '[1500.0,inf)': 0.8071026936952379}
PricingStrategy: {'[-inf,4.0)': 0.0943993387017186, '[4.0,inf)': -0.9124536739866135}


########################function`

In [None]:
import pandas as pd
import scorecardpy as sc

def woe_num(x, y, df, n_threshold=50, y_threshold=10, p_threshold=0.35):
    """
    Function to perform WoE binning on numeric variables.
    
    Parameters:
    - x: list of numerical column names.
    - y: target variable (string).
    - df: DataFrame containing the data.
    - n_threshold: minimum number of unique values in the variable for binning.
    - y_threshold: minimum count of observations in each bin for statistical significance.
    - p_threshold: acceptable proportion of good versus bad loans within the bins.
    
    Returns:
    - breaks: A dictionary with breaks (bins) for each numerical variable.
    """
    global breaks
    breaks = {}
    
    for i in x:
        # Check if the number of unique values meets the n_threshold
        if df[i].nunique() < n_threshold:
            print(f"Skipping {i} due to insufficient unique values ({df[i].nunique()} found).")
            continue
        
        # Perform WoE binning with specified method
        bin_result = sc.woebin(df, y=y, x=[i], method='chimerge', 
                                n_bins=10, 
                                min_bin_size=y_threshold, 
                                p_bin_threshold=p_threshold)
        
        # Extract the break points from the binning result for the current variable
        breaks[i] = bin_result[i]['breaks'].tolist()

    return breaks


# Ensuring that the numerical columns are of the correct type
numeric_cols = ['Amount', 'Value', 'PricingStrategy']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

# Define the target variable
target_col = 'FraudResult' 

# Call the function to get WoE breaks for numeric variables
breaks = woe_num(numeric_cols, target_col, df)

# Print the breaks for each numerical variable
print(breaks)


In [27]:
import pandas as pd
import scorecardpy as sc

def woe_categorical(x, y, df):
    """
    Function to perform WoE binning on categorical variables.
    
    Parameters:
    - x: list of categorical column names.
    - y: target variable (string).
    - df: DataFrame containing the data.
    - n_threshold: minimum number of unique values in the variable for binning.
    - y_threshold: minimum count of observations in each bin for statistical significance.
    
    Returns:
    - breaks: A dictionary with WoE for each categorical variable.
    """
    global categorical_woe
    categorical_woe = {}
    
    for i in x:
        # Check if the number of unique values meets the n_threshold
        # Perform WoE binning for categorical variables
        bin_result = sc.woebin(df, y=y, x=[i], method='chimerge', min_bin_size=y_threshold)
        
        # Extract WoE for each category
        categorical_woe[i] = bin_result[i][['category', 'woe']].set_index('category').to_dict()['woe']

    return categorical_woe


# Define the target variable
target_col = 'FraudResult' 


# Call the function to get WoE for categorical variables
categorical_cols = ['CurrencyCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId']
categorical_woes = woe_categorical(categorical_cols, target_col, df)


# Print the WoE for each categorical variable
print("\nCategorical WoE:")
print(categorical_woes)


NameError: name 'y_threshold' is not defined