In [179]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
from sklearn.preprocessing import StandardScaler


In [180]:
def fetch_data(url):
    response = requests.get(url)
    data = StringIO(response.text)
    df = pd.read_csv(data, sep=";")
    return df

def map_columns(df, columns, mapping):
    for column in columns:
        if column in df.columns:
            df[column] = df[column].map(mapping)
    return df

def one_hot_encode_columns(df, columns):
    # Check if the specified columns exist in the dataframe
    missing_cols = [col for col in columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"The following columns are not in the DataFrame: {', '.join(missing_cols)}")
    
    # Apply one-hot encoding to the specified columns
    df_encoded = pd.get_dummies(df, columns=columns, drop_first=False)

    df_encoded = df_encoded.map(lambda x: 1 if x is True else (0 if x is False else x))
    
    return df_encoded

def encode_cyclic_information(df, column_name, mapping):
    # Ensure the column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")
    
    # Map the days to numbers
    df[column_name] = df[column_name].map(mapping)
    
    # Create sine and cosine features for cyclic encoding
    df[column_name + '_sin'] = np.sin(2 * np.pi * df[column_name] / 7)
    df[column_name + '_cos'] = np.cos(2 * np.pi * df[column_name] / 7)
    
    # Drop the original day column if you no longer need it
    df = df.drop(columns=[column_name])
    
    return df

def mask_column(df, column, mask_value):
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
    
    mask_column_name = f"{column}_unknown"
    
    # Use pd.isna to handle NaN or None
    if pd.isna(mask_value):
        df[mask_column_name] = df[column].apply(lambda x: 1 if pd.isna(x) else 0)
    else:
        df[mask_column_name] = df[column].apply(lambda x: 1 if x == mask_value else 0)
    
    return df

def replace_value_mode(df, column, mask_value):
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    # Create the mask column
    if pd.isna(mask_value):
        mask = df[column].isna()
    else:
        mask = df[column] == mask_value

    # Compute the mode of the column, excluding NaN
    column_mode = df.loc[~mask, column].mode()

    if column_mode.empty:
        raise ValueError(f"Cannot compute mode for column '{column}' as it has no valid values.")

    # Replace masked values with the mode
    df[column] = df[column].where(~mask, column_mode[0])

    return df

def standardize_columns(data, columns):
    scaler = StandardScaler()
    for column in columns : 
        data[column] = scaler.fit_transform(data[[column]])
    
    return data

def replace_unknown_NaN(data):
    pass

def drop_specific_data(data):
    #Drop the default column (20% is unknown, there are 3/41188 yes)
    #Drop the Pdays column (96.3% unknown)
    #Drop duration, as its for benchmarking not modeling

    data = data.drop(columns="default")
    data = data.drop(columns="pdays")
    data = data.drop(columns="duration")

    #Drop the 5 unknown data entries for now
    data = data.drop(columns="nr.employed")
    data = data.drop(columns="euribor3m")
    data = data.drop(columns="cons.conf.idx")
    data = data.drop(columns="emp.var.rate")
    data = data.drop(columns="cons.price.idx")
    
    return data
    


In [181]:
def get_unknown_percentage(df):
    unknown_percentages = {}

    # Loop over each column in the dataframe
    for column in df.columns:
        # Count the occurrences of 'unknown' in the column
        unknown_count = (df[column] == 'unknown').sum()
        
        if(column == "pdays") :
            unknown_count +=(df[column] == 999).sum()
        
        unknown_count +=(df[column].isna()).sum()
        
        # Calculate the percentage of 'unknown' values
        total_count = len(df[column])
        unknown_percentage = (unknown_count / total_count) * 100
        
        # Store the percentage in the dictionary
        unknown_percentages[column] = unknown_percentage
        print(f"{column} unknowns: {unknown_percentage}%")

    return unknown_percentages

def get_unknowns_per_row(row):
    return (row == 'unknown').sum()

def count_values(column_name, dataframe):
    if column_name not in dataframe.columns:
        print(f"Error: Column '{column_name}' not found in the DataFrame.")
        return None
    counts = dataframe[column_name].value_counts()
    return counts



In [182]:
url = "https://raw.githubusercontent.com/tuclaure/CSU-CS-345/refs/heads/main/Data/bank-additional/bank-additional/bank-additional-full.csv"
data = fetch_data(url)
print(get_unknown_percentage(data))
print()

for column in data :
    print(count_values(column, data))

age unknowns: 0.0%
job unknowns: 0.8012042342429834%
marital unknowns: 0.1942313295134505%
education unknowns: 4.202680392347285%
default unknowns: 20.87258424783918%
housing unknowns: 2.40361270272895%
loan unknowns: 2.40361270272895%
contact unknowns: 0.0%
month unknowns: 0.0%
day_of_week unknowns: 0.0%
duration unknowns: 0.0%
campaign unknowns: 0.0%
pdays unknowns: 96.32174419733903%
previous unknowns: 0.0%
poutcome unknowns: 0.0%
emp.var.rate unknowns: 0.0%
cons.price.idx unknowns: 0.0%
cons.conf.idx unknowns: 0.0%
euribor3m unknowns: 0.0%
nr.employed unknowns: 0.0%
y unknowns: 0.0%
{'age': 0.0, 'job': 0.8012042342429834, 'marital': 0.1942313295134505, 'education': 4.202680392347285, 'default': 20.87258424783918, 'housing': 2.40361270272895, 'loan': 2.40361270272895, 'contact': 0.0, 'month': 0.0, 'day_of_week': 0.0, 'duration': 0.0, 'campaign': 0.0, 'pdays': 96.32174419733903, 'previous': 0.0, 'poutcome': 0.0, 'emp.var.rate': 0.0, 'cons.price.idx': 0.0, 'cons.conf.idx': 0.0, 'eurib

In [None]:
binary_mapping = {
    'yes':1,
    'no':0,
    }

education_mapping = {
    'basic.4y':0,
    'basic.6y':1,
    'basic.9y':2,
    'high.school':3,
    'illiterate':4,
    'professional.course':5,
    'university.degree':6
    }

days_mapping = {
        'sun': 0,
        'mon': 1,
        'tue': 2,
        'wed': 3,
        'thu': 4,
        'fri': 5,
        'sat': 6
    }

months_mapping = {
        'jan': 0,
        'feb': 1,
        'mar': 2,
        'apr': 3,
        'may': 4,
        'jun': 5,
        'jul': 6,
        'aug': 7,
        'sep': 8,
        'oct': 9,
        'nov': 10,
        'dec': 11
    }

binary_columns = ["housing", "loan", "y"]
categorical_columns = ["job","marital","contact","poutcome"]
integer_columns = []

def preprocessing(mask_nans = True):
    data = fetch_data(url)
    
    #Remove Duplicate rows
    data = data[~data.duplicated()]

    #Drop necissary features
    data = drop_specific_data(data)

    #Map Binary values to 0,1,NaN
    data = map_columns(data, binary_columns, binary_mapping)

    #Map education to numeric quantities
    data = map_columns(data, ["education"], education_mapping)

    #One hot encode categorical data
    data = one_hot_encode_columns(data, categorical_columns)

    #Perform Cyclical mapping for time data
    data = encode_cyclic_information(data, "day_of_week", days_mapping)
    data = encode_cyclic_information(data, "month", months_mapping)
    
    if mask_nans :
        #create a mask for the pdays, housing, loan columns
        data = mask_column(data, "housing", None)
        data = mask_column(data, "loan", None)

        #Imput Housing and Loan to the mode
        data = replace_value_mode(data, "housing", None)
        data = replace_value_mode(data, "loan", None)
    return data


data = preprocessing()

for column in data :
    print(count_values(column, data))
        
print()
print(get_unknown_percentage(data))
print()
    
#Education Unknowns need to be handled
#Need to decide what should be standardized

#Currently WIP
#Create a different preprocessing type to be used with Models that can handle Unknowns, where NaN replaces unknown
#IE Random forests with surrogate splits etc
#Currently setup so mask_nans can be = False and it will return a dataset with binary values that have NaN

age
31    1947
32    1845
33    1833
36    1779
35    1758
      ... 
89       2
91       2
94       1
87       1
95       1
Name: count, Length: 78, dtype: int64
education
6.0    12164
3.0     9512
2.0     6045
5.0     5240
0.0     4176
1.0     2291
4.0       18
Name: count, dtype: int64
housing
1.0    22561
0.0    18615
Name: count, dtype: int64
loan
0.0    34928
1.0     6248
Name: count, dtype: int64
campaign
1     17634
2     10568
3      5340
4      2650
5      1599
6       979
7       629
8       400
9       283
10      225
11      177
12      125
13       92
14       69
17       58
16       51
15       51
18       33
20       30
19       26
21       24
22       17
23       16
24       15
27       11
29       10
28        8
26        8
25        8
31        7
30        7
35        5
32        4
33        4
34        3
42        2
40        2
43        2
56        1
39        1
41        1
37        1
Name: count, dtype: int64
previous
0    35551
1     4561
2      754
3      216
4

In [184]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

def split_data(df):
    y = df['y']
    X = df.drop(columns=['y'])
    return X, y

preprocessed_data = basic_preprocessing()
X, y = split_data(data)

over_strength = 0.2
under_strength = 0.5
over = SMOTE(sampling_strategy=over_strength, random_state=42)
under = RandomUnderSampler(sampling_strategy=under_strength, random_state=42)

#Create Train-TestSplit


#Handling Imbalance styles

#Get Over resample sets
X_over_resample, y_over_resample = over.fit_resample(X_train, y_train)

#Get Under Resample sets
X_under_resample, y_under_resample = under.fit_resample(X_train, y_train)

#Use class weight adjustments in SVM, Logicistic Regression, Random Forest

#Use a balanced Random Forest

NameError: name 'X_train' is not defined

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve

#Accuracy is not a good metric on imbalanced data sets, still display it but also calculate
# Precision, Recall, and F1-Score: These metrics give a better understanding of the model’s performance on the minority class.
# ROC-AUC: Measures how well the model distinguishes between the two classes, even in imbalanced scenarios.
# Precision-Recall AUC: This is especially useful when the dataset is highly imbalanced, as it focuses on the performance on the minority class.

#THIS IS AN EXAMPLE OF METRICS, STILL NEEDS MODELS
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# ROC-AUC Score
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print(f"ROC-AUC Score: {roc_auc}")

# Precision-Recall AUC
precision, recall, _ = precision_recall_curve(y_test, model.predict_proba(X_test)[:, 1])

