In [None]:
import requests
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.preprocessing import OneHotEncoder

def fetch_data(url, *, timeout=30):
    """Download a semicolon-separated CSV file and load it into a DataFrame.

    Args:
        url: Address of the CSV file to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block forever on a stalled server.

    Returns:
        A ``pandas.DataFrame`` parsed from the response body with ``sep=";"``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as CSV.
    response.raise_for_status()
    return pd.read_csv(StringIO(response.text), sep=";")

def split_data(df):
    """Separate the target column ``'y'`` from the remaining feature columns.

    Args:
        df: DataFrame that contains a column named ``'y'``.

    Returns:
        A ``(X, y)`` tuple where ``X`` is ``df`` without the ``'y'`` column
        and ``y`` is the ``'y'`` column as a Series. ``df`` itself is not
        modified.
    """
    target = df['y']
    features = df.drop('y', axis=1)
    return features, target

def make_YN_numeric(df, column_name):
    """Return ``df[column_name]`` with ``'yes'`` mapped to 1 and ``'no'`` to 0.

    Values other than ``'yes'``/``'no'`` have no entry in the mapping and
    therefore come back as NaN. ``df`` is not modified; the caller assigns
    the returned Series where it wants it.
    """
    yes_no_codes = {'yes': 1, 'no': 0}
    answers = df[column_name]
    return answers.map(yes_no_codes)
    
    
def one_hot_encode_column(df, column_name):
    """Replace a categorical column with one-hot indicator columns.

    The encoder is fitted on ``df[column_name]`` with ``drop='first'``, so the
    first category is dropped and each remaining category becomes one dense
    float column named by ``get_feature_names_out`` (``<column>_<category>``).

    Args:
        df: Input DataFrame; it is not modified.
        column_name: Name of the categorical column to encode.

    Returns:
        A new DataFrame holding every column of ``df`` except ``column_name``,
        followed by the indicator columns.
    """
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    encoded_data = encoder.fit_transform(df[[column_name]])
    encoded_columns = encoder.get_feature_names_out([column_name])

    # Build the indicator frame on df's own index. With the default
    # RangeIndex, concat(axis=1) aligns on labels and would produce
    # misaligned / NaN-padded rows whenever df is not indexed 0..n-1
    # (e.g. after filtering, sampling or a train/test split).
    encoded_df = pd.DataFrame(
        encoded_data, columns=encoded_columns, index=df.index
    )

    remaining = df.drop(columns=[column_name])
    return pd.concat([remaining, encoded_df], axis=1)

def standardize_column(df, column_name):
    """Standardize one column to zero mean and unit sample standard deviation.

    The column is overwritten in place on ``df`` and the same ``df`` object
    is returned, so both ``standardize_column(df, c)`` and
    ``df = standardize_column(df, c)`` work.

    Args:
        df: DataFrame to update in place.
        column_name: Name of the numeric column to standardize.

    Returns:
        ``df`` itself, with ``column_name`` replaced by its z-scores.
    """
    values = df[column_name]
    mean = values.mean()
    std = values.std()
    # A constant column has zero spread; dividing by it would turn every
    # value into NaN (0 / 0). Use a scale of 1 so the column is only centred.
    scale = std if std != 0 else 1.0
    df[column_name] = (values - mean) / scale
    return df

def months_numeric_cyclical(df):
    """Replace the ``'month'`` column with a sine/cosine cyclical encoding.

    Month abbreviations are first mapped to the numbers 1..12 (in reverse
    calendar order: ``'dec'`` -> 1 ... ``'jan'`` -> 12) and then projected
    onto the unit circle, so neighbouring months stay close to each other.
    Labels missing from the mapping produce NaN in both new columns.

    Args:
        df: DataFrame with a ``'month'`` column of lowercase three-letter
            month abbreviations. It is not modified.

    Returns:
        A new DataFrame without ``'month'`` and with ``'month_sin'`` and
        ``'month_cos'`` appended.
    """
    month_numeric_dictionary = {
        'dec': 1, 'jan': 12, 'feb': 11, 'mar': 10,
        'apr': 9, 'may': 8, 'jun': 7, 'jul': 6,
        'aug': 5, 'sep': 4, 'oct': 3, 'nov': 2,
    }

    # Work on a local Series instead of overwriting df['month']: mutating the
    # caller's frame made a second call on the same input map numbers instead
    # of labels and return all-NaN columns.
    month_number = df['month'].map(month_numeric_dictionary)
    angle = 2 * np.pi * month_number / 12

    return df.drop(columns=['month']).assign(
        month_sin=np.sin(angle),
        month_cos=np.cos(angle),
    )

# Download the semicolon-separated bank-full.csv dataset into a DataFrame.
url = "https://raw.githubusercontent.com/tuclaure/CSU-CS-345/refs/heads/main/Data/bank/bank-full.csv"
df = fetch_data(url)

#Maybe use Target Encoding instead of OHE to reduce computational time
def target_encode_column(df, column_name, target_column):
    """Replace a categorical column with the mean target value of its category.

    Args:
        df: Input DataFrame; it is not modified.
        column_name: Name of the categorical column to encode.
        target_column: Name of a numeric column whose per-category mean
            becomes the encoded value.

    Returns:
        A new DataFrame without ``column_name`` and with
        ``<column_name>_target_encoded`` appended.

    Note:
        The means are computed from every row of ``df``. If rows that are
        later used for evaluation are included, the encoding leaks target
        information; fit on the training split only in that case.
    """
    # Mean of the target within each category of the encoded column.
    means = df.groupby(column_name)[target_column].mean()
    encoded = df[column_name].map(means)

    # Build the result on a dropped copy so the caller's frame does not
    # silently gain the encoded column as a side effect.
    return df.drop(columns=[column_name]).assign(
        **{column_name + "_target_encoded": encoded}
    )

# One-hot encode the multi-category columns; each call replaces the original
# column with its indicator columns (first category dropped).
categorical_columns = ['job', 'marital', 'education', 'contact', 'poutcome']
for column in categorical_columns:
    df = one_hot_encode_column(df, column)
    
# Convert the yes/no columns, including the target 'y', to 1/0.
yn_columns = ['default','housing','loan','y']
for column in yn_columns:
    df[column] = make_YN_numeric(df, column)
  
# Replace 'month' with its sine/cosine cyclical representation.
df = months_numeric_cyclical(df)

# Standardize the selected numeric columns to zero mean and unit std.
# NOTE(review): the statistics come from the full dataset, before any
# train/test split -- confirm this is intended.
std_columns = ['age', 'balance', 'duration']
for column in std_columns:
    df = standardize_column(df, column)
    
# Split into the feature matrix X and the target vector y.
X, y = split_data(df)

# Show the first preprocessed row (notebook cell output).
df.iloc[0]
# print(X)
# print(y)


age                    1.606947
default                0.000000
balance                0.256416
housing                1.000000
loan                   0.000000
day                    5.000000
duration               0.011016
campaign               1.000000
pdays                 -1.000000
previous               0.000000
y                      0.000000
job_blue-collar        0.000000
job_entrepreneur       0.000000
job_housemaid          0.000000
job_management         1.000000
job_retired            0.000000
job_self-employed      0.000000
job_services           0.000000
job_student            0.000000
job_technician         0.000000
job_unemployed         0.000000
job_unknown            0.000000
marital_married        1.000000
marital_single         0.000000
education_secondary    0.000000
education_tertiary     1.000000
education_unknown      0.000000
contact_telephone      0.000000
contact_unknown        1.000000
poutcome_other         0.000000
poutcome_success       0.000000
poutcome