In [None]:
import requests
import pandas as pd
from io import StringIO
from sklearn.preprocessing import OneHotEncoder

def fetch_data(url):
    """Download a semicolon-delimited CSV file and load it into a DataFrame.

    Args:
        url: Address of the CSV resource to fetch with an HTTP GET.

    Returns:
        A pandas DataFrame parsed from the response body using ``;`` as the
        field separator.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # A timeout keeps the call from hanging forever on a stalled server.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of feeding an HTML error page to the CSV parser.
    response.raise_for_status()

    return pd.read_csv(StringIO(response.text), sep=";")

def split_data(df):
    """Separate the target column ``'y'`` from the remaining columns.

    Args:
        df: DataFrame containing a column named ``'y'``.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a new DataFrame without the ``'y'``
        column, ``y`` is the ``'y'`` column as a Series. ``df`` itself is
        left unchanged.
    """
    target = df['y']
    features = df.drop('y', axis=1)
    return features, target

def make_numeric(df, column_name):
    """Translate a yes/no column into 1/0 codes.

    Args:
        df: DataFrame holding the column.
        column_name: Name of the column whose values are ``'yes'``/``'no'``.

    Returns:
        A new Series with ``'yes'`` mapped to 1 and ``'no'`` mapped to 0.
        Any other value becomes NaN. ``df`` is not modified.
    """
    yes_no_codes = {'yes': 1, 'no': 0}
    source = df[column_name]
    return source.map(yes_no_codes)
    
    
def one_hot_encode_column(df, column_name):
    """Replace a categorical column with one-hot indicator columns.

    The column is encoded with scikit-learn's ``OneHotEncoder`` using
    ``drop='first'``, so the first category gets no indicator column of its
    own. The new columns are named by the encoder
    (``get_feature_names_out``) and appended after the remaining original
    columns.

    Args:
        df: Input DataFrame; it is not modified.
        column_name: Name of the categorical column to encode.

    Returns:
        A new DataFrame without ``column_name`` and with the indicator
        columns appended on the right.
    """
    encoder = OneHotEncoder(drop='first', sparse_output=False)

    # Double brackets keep the data two-dimensional, as the encoder requires.
    encoded_data = encoder.fit_transform(df[[column_name]])
    encoded_columns = encoder.get_feature_names_out([column_name])

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows (and add NaN rows) whenever df has been
    # filtered, shuffled, or otherwise carries a non-default index.
    encoded_df = pd.DataFrame(
        encoded_data, columns=encoded_columns, index=df.index
    )

    return pd.concat([df.drop(columns=[column_name]), encoded_df], axis=1)

def standardize_column(df, column_name):
    """Standardize a numeric column to zero mean and unit standard deviation.

    The column is overwritten in place on ``df`` with
    ``(value - mean) / std``, where ``std`` is the pandas sample standard
    deviation.

    Args:
        df: DataFrame to update; it is mutated in place.
        column_name: Name of the numeric column to standardize.

    Returns:
        The same ``df`` object, for call chaining.
    """
    values = df[column_name]
    mean = values.mean()
    std = values.std()

    # A constant column has std == 0 and a column with fewer than two rows
    # has std == NaN; dividing by either would fill the column with NaN/inf.
    # Fall back to a unit scale so such columns simply become centered.
    if not std > 0:
        std = 1.0

    df[column_name] = (values - mean) / std
    return df

def months_to_numeric(df):
    """Convert the ``'month'`` column from month names to integer codes.

    Codes run in reverse calendar order: december -> 0, november -> 1, ...,
    january -> 11. Matching uses the lower-cased first three letters of each
    value, so both abbreviations (``'jun'``) and full names (``'june'``) are
    recognised; the original full-name-only lookup turned every abbreviated
    month except ``'may'`` into NaN. Values that are not month names still
    become NaN.

    Args:
        df: DataFrame with a ``'month'`` column; it is mutated in place.

    Returns:
        The same ``df`` object with ``'month'`` replaced by the codes.
    """
    month_numeric_dictionary = {
        'dec': 0, 'jan': 11, 'feb': 10, 'mar': 9,
        'apr': 8, 'may': 7, 'jun': 6, 'jul': 5,
        'aug': 4, 'sep': 3, 'oct': 2, 'nov': 1,
    }

    # Normalise to a three-letter lowercase key before the lookup.
    month_keys = df['month'].astype(str).str.strip().str.lower().str[:3]
    df['month'] = month_keys.map(month_numeric_dictionary)
    return df

# Download the raw bank-marketing table
url = "https://raw.githubusercontent.com/tuclaure/CSU-CS-345/refs/heads/main/Data/bank/bank-full.csv"
df = fetch_data(url)

# Swap each multi-category text column for one-hot indicator columns
categorical_columns = ['job', 'marital', 'education', 'contact', 'poutcome']
for column in categorical_columns:
    df = one_hot_encode_column(df, column)

# Recode the yes/no flag columns as 1/0
yn_columns = ['default', 'housing', 'loan']
df = df.assign(**{name: make_numeric(df, name) for name in yn_columns})

# Replace month names with their numeric codes
df = months_to_numeric(df)

# Rescale the wide-ranging numeric columns to zero mean / unit std
std_columns = ['balance', 'duration']
for column in std_columns:
    df = standardize_column(df, column)

# Separate the feature matrix from the target column
X, y = split_data(df)

# Show the first processed row as a quick sanity check
df.iloc[0]


age                          58
default                       0
balance                0.256416
housing                       1
loan                          0
day                           5
month                       7.0
duration               0.011016
campaign                      1
pdays                        -1
previous                      0
y                            no
job_blue-collar             0.0
job_entrepreneur            0.0
job_housemaid               0.0
job_management              1.0
job_retired                 0.0
job_self-employed           0.0
job_services                0.0
job_student                 0.0
job_technician              0.0
job_unemployed              0.0
job_unknown                 0.0
marital_married             1.0
marital_single              0.0
education_secondary         0.0
education_tertiary          1.0
education_unknown           0.0
contact_telephone           0.0
contact_unknown             1.0
poutcome_other              0.0
poutcome