In [138]:


# Import the necessary libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import lightgbm as lgb


# Read the ailment cost scale and the per-breed feature table from disk
df2 = pd.read_csv('ailments_scale.csv')
df_breed = pd.read_csv('split_data.csv')

# ------------ AILMENTS FINANCIAL COST PRE-PROCESSING

# Midpoint between the minimum and maximum cost quoted for each ailment
df2['real_cost_avg'] = df2['real_cost_min'].add(df2['real_cost_max']).div(2)

# Force ailment names to strings, then rename two of them:
# 'cleft palate' -> 'cleft_palate' and 'bladder' -> 'urinary'
df2['genetic_ailments'] = (
    df2['genetic_ailments']
    .astype(str)
    .replace({'cleft palate': 'cleft_palate', 'bladder': 'urinary'})
)

# Lookup table: ailment name -> {'real_cost_min': ..., 'real_cost_avg': ..., 'real_cost_max': ...}
ailment_to_cost_severity = (
    df2.set_index('genetic_ailments')
    .loc[:, ['real_cost_min', 'real_cost_avg', 'real_cost_max']]
    .to_dict(orient='index')
)

# Distinct ailment names, in order of first appearance in df2
list_of_ailments = pd.unique(df2['genetic_ailments']).tolist()

# Sum the size-dependent cost of every ailment flagged on a row, using the 'ailment_to_cost_severity' lookup
def calculate_final_yearly_costs(row, list_of_ailments, cost_lookup=None):
    """Return the summed ailment cost for one row, tiered by the row's size.

    An ailment contributes to the total when its name is a key of ``row``
    and the value stored there is greater than 0. The cost tier is chosen
    from ``row['size']``: 1 uses ``'real_cost_min'``, 2 uses
    ``'real_cost_avg'``, and any other value uses ``'real_cost_max'``.

    Args:
        row: Mapping-like record (a dict, or a pandas Series such as the one
            ``DataFrame.apply(..., axis=1)`` passes in) holding one flag per
            ailment plus a ``'size'`` entry.
        list_of_ailments: Iterable of ailment names to consider.
        cost_lookup: Optional mapping of ailment name to a dict with the keys
            ``'real_cost_min'``, ``'real_cost_avg'`` and ``'real_cost_max'``.
            When omitted, the module-level ``ailment_to_cost_severity`` is
            used, which keeps existing two-argument calls working.

    Returns:
        The sum of the selected cost over all flagged ailments, or 0 when no
        ailment is flagged.

    Raises:
        KeyError: If an ailment is flagged but ``row`` has no ``'size'``
            entry, or a flagged ailment is missing from the cost lookup.
    """
    if cost_lookup is None:
        cost_lookup = ailment_to_cost_severity

    flagged = [
        ailment
        for ailment in list_of_ailments
        if ailment in row and row[ailment] > 0
    ]
    # 'size' is only read when something is flagged, so a row without any
    # flagged ailment still yields 0 even if it has no 'size' entry.
    if not flagged:
        return 0

    # The size tier is the same for every ailment in the row: resolve it once.
    size_key = row['size']
    if size_key == 1:
        cost_field = 'real_cost_min'
    elif size_key == 2:
        cost_field = 'real_cost_avg'
    else:
        cost_field = 'real_cost_max'

    return sum(cost_lookup[ailment][cost_field] for ailment in flagged)

# Compute the ailment-driven cost of every row in df_breed; list_of_ailments
# is forwarded to the function as its second positional argument
df_breed['health_cost'] = df_breed.apply(
    calculate_final_yearly_costs, axis=1, args=(list_of_ailments,)
)
# Fold the health cost into the existing yearly total
df_breed['yearly_final_cost'] += df_breed['health_cost']

# ----------------- PRE-PROCESSING BREED -----------------------------------

# Remove columns that are not exported ('health_cost' has already been added
# into 'yearly_final_cost' above)
cols_to_drop = ['severity_score', 'health_cost']
df_breed.drop(columns=cols_to_drop, inplace=True)

# One-hot encode 'breed', turn +/-inf into NaN, fill NaN with 0 and cast
# every column to int.
# NOTE(review): the int cast applies to all columns, not only the one-hot
# flags, so fractional values would be truncated - confirm this is intended.
df_encoded = (
    pd.get_dummies(df_breed, columns=['breed'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    .astype(int)
)

# ---- EXPORT DATA FOR MODELLING ----------------------------
# df_breed keeps the original 'breed' column; df_encoded is the all-integer one-hot version
df_breed.to_csv('clean_split_data.csv', index=False)
df_encoded.to_csv('combined_data.csv', index=False)


Index(['breed', 'gender', 'age', 'longevity', 'size', 'category_companion',
       'category_herding', 'category_hound', 'category_sporting',
       'category_terrier', 'category_working', 'grooming_required',
       'intelligence_category', 'sensitivity_level', 'tolerates_being_alone',
       'tolerates_cold_weather', 'tolerates_hot_weather', 'kid_friendly',
       'dog_friendly', 'stranger_friendly', 'potential_for_mouthiness',
       'prey_drive', 'tendency_to_bark_or_howl', 'wanderlust_potential',
       'exercise_needs', 'energy_level', 'allergies', 'bleeding', 'bloat',
       'breathing', 'cleft_palate', 'dental', 'elbows', 'eyes', 'heart',
       'hips', 'kidney', 'liver', 'metabolic', 'neurological', 'none',
       'osteopathy', 'patella', 'respiratory', 'skin', 'spine', 'thyroid',
       'urinary', 'yearly_final_cost'],
      dtype='object')
