# Classify the Prices to different class and Assign the disease to Different Prices Range 

## Import Libraries

In [63]:
import pandas as pd
import numpy as np

## 1. Load the Dataset

In [64]:
df = pd.read_csv('data/disease_insurance.csv')

In [65]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges',
       'disease', 'sedantry period', 'junk food consumption',
       'alcohol consumption', 'exercise routine', 'substance use',
       'stress level'],
      dtype='object')

In [66]:
df['charges'].nunique()

1337

## Part 1 - Classify the Prices to different classes

#### classification

In [67]:
charges = df['charges']

In [68]:
charges.min()

1121.8739

In [69]:
charges.max()

63770.42801

In [70]:
# Define the minimum and maximum values
min_value = 1120
max_value = 65000

# Define the number of bins (4 in this case)
num_bins = 3

# Generate evenly spaced bin edges
bin_edges = np.linspace(min_value, max_value, num_bins + 1)

# Convert bin edges to a list
bins = bin_edges.tolist()

In [71]:
bins

[1120.0, 22413.333333333332, 43706.666666666664, 65000.0]

In [72]:
# Define your custom bins and labels
labels = ['A', 'B', 'C']

# Create a new column 'groups' that contains the custom labels based on the bins
df['charges_bins'] = pd.cut(df['charges'], bins=bins, labels=labels, include_lowest=True)

In [73]:
df['charges_bins']

0       A
1       A
2       A
3       A
4       A
       ..
1333    A
1334    A
1335    A
1336    A
1337    B
Name: charges_bins, Length: 1338, dtype: category
Categories (3, object): ['A' < 'B' < 'C']

In [74]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,disease,sedantry period,junk food consumption,alcohol consumption,exercise routine,substance use,stress level,charges_bins
0,19.0,female,27.900,0,yes,southwest,16884.92400,,,,,,,,A
1,,male,33.770,1,no,southeast,1725.55230,,12hrs,,no,,no,,A
2,28.0,male,33.000,3,no,southeast,4449.46200,,2hrs,daily,rarely,frequently,,,A
3,33.0,male,22.705,0,no,northwest,21984.47061,,,,,,,,A
4,32.0,male,28.880,0,no,northwest,3866.85520,,2hrs,frequently,rarely,daily,yes,,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50.0,male,30.970,3,no,northwest,10600.54830,,4hrs,rarely,no,no,,low,A
1334,18.0,female,31.920,0,no,northeast,2205.98080,,6hrs,,rarely,,yes,low,A
1335,18.0,female,36.850,0,no,southeast,1629.83350,,6hrs,rarely,no,daily,yes,low,A
1336,21.0,female,25.800,0,no,southwest,2007.94500,,,daily,,rarely,,medium,A


In [75]:
df['charges_bins'].isna().sum()

0

#### Part 1 Save the CSV

In [76]:
# Specify the file path where you want to save the CSV file
file_path = 'data/disease_insurance_price_v2.csv'

# Use the 'to_csv' method to save the DataFrame to a CSV file
df.to_csv(file_path, index=False)  # Set 'index' to False to exclude the index column in the saved file

## Part 2 -- Assign the diseases based on price range

#### 2.1 Class A diseases

<ul>
<li> Hypoglycemia                = 2000, 5000 </li>
<li> Urinary tract infection     = 5001, 10000 </li>
<li> Gastroenteritis             = 10001, 15000 </li>
<li> Chickenpox                  = 15001, 20000 </li>
<li> Dengue                      = 20001, 22003 </li>
</ul>


In [77]:
# Define the starting and ending prices 1120.0, 22003.333333333332
start_price = 1120
end_price = 22000

# List of diseases
diseases_A = ['Hypoglycemia', 'Urinary tract infection', 'Gastroenteritis', 'Chickenpox', 'Dengue']

# Calculate the width of each price bin
bin_width = (end_price - start_price) / len(diseases_A)

# Create a list of price ranges
price_ranges_A = [(start_price + i * bin_width, start_price + (i + 1) * bin_width) for i in range(len(diseases_A))]

# Create a dictionary to map diseases to their price ranges
disease_to_price_range_A = {disease: price_range for disease, price_range in zip(diseases_A, price_ranges_A)}

# Print the mapping of diseases to their price ranges
for disease, price_range in disease_to_price_range_A.items():
    print(f'Disease: {disease}, Price Range: {price_range}')


Disease: Hypoglycemia, Price Range: (1120.0, 5296.0)
Disease: Urinary tract infection, Price Range: (5296.0, 9472.0)
Disease: Gastroenteritis, Price Range: (9472.0, 13648.0)
Disease: Chickenpox, Price Range: (13648.0, 17824.0)
Disease: Dengue, Price Range: (17824.0, 22000.0)


In [78]:
# Iterate through the rows and assign diseases based on charges
for index, row in df.iterrows():
    charge = row['charges']
    for disease, (start, end) in disease_to_price_range_A.items():
        if start <= charge <= end:
            df.at[index, 'disease'] = disease
            break

# Print the updated DataFrame
# print(df)

In [79]:
disease_checker = df.disease.isna() == False
df[disease_checker]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,disease,sedantry period,junk food consumption,alcohol consumption,exercise routine,substance use,stress level,charges_bins
0,19.0,female,27.900,0,yes,southwest,16884.92400,Chickenpox,,,,,,,A
1,,male,33.770,1,no,southeast,1725.55230,Hypoglycemia,12hrs,,no,,no,,A
2,28.0,male,33.000,3,no,southeast,4449.46200,Hypoglycemia,2hrs,daily,rarely,frequently,,,A
3,33.0,male,22.705,0,no,northwest,21984.47061,Dengue,,,,,,,A
4,32.0,male,28.880,0,no,northwest,3866.85520,Hypoglycemia,2hrs,frequently,rarely,daily,yes,,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1332,52.0,female,44.700,3,no,southwest,11411.68500,Gastroenteritis,,,,,,,A
1333,50.0,male,30.970,3,no,northwest,10600.54830,Gastroenteritis,4hrs,rarely,no,no,,low,A
1334,18.0,female,31.920,0,no,northeast,2205.98080,Hypoglycemia,6hrs,,rarely,,yes,low,A
1335,18.0,female,36.850,0,no,southeast,1629.83350,Hypoglycemia,6hrs,rarely,no,daily,yes,low,A


#### 2.2 Class B diseases

In [80]:
# Define the starting and ending prices
start_price = 22000
end_price = 43000

# List of diseases
diseases = [
    'Fungal infection', 'Allergy', 'Dimorphic hemorrhoids (piles)', 'Migraine',
    'GERD (Gastroesophageal Reflux Disease)', 'Hyperthyroidism', 'Hypothyroidism',
    'Tuberculosis', 'Osteoarthritis', 'Arthritis', 'Cervical spondylosis',
    'Paroxysmal Positional Vertigo (Vertigo)', 'Chronic cholestasis', 'Jaundice'
]

# Calculate the width of each price bin
bin_width = (end_price - start_price) / len(diseases)

# Create a list of price ranges
price_ranges = [(start_price + i * bin_width, start_price + (i + 1) * bin_width) for i in range(len(diseases))]

# Create a dictionary to map diseases to their price ranges
disease_to_price_range = {disease: price_range for disease, price_range in zip(diseases, price_ranges)}

# Print the mapping of diseases to their price ranges
for disease, price_range in disease_to_price_range.items():
    print(f'Disease: {disease}, Price Range: {price_range}')


Disease: Fungal infection, Price Range: (22000.0, 23500.0)
Disease: Allergy, Price Range: (23500.0, 25000.0)
Disease: Dimorphic hemorrhoids (piles), Price Range: (25000.0, 26500.0)
Disease: Migraine, Price Range: (26500.0, 28000.0)
Disease: GERD (Gastroesophageal Reflux Disease), Price Range: (28000.0, 29500.0)
Disease: Hyperthyroidism, Price Range: (29500.0, 31000.0)
Disease: Hypothyroidism, Price Range: (31000.0, 32500.0)
Disease: Tuberculosis, Price Range: (32500.0, 34000.0)
Disease: Osteoarthritis, Price Range: (34000.0, 35500.0)
Disease: Arthritis, Price Range: (35500.0, 37000.0)
Disease: Cervical spondylosis, Price Range: (37000.0, 38500.0)
Disease: Paroxysmal Positional Vertigo (Vertigo), Price Range: (38500.0, 40000.0)
Disease: Chronic cholestasis, Price Range: (40000.0, 41500.0)
Disease: Jaundice, Price Range: (41500.0, 43000.0)


In [81]:
price_range_B = disease_to_price_range.values()
disease_B = disease_to_price_range.keys()

In [82]:
print(disease_B)

dict_keys(['Fungal infection', 'Allergy', 'Dimorphic hemorrhoids (piles)', 'Migraine', 'GERD (Gastroesophageal Reflux Disease)', 'Hyperthyroidism', 'Hypothyroidism', 'Tuberculosis', 'Osteoarthritis', 'Arthritis', 'Cervical spondylosis', 'Paroxysmal Positional Vertigo (Vertigo)', 'Chronic cholestasis', 'Jaundice'])


In [83]:
# Iterate through the rows and assign diseases based on charges
for index, row in df.iterrows():
    charge = row['charges']
    for disease, (start, end) in disease_to_price_range.items():
        if start <= charge <= end:
            df.at[index, 'disease'] = disease
            break

# Print the updated DataFrame
# print(df)

In [84]:
disease_checker = df.disease.isna() == False
df[disease_checker]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,disease,sedantry period,junk food consumption,alcohol consumption,exercise routine,substance use,stress level,charges_bins
0,19.0,female,27.900,0,yes,southwest,16884.92400,Chickenpox,,,,,,,A
1,,male,33.770,1,no,southeast,1725.55230,Hypoglycemia,12hrs,,no,,no,,A
2,28.0,male,33.000,3,no,southeast,4449.46200,Hypoglycemia,2hrs,daily,rarely,frequently,,,A
3,33.0,male,22.705,0,no,northwest,21984.47061,Dengue,,,,,,,A
4,32.0,male,28.880,0,no,northwest,3866.85520,Hypoglycemia,2hrs,frequently,rarely,daily,yes,,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50.0,male,30.970,3,no,northwest,10600.54830,Gastroenteritis,4hrs,rarely,no,no,,low,A
1334,18.0,female,31.920,0,no,northeast,2205.98080,Hypoglycemia,6hrs,,rarely,,yes,low,A
1335,18.0,female,36.850,0,no,southeast,1629.83350,Hypoglycemia,6hrs,rarely,no,daily,yes,low,A
1336,21.0,female,25.800,0,no,southwest,2007.94500,Hypoglycemia,,daily,,rarely,,medium,A


#### 2.3 Class C Diseases

In [85]:
# Define the starting and ending prices
start_price = 43001
end_price = 65000

# List of diseases
diseases_C = [
'AIDS', 'Alcoholic hepatitis', 'Bronchial Asthma',
'Diabetes', 'Drug Reaction', 'Heart attack',
'Hepatitis A', 'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
'Hypertension (High Blood Pressure)','Malaria',
'Paralysis (brain hemorrhage)', 'Pneumonia'
'Reptic ulcer disease', 'Typhoid', 'Varicose veins'
]

# Calculate the width of each price bin
bin_width = (end_price - start_price) / len(diseases_C)

# Create a list of price ranges
price_ranges_C = [(start_price + i * bin_width, start_price + (i + 1) * bin_width) for i in range(len(diseases_C))]

# Create a dictionary to map diseases to their price ranges
disease_to_price_range_C = {disease: price_range for disease, price_range in zip(diseases_C, price_ranges_C)}

# Print the mapping of diseases to their price ranges
for disease, price_range in disease_to_price_range_C.items():
    print(f'Disease: {disease}, Price Range: {price_range}')


Disease: AIDS, Price Range: (43001.0, 44295.05882352941)
Disease: Alcoholic hepatitis, Price Range: (44295.05882352941, 45589.117647058825)
Disease: Bronchial Asthma, Price Range: (45589.117647058825, 46883.17647058824)
Disease: Diabetes, Price Range: (46883.17647058824, 48177.23529411765)
Disease: Drug Reaction, Price Range: (48177.23529411765, 49471.294117647056)
Disease: Heart attack, Price Range: (49471.294117647056, 50765.35294117647)
Disease: Hepatitis A, Price Range: (50765.35294117647, 52059.41176470588)
Disease: Hepatitis B, Price Range: (52059.41176470588, 53353.470588235294)
Disease: Hepatitis C, Price Range: (53353.470588235294, 54647.529411764706)
Disease: Hepatitis D, Price Range: (54647.529411764706, 55941.58823529412)
Disease: Hepatitis E, Price Range: (55941.58823529412, 57235.647058823524)
Disease: Hypertension (High Blood Pressure), Price Range: (57235.647058823524, 58529.70588235294)
Disease: Malaria, Price Range: (58529.70588235294, 59823.76470588235)
Disease: Para

In [86]:
# Iterate through the rows and assign diseases based on charges
for index, row in df.iterrows():
    charge = row['charges']
    for disease, (start, end) in disease_to_price_range_C.items():
        if start <= charge <= end:
            df.at[index, 'disease'] = disease
            break

# Print the updated DataFrame
# print(df)

In [87]:
disease_checker = df.disease.isna() == False
df[disease_checker]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,disease,sedantry period,junk food consumption,alcohol consumption,exercise routine,substance use,stress level,charges_bins
0,19.0,female,27.900,0,yes,southwest,16884.92400,Chickenpox,,,,,,,A
1,,male,33.770,1,no,southeast,1725.55230,Hypoglycemia,12hrs,,no,,no,,A
2,28.0,male,33.000,3,no,southeast,4449.46200,Hypoglycemia,2hrs,daily,rarely,frequently,,,A
3,33.0,male,22.705,0,no,northwest,21984.47061,Dengue,,,,,,,A
4,32.0,male,28.880,0,no,northwest,3866.85520,Hypoglycemia,2hrs,frequently,rarely,daily,yes,,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50.0,male,30.970,3,no,northwest,10600.54830,Gastroenteritis,4hrs,rarely,no,no,,low,A
1334,18.0,female,31.920,0,no,northeast,2205.98080,Hypoglycemia,6hrs,,rarely,,yes,low,A
1335,18.0,female,36.850,0,no,southeast,1629.83350,Hypoglycemia,6hrs,rarely,no,daily,yes,low,A
1336,21.0,female,25.800,0,no,southwest,2007.94500,Hypoglycemia,,daily,,rarely,,medium,A


#### Checking the null values for diseases

In [88]:
df['disease'].isna().sum()

0

In [89]:
lackinfo_disease = df['disease'].isna()==True
df[lackinfo_disease]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,disease,sedantry period,junk food consumption,alcohol consumption,exercise routine,substance use,stress level,charges_bins


In [90]:
df[lackinfo_disease]['charges'].max()

nan

#### Part 2 Save the CSV

In [91]:
# Specify the file path where you want to save the CSV file
file_path = 'data/disease_insurance_price_final.csv'

# Use the 'to_csv' method to save the DataFrame to a CSV file
df.to_csv(file_path, index=False)  # Set 'index' to False to exclude the index column in the saved file

In [92]:
df.isna().sum()

age                        1
sex                        0
bmi                        0
children                   0
smoker                     0
region                     0
charges                    0
disease                    0
sedantry period          571
junk food consumption    571
alcohol consumption      571
exercise routine         571
substance use            571
stress level             571
charges_bins               0
dtype: int64

In [93]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,disease,sedantry period,junk food consumption,alcohol consumption,exercise routine,substance use,stress level,charges_bins
0,19.0,female,27.900,0,yes,southwest,16884.92400,Chickenpox,,,,,,,A
1,,male,33.770,1,no,southeast,1725.55230,Hypoglycemia,12hrs,,no,,no,,A
2,28.0,male,33.000,3,no,southeast,4449.46200,Hypoglycemia,2hrs,daily,rarely,frequently,,,A
3,33.0,male,22.705,0,no,northwest,21984.47061,Dengue,,,,,,,A
4,32.0,male,28.880,0,no,northwest,3866.85520,Hypoglycemia,2hrs,frequently,rarely,daily,yes,,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50.0,male,30.970,3,no,northwest,10600.54830,Gastroenteritis,4hrs,rarely,no,no,,low,A
1334,18.0,female,31.920,0,no,northeast,2205.98080,Hypoglycemia,6hrs,,rarely,,yes,low,A
1335,18.0,female,36.850,0,no,southeast,1629.83350,Hypoglycemia,6hrs,rarely,no,daily,yes,low,A
1336,21.0,female,25.800,0,no,southwest,2007.94500,Hypoglycemia,,daily,,rarely,,medium,A
