# **GLOBAL LIB**

In [500]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **IMPORT DATASET**

# **Lungs Diseases Dataset**
- Patient Information, Smoking Status, Treatment and more.
- Src: https://www.kaggle.com/datasets/samikshadalvi/lungs-diseases-dataset

## About the dataset: 
- This dataset captures detailed information about patients suffering from various lung conditions. It includes:
    1. 🧑‍🤝‍🧑 Age & Gender: Patient demographics to understand the spread across age groups and gender.
    2. 🚬 Smoking Status: Whether the patient is a smoker or non-smoker.
    3. 🌡️ Lung Capacity: Measured lung function to assess disease severity.
    4. 🫁 Disease Type: The specific lung condition, like COPD or Bronchitis.
    5. 💊 Treatment Type: Different treatments patients received, including therapy, medication, or surgery.
    6. 🏥 Hospital Visits: Number of visits to the hospital for managing the condition.
    7. ✅ Recovery Status: Indicates whether the patient recovered after treatment.

In [501]:
# Read the raw lung-disease CSV into `df`, then show its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='../Data/lung_disease_data.csv')
df.head(n=5)

Unnamed: 0,Age,Gender,Smoking Status,Lung Capacity,Disease Type,Treatment Type,Hospital Visits,Recovered
0,71.0,Female,No,4.49,COPD,Therapy,14.0,Yes
1,34.0,Female,Yes,,Bronchitis,Surgery,7.0,No
2,80.0,Male,Yes,1.95,COPD,,4.0,Yes
3,40.0,Female,Yes,,Bronchitis,Medication,1.0,No
4,43.0,Male,Yes,4.6,COPD,Surgery,,Yes


In [502]:
# Print the frame summary: row count, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              4900 non-null   float64
 1   Gender           4900 non-null   object 
 2   Smoking Status   4900 non-null   object 
 3   Lung Capacity    4900 non-null   float64
 4   Disease Type     4900 non-null   object 
 5   Treatment Type   4900 non-null   object 
 6   Hospital Visits  4900 non-null   float64
 7   Recovered        4900 non-null   object 
dtypes: float64(3), object(5)
memory usage: 325.1+ KB


# DTYPE
- The dataset contains the following columns:
    1. Age: Numeric.
    2. Gender: Categorical.
    3. Smoking Status: Categorical (Yes/No).
    4. Lung Capacity: Numeric.
    5. Disease Type: Categorical.
    6. Treatment Type: Categorical.
    7. Hospital Visits: Numeric.
    8. Recovered: Categorical (Yes/No).

# BASIC PROCESSING
- Steps to clean the data:
    1. Handle missing values: Fill or remove missing values in Lung Capacity, Treatment Type, and Hospital Visits.
    2. Standardize categorical values (e.g., ensure Gender, Smoking Status, and Recovered are consistent).
    3. Convert data types where necessary.
    4. Remove any duplicates if present.

In [503]:
# Basic cleaning of `df`: impute missing values, cast whole-number columns to
# nullable integers, and drop duplicate rows.

# Report missing values per column before cleaning
print(df.isnull().sum())

cat_col = df.select_dtypes(include='object').columns
num_col = df.select_dtypes(exclude='object').columns

# Columns holding whole-number values; they are cast to nullable Int64 below
int_col = ['Age', 'Hospital Visits']

# Fill missing numeric values with the column median
for col in num_col:
    fill_value = df[col].median()
    if col in int_col and pd.notna(fill_value):
        # The median of an even number of values can end in .5; filling with
        # it would make the later astype('Int64') raise on a non-integer float.
        fill_value = round(fill_value)
    df[col] = df[col].fillna(fill_value)

# Fill missing categorical values with the most frequent value (mode)
for col in cat_col:
    modes = df[col].mode()
    # mode() is empty when the column has no observed values; leave it as-is
    # instead of failing on the missing first element.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Report missing values again to confirm the imputation
print(df.isnull().sum())

# Convert whole-number columns from float64 to the nullable integer dtype
df = df.astype({col: 'Int64' for col in int_col})

# Remove duplicate rows. This runs after imputation, so rows that only become
# identical once their gaps are filled are removed as well.
df = df.drop_duplicates()

Age                300
Gender             300
Smoking Status     300
Lung Capacity      300
Disease Type       300
Treatment Type     300
Hospital Visits    300
Recovered          300
dtype: int64
Age                0
Gender             0
Smoking Status     0
Lung Capacity      0
Disease Type       0
Treatment Type     0
Hospital Visits    0
Recovered          0
dtype: int64


In [504]:
# Preview the first five rows of the current `df`.
df.head()

Unnamed: 0,Age,Gender,Smoking Status,Lung Capacity,Disease Type,Treatment Type,Hospital Visits,Recovered
0,71,Female,No,4.49,COPD,Therapy,14,Yes
1,34,Female,Yes,3.48,Bronchitis,Surgery,7,No
2,80,Male,Yes,1.95,COPD,Medication,4,Yes
3,40,Female,Yes,3.48,Bronchitis,Medication,1,No
4,43,Male,Yes,4.6,COPD,Surgery,8,Yes


In [505]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Age,Lung Capacity,Hospital Visits
count,5091.0,5091.0,5091.0
mean,54.474563,3.497107,7.557258
std,19.499451,1.41687,3.882409
min,20.0,1.0,1.0
25%,38.0,2.29,4.0
50%,54.0,3.48,8.0
75%,71.0,4.725,11.0
max,89.0,6.0,14.0


# EXPORT CLEANED DATA

In [506]:
# Disabled Vietnamese localization step. Everything below is wrapped in a
# triple-quoted string, so none of it executes; as the last expression in the
# cell, the string itself is simply echoed as the cell output.
# If the quotes were removed, the snippet would rename the columns via
# `column_map` and then replace the category values column by column via
# `value_map`.
""" # Translate to Vietnamese
column_map = {
    'Age': 'Tu·ªïi',
    'Gender': 'Gi·ªõi T√≠nh',
    'Smoking Status': 'T√¨nh Tr·∫°ng H√∫t Thu·ªëc',
    'Lung Capacity': 'Dung L∆∞·ª£ng Ph·ªïi',
    'Disease Type': 'Lo·∫°i B·ªánh',
    'Treatment Type': 'Lo·∫°i ƒêi·ªÅu Tr·ªã',
    'Hospital Visits': 'S·ªë L·∫ßn Kh√°m',
    'Recovered': 'Ph·ª•c H·ªìi'
}

# Rename columns
df.rename(columns=column_map, inplace=True)

# Translate values to Vietnamese
value_map = {
    'T√¨nh Tr·∫°ng H√∫t Thu·ªëc': {
        'Yes': 'C√≥',
        'No': 'Kh√¥ng'
    },
    'Gi·ªõi T√≠nh': {
        'Male': 'Nam',
        'Female': 'N·ªØ'
    },
    'Ph·ª•c H·ªìi': {
        'Yes': 'C√≥',
        'No': 'Kh√¥ng'
    },
    'Lo·∫°i B·ªánh': {
        'COPD': 'B·ªánh Ph·ªïi T·∫Øc Ngh·∫Ωn M√£n T√≠nh',
        'Bronchitis': 'Vi√™m Ph·∫ø Qu·∫£n',
        'Asthma': 'Hen Suy·ªÖn',
        'Pneumonia': 'Vi√™m Ph·ªïi',
        'Lung Cancer': 'Ung Th∆∞ Ph·ªïi'
    },
    'Lo·∫°i ƒêi·ªÅu Tr·ªã': {
        'Therapy': 'Li·ªáu Ph√°p',
        'Surgery': 'Ph·∫´u Thu·∫≠t',
        'Medication': 'Thu·ªëc',
    }
}

# Replace values
for column, mapping in value_map.items():
    if column in df.columns:
        df[column] = df[column].replace(mapping) """

" # Translate to Vietnamese\ncolumn_map = {\n    'Age': 'Tu·ªïi',\n    'Gender': 'Gi·ªõi T√≠nh',\n    'Smoking Status': 'T√¨nh Tr·∫°ng H√∫t Thu·ªëc',\n    'Lung Capacity': 'Dung L∆∞·ª£ng Ph·ªïi',\n    'Disease Type': 'Lo·∫°i B·ªánh',\n    'Treatment Type': 'Lo·∫°i ƒêi·ªÅu Tr·ªã',\n    'Hospital Visits': 'S·ªë L·∫ßn Kh√°m',\n    'Recovered': 'Ph·ª•c H·ªìi'\n}\n\n# Rename columns\ndf.rename(columns=column_map, inplace=True)\n\n# Translate values to Vietnamese\nvalue_map = {\n    'T√¨nh Tr·∫°ng H√∫t Thu·ªëc': {\n        'Yes': 'C√≥',\n        'No': 'Kh√¥ng'\n    },\n    'Gi·ªõi T√≠nh': {\n        'Male': 'Nam',\n        'Female': 'N·ªØ'\n    },\n    'Ph·ª•c H·ªìi': {\n        'Yes': 'C√≥',\n        'No': 'Kh√¥ng'\n    },\n    'Lo·∫°i B·ªánh': {\n        'COPD': 'B·ªánh Ph·ªïi T·∫Øc Ngh·∫Ωn M√£n T√≠nh',\n        'Bronchitis': 'Vi√™m Ph·∫ø Qu·∫£n',\n        'Asthma': 'Hen Suy·ªÖn',\n        'Pneumonia': 'Vi√™m Ph·ªïi',\n        'Lung Cancer': 'Ung Th∆∞ Ph·ªïi'\n    },\n    'Lo·∫°i ƒêi·ªÅu T

In [507]:
# Persist the cleaned frame as CSV, leaving the row index out of the file
df.to_csv(path_or_buf='lung_disease_data_cleaned.csv', index=False)