In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the data
df = pd.read_csv('../Data/lung_disease_data.csv')
display(df.head())

Unnamed: 0,Age,Gender,Smoking Status,Lung Capacity,Disease Type,Treatment Type,Hospital Visits,Recovered
0,71.0,Female,No,4.49,COPD,Therapy,14.0,Yes
1,34.0,Female,Yes,,Bronchitis,Surgery,7.0,No
2,80.0,Male,Yes,1.95,COPD,,4.0,Yes
3,40.0,Female,Yes,,Bronchitis,Medication,1.0,No
4,43.0,Male,Yes,4.6,COPD,Surgery,,Yes


In [3]:
# Size of data
print(df.shape)

(5200, 8)


In [4]:
# Information about the data
display(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              4900 non-null   float64
 1   Gender           4900 non-null   object 
 2   Smoking Status   4900 non-null   object 
 3   Lung Capacity    4900 non-null   float64
 4   Disease Type     4900 non-null   object 
 5   Treatment Type   4900 non-null   object 
 6   Hospital Visits  4900 non-null   float64
 7   Recovered        4900 non-null   object 
dtypes: float64(3), object(5)
memory usage: 325.1+ KB


None

In [5]:
# Missing values count
display(df.isnull().sum())

Age                300
Gender             300
Smoking Status     300
Lung Capacity      300
Disease Type       300
Treatment Type     300
Hospital Visits    300
Recovered          300
dtype: int64

In [6]:
# Data cleaning and preprocessing

numeric_cols = ['Age', 'Lung Capacity', 'Hospital Visits']
categorical_cols = ['Gender', 'Smoking Status', 'Disease Type', 'Treatment Type', 'Recovered']

# Fill missing data in numeric columns with median
for col in numeric_cols:
    if df[col].isnull().sum() > 0:
        median = df[col].median()
        df[col].fillna(median, inplace=True)

# Fill missing data in categorical columns with mode
for col in categorical_cols:
    if df[col].isnull().sum() > 0:
        mode = df[col].mode()[0]
        df[col].fillna(mode, inplace=True)

# Check for missing values
display(df.isnull().sum())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a

Age                0
Gender             0
Smoking Status     0
Lung Capacity      0
Disease Type       0
Treatment Type     0
Hospital Visits    0
Recovered          0
dtype: int64

In [7]:
# Translate to Vietnamese
column_map = {
    'Age': 'Tuổi',
    'Gender': 'Giới Tính',
    'Smoking Status': 'Tình Trạng Hút Thuốc',
    'Lung Capacity': 'Dung Lượng Phổi',
    'Disease Type': 'Loại Bệnh',
    'Treatment Type': 'Loại Điều Trị',
    'Hospital Visits': 'Số Lần Khám',
    'Recovered': 'Phục Hồi'
}

# Đổi tên cột
df.rename(columns=column_map, inplace=True)

# Từ điển ánh xạ giá trị
value_map = {
    'Tình Trạng Hút Thuốc': {
        'Yes': 'Có',
        'No': 'Không'
    },
    'Giới Tính': {
        'Male': 'Nam',
        'Female': 'Nữ'
    },
    'Phục Hồi': {
        'Yes': 'Có',
        'No': 'Không'
    },
    'Loại Bệnh': {
        'COPD': 'Bệnh Phổi Tắc Nghẽn Mãn Tính',
        'Bronchitis': 'Viêm Phế Quản',
        'Asthma': 'Hen Suyễn',
        'Pneumonia': 'Viêm Phổi',
        'Lung Cancer': 'Ung Thư Phổi'
    },
    'Loại Điều Trị': {
        'Therapy': 'Liệu Pháp',
        'Surgery': 'Phẫu Thuật',
        'Medication': 'Thuốc',
        # Thêm các loại điều trị khác nếu cần
    }
}

# Áp dụng các thay đổi giá trị
for column, mapping in value_map.items():
    if column in df.columns:
        df[column] = df[column].replace(mapping)

In [8]:
# Save the cleaned data
df.to_csv('lung_disease_data_cleaned.csv', index=False)