In [1]:
import pandas as pd

doctors = pd.read_csv("../data/doctors.csv")
patients = pd.read_csv("../data/patients.csv")


## Handle Missing Values

In [2]:
doctors.isnull().sum()



doctor_id            0
doctor_name          0
specialty            0
years_experience     0
location             0
rating               0
consultation_type    0
success_rate         0
languages_spoken     0
dtype: int64

In [3]:
patients.isnull().sum()

patient_id             0
name                   0
age                    0
gender                 0
location               0
chronic_conditions    20
dtype: int64

## Duplicates Rows

In [4]:
doctors.drop_duplicates(inplace=True)
patients.drop_duplicates(inplace=True)


In [7]:
doctors.count()

doctor_id            50
doctor_name          50
specialty            50
years_experience     50
location             50
rating               50
consultation_type    50
success_rate         50
languages_spoken     50
dtype: int64

In [8]:
patients.count()

patient_id            100
name                  100
age                   100
gender                100
location              100
chronic_conditions     80
dtype: int64

## Replace missing values with 'None'

In [10]:
patients['chronic_conditions'].fillna('None', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  patients['chronic_conditions'].fillna('None', inplace=True)


In [11]:
patients['chronic_conditions'].isnull().sum()


np.int64(0)

## Data type Fixing

In [12]:
doctors['years_experience'] = doctors['years_experience'].astype(int)
patients['age'] = patients['age'].astype(int)


In [14]:
patients.count()

patient_id            100
name                  100
age                   100
gender                100
location              100
chronic_conditions    100
dtype: int64

In [15]:
doctors.count()

doctor_id            50
doctor_name          50
specialty            50
years_experience     50
location             50
rating               50
consultation_type    50
success_rate         50
languages_spoken     50
dtype: int64

## Save Cleaned Data

In [16]:
doctors.to_csv("../data/doctors_cleaned.csv", index=False)
patients.to_csv("../data/patients_cleaned.csv", index=False)
