In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively visit every folder beneath /kaggle/input and print the full path
# of each file it contains (folders without files produce no output).
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence FutureWarning messages issued from modules whose name matches 'pandas'.
warnings.filterwarnings(
    action='ignore',
    category=FutureWarning,
    module='pandas',
)

In [None]:
# Read the Global Health Statistics CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/global-health-statistics/Global Health Statistics.csv',
)
# Leave the first rows as the cell's last expression so the notebook renders them.
df.head(n=5)

# **Data Exploration**

In [None]:
# Get basic information.
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
print("**********************************************************")

# Check for missing values (number of nulls in each column)
print(df.isnull().sum())
print("**********************************************************")

# Summary statistics for numeric columns
print(df.describe())


In [None]:
# Missing values: number of null entries in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Count the rows that are exact repeats of an earlier row
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

In [None]:
# Across the entire DataFrame, replace infinite values (+inf and -inf) with NaN
df = df.replace(to_replace=[np.inf, -np.inf], value=np.nan)

# Exploratory Data Analysis (EDA)

In [None]:
# Headline facts about the dataset: distinct countries, span of years, distinct diseases.
print("Dataset Overview:")

n_countries = df['Country'].nunique()
print(f"Number of Unique Countries: {n_countries}")

year_column = df['Year']
print(f"Year Range: {year_column.min()} - {year_column.max()}")

n_diseases = df['Disease Name'].nunique()
print(f"Number of Unique Diseases: {n_diseases}")


In [None]:
# Columns treated as categorical for this overview
categorical_columns = [
    'Country',
    'Disease Name',
    'Disease Category',
    'Age Group',
    'Gender',
    'Treatment Type',
    'Availability of Vaccines/Treatment',
]

# For every categorical column, print a heading, its distinct values, and a divider line
divider = "-" * 50
for column_name in categorical_columns:
    print(f"Unique values in {column_name}:", df[column_name].unique(), divider, sep="\n")


In [None]:
# Tally how many records exist for every (Country, Disease Name) pair
country_disease_counts = (
    df.groupby(['Country', 'Disease Name'])
      .size()
      .reset_index(name='Count')
)

# For each country, find the row label of its highest count, then pull those rows out
top_row_labels = country_disease_counts.groupby('Country')['Count'].idxmax()
most_common_disease = country_disease_counts.loc[top_row_labels]

# Show country, disease and count for each selected row
print(most_common_disease[['Country', 'Disease Name', 'Count']])


In [None]:
# Plot the most common disease for each country.
# Size the figure explicitly, as the other plotting cells do, so the per-country
# bars and the rotated x-axis labels have room.
plt.figure(figsize=(12, 6))

# most_common_disease holds one row per country, so hue is only used to colour
# each bar by its disease. dodge=False keeps every bar full-width and centred on
# its country tick instead of reserving a slot per disease within each country.
sns.barplot(data=most_common_disease,
            x='Country',
            y='Count',
            hue='Disease Name',
            dodge=False,
            palette='Set2')

# Rotate country names on x-axis for better readability
plt.xticks(rotation=90)

# Add titles and labels
plt.title('Most Common Disease for Each Country', fontsize=16)
plt.xlabel('Country', fontsize=12)
plt.ylabel('Count', fontsize=12)
# Place the legend outside the axes (to the right) so it does not cover the bars
plt.legend(title='Disease Name', bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# Boxplot to compare 'Prevalence Rate (%)' across disease categories
plt.figure(figsize=(10, 6))
sns.boxplot(x='Disease Category', y='Prevalence Rate (%)', data=df, palette='Set3')
# Rotate the disease-category labels on the x-axis for better readability.
# This runs after the plot is drawn so the rotation is applied to the category
# tick labels seaborn has placed, not to the empty axes' default ticks.
plt.xticks(rotation=90)
plt.title('Prevalence Rate by Disease Category')
plt.xlabel('Disease Category')
plt.ylabel('Prevalence Rate (%)')
plt.show()


In [None]:
# Average 'Prevalence Rate (%)' per disease, ordered highest first, keeping ten entries
mean_prevalence = df.groupby('Disease Name')['Prevalence Rate (%)'].mean()
top_diseases = mean_prevalence.sort_values(ascending=False).head(10)

# Horizontal bar chart: one bar per disease, bar length is the average prevalence rate
plt.figure(figsize=(10, 6))
sns.barplot(
    x=top_diseases.values,
    y=top_diseases.index,
    palette='viridis',
)
plt.title('Top 10 Diseases by Average Prevalence Rate (%)')
plt.ylabel('Disease Name')
plt.xlabel('Average Prevalence Rate (%)')
plt.show()



# Final message after exploration: a simple exploratory analysis has been performed on the dataset. However, the data appears to be generated based on real-world observations!

