In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

In [2]:
from pandas import DataFrame
data=pd.read_csv('raw_data/chronic-respiratory-diseases-death-rate-who-mdb.csv')
print(data.head())

print(data.head())

    Entity Code  Year  \
0  Albania  ALB  1987   
1  Albania  ALB  1988   
2  Albania  ALB  1989   
3  Albania  ALB  1992   
4  Albania  ALB  1993   

   Age-standardized deaths from respiratory diseases in both sexes per 100,000 people  
0                                          26.313232                                   
1                                          26.932293                                   
2                                          25.726141                                   
3                                          21.516480                                   
4                                          23.432169                                   
    Entity Code  Year  \
0  Albania  ALB  1987   
1  Albania  ALB  1988   
2  Albania  ALB  1989   
3  Albania  ALB  1992   
4  Albania  ALB  1993   

   Age-standardized deaths from respiratory diseases in both sexes per 100,000 people  
0                                          26.313232                              

In [3]:
keepcolumn=['Entity','Year',"Age-standardized deaths from respiratory diseases in both sexes per 100,000 people"]
cleandata=data[keepcolumn]
print(cleandata)

         Entity  Year  \
0       Albania  1987   
1       Albania  1988   
2       Albania  1989   
3       Albania  1992   
4       Albania  1993   
...         ...   ...   
4767  Venezuela  2012   
4768  Venezuela  2013   
4769  Venezuela  2014   
4770  Venezuela  2015   
4771  Venezuela  2016   

      Age-standardized deaths from respiratory diseases in both sexes per 100,000 people  
0                                             26.313232                                   
1                                             26.932293                                   
2                                             25.726141                                   
3                                             21.516480                                   
4                                             23.432169                                   
...                                                 ...                                   
4767                                          23.721012       

In [4]:
df = cleandata[cleandata['Year'] >= 2005]
df = df.reset_index(drop=True)
print(df)

         Entity  Year  \
0       Albania  2005   
1       Albania  2006   
2       Albania  2007   
3       Albania  2008   
4       Albania  2009   
...         ...   ...   
1577  Venezuela  2012   
1578  Venezuela  2013   
1579  Venezuela  2014   
1580  Venezuela  2015   
1581  Venezuela  2016   

      Age-standardized deaths from respiratory diseases in both sexes per 100,000 people  
0                                             15.173623                                   
1                                             13.584775                                   
2                                             11.711932                                   
3                                             10.700809                                   
4                                              9.940529                                   
...                                                 ...                                   
1577                                          23.721012       

In [5]:
df1 = df.dropna()
print(df1)

         Entity  Year  \
0       Albania  2005   
1       Albania  2006   
2       Albania  2007   
3       Albania  2008   
4       Albania  2009   
...         ...   ...   
1577  Venezuela  2012   
1578  Venezuela  2013   
1579  Venezuela  2014   
1580  Venezuela  2015   
1581  Venezuela  2016   

      Age-standardized deaths from respiratory diseases in both sexes per 100,000 people  
0                                             15.173623                                   
1                                             13.584775                                   
2                                             11.711932                                   
3                                             10.700809                                   
4                                              9.940529                                   
...                                                 ...                                   
1577                                          23.721012       

In [6]:
df2 = df1.drop_duplicates()
print(df2)

         Entity  Year  \
0       Albania  2005   
1       Albania  2006   
2       Albania  2007   
3       Albania  2008   
4       Albania  2009   
...         ...   ...   
1577  Venezuela  2012   
1578  Venezuela  2013   
1579  Venezuela  2014   
1580  Venezuela  2015   
1581  Venezuela  2016   

      Age-standardized deaths from respiratory diseases in both sexes per 100,000 people  
0                                             15.173623                                   
1                                             13.584775                                   
2                                             11.711932                                   
3                                             10.700809                                   
4                                              9.940529                                   
...                                                 ...                                   
1577                                          23.721012       

In [7]:
from scipy import stats
numeric_cols = df2.select_dtypes(include=[np.number])  # Select only numeric columns

# Step 2: Calculate Z-scores for numeric columns
z_scores = np.abs(stats.zscore(numeric_cols))

# Step 3: Filter out rows where the z-score is greater than 3 in any numeric column
# We use `.all(axis=1)` to keep rows where all z-scores are less than 3
df_filtered = df2[(z_scores < 3).all(axis=1)]

print("\nFiltered DataFrame (Outliers removed based on numeric columns):")
print(df_filtered)


Filtered DataFrame (Outliers removed based on numeric columns):
         Entity  Year  \
0       Albania  2005   
1       Albania  2006   
2       Albania  2007   
3       Albania  2008   
4       Albania  2009   
...         ...   ...   
1577  Venezuela  2012   
1578  Venezuela  2013   
1579  Venezuela  2014   
1580  Venezuela  2015   
1581  Venezuela  2016   

      Age-standardized deaths from respiratory diseases in both sexes per 100,000 people  
0                                             15.173623                                   
1                                             13.584775                                   
2                                             11.711932                                   
3                                             10.700809                                   
4                                              9.940529                                   
...                                                 ...                                 

In [8]:
# Renaming the long column for simplicity
data = df_filtered.rename(columns={
    'Age-standardized deaths from respiratory diseases in both sexes per 100,000 people': 'Death Rate'
})

# Sort by Entity and Year to ensure correct calculation
data = data.sort_values(by=['Entity', 'Year'])

# Define a function to calculate the mean growth rate over the past 10 years
def mean_growth_rate(df):
    df = df.tail(10)  # Keep only the last 10 years for growth calculation
    return df['Death Rate'].pct_change().mean() * 100  # Calculate percentage growth

# Group by 'Entity' and apply the function to calculate mean growth rate
growth_data = data.groupby('Entity').apply(mean_growth_rate).reset_index()
growth_data.columns = ['Entity', 'Mean Growth Rate (%)']

# Merge the growth data with the original dataset, keeping only the latest year for each country
latest_year_data = data.groupby('Entity').tail(1)  # Keep only the most recent year for each country

# Add the mean growth rate to the latest year data
final_data = pd.merge(latest_year_data, growth_data, on='Entity')

# Show the final result
final_data.head()
keepcolumn=['Entity','Death Rate','Mean Growth Rate (%)']
dfnew=final_data[keepcolumn]

print(dfnew)



                  Entity  Death Rate  Mean Growth Rate (%)
0                Albania    6.813598            -14.290400
1    Antigua and Barbuda   19.863983              2.701650
2              Argentina   42.225513             -1.702524
3                Armenia    8.846441             -8.882595
4              Australia   20.381325             -0.044840
..                   ...         ...                   ...
111       United Kingdom   25.443394             -1.960492
112        United States   30.106623             -1.349667
113              Uruguay   27.665596             -2.009239
114           Uzbekistan   11.768277             -1.143326
115            Venezuela   27.216429              1.751733

[116 rows x 3 columns]


  growth_data = data.groupby('Entity').apply(mean_growth_rate).reset_index()


In [9]:
# Save the preprocessed data to a new CSV file
dfnew.to_csv('preprocessed_data/preprocessed_diseases.csv', index=False)