In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ipywidgets as widgets
from sklearn import linear_model
from sklearn.metrics import r2_score
from datetime import date
%matplotlib widget

# Preventable Diseases Predictor

In [2]:
# Load in Chronic Disease Indicator Data
# `cdi` is the module-level frame that every later cell filters and aggregates.
cdi = pd.read_csv('adjusted_cdi_mortality.csv')
# NOTE(review): DataFrame.drop returns a new frame and the result is not assigned,
# so `cdi` still contains 'DataSource' after this line (the call only raises if the
# column is missing). Later cells select values by position with `iloc[:, 6]`, so
# making this drop effective would shift the column positions - confirm the CSV
# layout before changing it.
cdi.drop(columns=['DataSource'])

# Parse 'YearStart' with a four-digit-year format and keep only the year number.
cdi['YearStart'] = pd.to_datetime(cdi['YearStart'], format='%Y').dt.year

In [3]:
# Get all territories in 'LocationDesc' excluding 'United States'
def get_territories():
    """Return the distinct location names in ``cdi``, sorted, without 'United States'.

    Reads the module-level ``cdi`` frame.

    Returns
    -------
    numpy.ndarray
        The unique 'LocationDesc' values other than 'United States', in
        ascending order.
    """
    locations = cdi['LocationDesc']
    # One vectorised filter + unique() replaces the row-by-row scan with a
    # list-membership test on every row.
    distinct_names = locations[locations != 'United States'].unique()
    # Sorting a plain list keeps the return type a NumPy array built from a list,
    # exactly as callers received before.
    return np.sort(list(distinct_names))

# Sorted location names (without 'United States'); used as the options of the
# state dropdown widgets further down.
terrs = get_territories()

### Average/Total US Mortalities (2010-2018)

In [4]:
# Define a function to calculate the mean and sum for US mortalities and display them
def mean_mortalities(option):
    """Draw a horizontal bar chart of US mortalities per topic.

    Reads the module-level ``cdi`` frame, keeps the rows whose 'LocationAbbr'
    is 'US', and aggregates them per 'Topic' on a new pyplot figure.

    Parameters
    ----------
    option : str
        'Average' plots the per-topic mean. Any other value (the radio button
        offers 'Total') plots the per-topic sum.

    Returns
    -------
    None
    """
    # Restrict to the national rows so the numbers match the "in the US" titles.
    # Aggregating over the whole frame would mix every other location into the bars.
    us_rows = cdi.loc[cdi['LocationAbbr'] == 'US']
    topics = np.array(us_rows['Topic'].unique())

    use_mean = option == 'Average'
    if use_mean:
        title = 'Average Mortalities in the US (2010 - 2018)'
    else:
        title = 'Total Mortalities in the US (2010 - 2018)'

    data = []
    for topic in topics:
        # The value column is addressed by position (index 6), as in the other cells.
        # NOTE(review): presumably this is the 'DataValue' column - confirm against the CSV.
        topic_values = us_rows.loc[us_rows['Topic'] == topic].iloc[:, 6]
        if use_mean:
            data.append(topic_values.mean(axis=0))
        else:
            data.append(topic_values.sum(axis=0))

    #Create a plt figure
    fig = plt.figure()
    fig.canvas.draw()
    # These canvas attributes are set for the interactive widget canvas
    # selected by `%matplotlib widget`.
    fig.canvas.toolbar_visible = False
    fig.canvas.resizable = False

    #Plot data
    ax = fig.add_subplot(1, 1, 1)
    ax.barh(topics, data)

    #Add plot aesthetic
    ax.set_xlabel('Mortalities')
    ax.set_title(title)
    # Lay out once, after the axes exist, so the long topic labels fit.
    fig.tight_layout()

#Create a radio button for Average and Total options
# `continuous_update` is not passed: it is a slider/text-box option, not a
# RadioButtons trait, so it was only an unrecognised keyword here.
radio_btn = widgets.RadioButtons(options=['Average', 'Total'], description='Calculate:')

#Create display with radio buttons
# interact() calls mean_mortalities once immediately and again on every selection change.
widgets.interact(mean_mortalities, option=radio_btn)
plt.show()

interactive(children=(RadioButtons(description='Calculate:', options=('Average', 'Total'), value='Average'), O…

### State Mortality Trends (2010-2018)

In [5]:
def get_state_data(state):
    """Plot the yearly mortality totals of one state, one line per topic.

    Reads the module-level ``cdi`` frame, keeps the rows whose 'LocationDesc'
    equals ``state``, and sums the value column per topic and year on a new
    pyplot figure.

    Parameters
    ----------
    state : str
        A value of the 'LocationDesc' column.

    Returns
    -------
    None
    """
    #Get State Data
    state_rows = cdi.loc[cdi['LocationDesc'] == state]

    # Get Range of Years and Topics
    years = np.sort(np.array(state_rows['YearStart'].unique()))
    topics = np.array(state_rows['Topic'].unique())

    #Create a plt figure
    fig = plt.figure()
    fig.canvas.draw()
    fig.canvas.toolbar_visible = False
    fig.canvas.resizable = False

    # topic -> [years, yearly totals]
    new_data = {}
    for topic in topics:
        topic_rows = state_rows.loc[state_rows['Topic'] == topic]
        yearly_totals = []
        for year in years:
            # The value column is addressed by position (index 6), as in the other cells.
            # NOTE(review): presumably this is the 'DataValue' column - confirm against the CSV.
            year_total = topic_rows.loc[topic_rows['YearStart'] == year].iloc[:, 6].sum(axis=0)
            yearly_totals.append(year_total)
        new_data[topic] = [list(years), yearly_totals]

    #Plot Data
    # Fixed line style per topic, drawn in this order. A topic with no rows for
    # the selected state is skipped instead of raising KeyError.
    line_styles = {
        'Cardiovascular Disease': 'r.-',
        'Chronic Kidney Disease': 'y.-',
        'Chronic Obstructive Pulmonary Disease': 'g.-',
        'Diabetes': 'b.-',
    }
    ax = fig.add_subplot(1, 1, 1)
    for topic, line_style in line_styles.items():
        if topic not in new_data:
            continue
        topic_years, topic_totals = new_data[topic]
        ax.plot(topic_years, topic_totals, line_style, label=topic)

    #Add plot aesthetic
    ax.set_xlabel('Year')
    ax.set_ylabel('Mortalities')
    ax.set_title('Mortality Trends by State')
    ax.legend()

#Create a dropdown widget for state options
# The options are handed over as a list of plain strings: giving the widget a
# NumPy array makes its change detection compare arrays with `==`, which is
# what produces the "silent = bool(old_value == new_value)" warning.
# `continuous_update` is not a Dropdown option, so it is not passed.
dropdwn = widgets.Dropdown(options=[str(name) for name in terrs], description='State')
#Create visuals
# interact() calls get_state_data once immediately and again on every selection change.
widgets.interact(get_state_data, state=dropdwn)
plt.show()

  silent = bool(old_value == new_value)


interactive(children=(Dropdown(description='State', options=('Alabama', 'Alaska', 'Arizona', 'Arkansas', 'Cali…

### Two State Total Mortalities Comparison (2010-2018)

In [6]:
def state_comp_totals(state1='Alabama', state2='Alabama'):
    """Draw a grouped bar chart comparing two states' total mortalities per topic.

    Reads the module-level ``cdi`` frame and draws onto a new pyplot figure.

    Parameters
    ----------
    state1 : str
        'LocationDesc' value for the left-hand bars.
    state2 : str
        'LocationDesc' value for the right-hand bars.

    Returns
    -------
    None
    """
    #Get State Data
    first_rows = cdi.loc[cdi['LocationDesc'] == state1]
    second_rows = cdi.loc[cdi['LocationDesc'] == state2]

    # Topics of either state, in first-seen order, so a topic reported only for
    # the second state is not silently left off the chart.
    topics = pd.unique(pd.concat([first_rows['Topic'], second_rows['Topic']]))

    def topic_total(rows, topic):
        # Sum of the value column (position 6, as in the other cells) for one topic,
        # truncated to a whole number. An empty selection sums to 0.
        # NOTE(review): presumably column 6 is 'DataValue' - confirm against the CSV.
        return int(rows.loc[rows['Topic'] == topic].iloc[:, 6].sum(axis=0))

    state1_sums = [topic_total(first_rows, topic) for topic in topics]
    state2_sums = [topic_total(second_rows, topic) for topic in topics]

    x_axis = np.arange(len(topics))
    width = 0.4

    #Create a plt figure
    fig, ax = plt.subplots()
    fig.canvas.draw()
    fig.canvas.toolbar_visible = False
    fig.canvas.resizable = False

    # Two bars per topic, centred on the tick: state1 to the left, state2 to the right.
    ax.bar(x_axis - width/2, state1_sums, width, label=state1)
    ax.bar(x_axis + width/2, state2_sums, width, label=state2)

    ax.set_title('Total Mortality Comparison of Two States (2010 - 2018)')
    ax.set_ylabel('Mortalities')
    ax.set_xticks(x_axis)
    ax.set_xticklabels(topics, rotation=45, ha='right')
    ax.legend()
    fig.tight_layout()


#Create two dropdown widgets for state selections
# Options are passed as a list of plain strings rather than a NumPy array, which
# avoids the array `==` comparison warning raised by the widget's change detection.
# `continuous_update` is not a Dropdown option, so it is not passed.
state_options = [str(name) for name in terrs]
dropdwn1 = widgets.Dropdown(options=state_options, description='First State')
dropdwn2 = widgets.Dropdown(options=state_options, description='Second State')


# interact() calls state_comp_totals once immediately and again whenever either
# selection changes.
widgets.interact(state_comp_totals, state1=dropdwn1, state2=dropdwn2)
plt.show()

  silent = bool(old_value == new_value)


interactive(children=(Dropdown(description='First State', options=('Alabama', 'Alaska', 'Arizona', 'Arkansas',…

## Predictor

In [7]:
#Build a model for each topic and question that can estimate the number of
#deaths in any year, up to a certain point, and allow the user to enter a year
#and see the deaths per question and the trends. Each model is a linear
#regression fitted on the non-US rows, with 'YearStart' as the only input (x)
#and 'DataValue' as the target (y). The 'US' data can then be used to compare
#the accuracy. This model will be used by the department of HHS to better help
#predict the rates of deaths in this country by preventable diseases and allow
#them to better allocate funds for outreach and care.

#Get US data
df_us = cdi.loc[cdi['LocationAbbr'] == 'US']
#Get territories data (without 'US')
df_terrs = cdi.loc[cdi['LocationAbbr'] != 'US']
#Get all data 'Topics'
topics = np.array(df_terrs['Topic'].unique())
#Get all data 'Questions'
# Taken from df_terrs (the frame the models are fitted on) rather than from all
# of cdi, so every listed question has at least one row to fit.
questions = {}
for topic in topics:
    topic_rows = df_terrs.loc[df_terrs['Topic'] == topic]
    questions[topic] = np.array(topic_rows['Question'].unique())

#List of coefficients for every linear regression model
# NOTE(review): nothing in this cell appends to this list - confirm whether it is still needed.
coefficients = []

#Generate a linear model for each topic and question
def gen_model(year, reduced_by):
    """Print a linear-regression estimate for every topic/question pair.

    For each question a LinearRegression is fitted on the module-level
    ``df_terrs`` rows ('YearStart' -> 'DataValue') and evaluated at ``year``.
    Two lines are printed per question: the estimate, and the estimate reduced
    by ``reduced_by`` percent.

    Parameters
    ----------
    year : str or int
        Target year. It must convert to an integer greater than 2018;
        otherwise a prompt is printed and nothing else happens.
    reduced_by : int or float
        Percentage by which the estimate is reduced for the second line.

    Returns
    -------
    None
    """
    # Only the conversion of the text-box value is guarded, so that a genuine
    # modelling error is not reported as a bad year.
    try:
        year = int(year)
    except ValueError:
        print('Please enter a year after 2018')
        return
    # Enforce what the prompt asks for (this also covers half-typed input such as "20").
    if year <= 2018:
        print('Please enter a year after 2018')
        return

    reduction_percent = reduced_by / 100
    # Predict from a frame with the same column name the model is fitted on.
    target_year = pd.DataFrame({'YearStart': [year]})

    for topic in topics:
        for question in questions[topic]:
            #Get data with 'Topic' and 'Question' equal
            df_tq = df_terrs.loc[(df_terrs['Topic'] == topic) & (df_terrs['Question'] == question)]
            if df_tq.empty:
                # Nothing to fit for this pair.
                continue

            lin_model = linear_model.LinearRegression()
            lin_model.fit(df_tq[['YearStart']], df_tq['DataValue'])

            #Calculate prediction values
            # predict() returns one value per input row; take the single element
            # before truncating to int.
            prediction = int(lin_model.predict(target_year)[0])
            prediction_reduction = int(prediction - (prediction * reduction_percent))

            #Print prediction values
            print('Average ' + question + ': ' + str(prediction))
            print('Average if ' + str(reduced_by) + '% of the possible population avoids/prolongs disease: ' + str(prediction_reduction))
            print()
# An empty-string default gives a text box for the year; the (1, 15) tuple gives
# an integer slider for the reduction percentage.
widgets.interact(gen_model, year='', reduced_by=(1, 15))
# A blank line, then the footnote about what the printed values represent.
print('\n* Note: these values represent the average across the 50 states and the District of Columbia')
plt.show()

interactive(children=(Text(value='', description='year'), IntSlider(value=8, description='reduced_by', max=15,…


* Note: these values represent the average across the 50 states and the District of Columbia


In [10]:
#Get US data
df_us = cdi.loc[cdi['LocationAbbr'] == 'US']
#Get territories data (without 'US')
df_terrs = cdi.loc[cdi['LocationAbbr'] != 'US']
#Get all data 'Topics'
topics = np.array(df_terrs['Topic'].unique())
#Get all data 'Questions'
# Taken from df_terrs (the frame the models are fitted on) rather than from all
# of cdi, so every listed question has at least one row to fit.
questions = {}
for topic in topics:
    topic_rows = df_terrs.loc[df_terrs['Topic'] == topic]
    questions[topic] = np.array(topic_rows['Question'].unique())

# Print the score() of each per-question linear model, evaluated on the same
# rows it was fitted on. One line is printed per topic/question pair, in loop order.
for topic in topics:
    for question in questions[topic]:
        #Get data with 'Topic' and 'Question' equal
        df_tq = df_terrs.loc[(df_terrs['Topic'] == topic) & (df_terrs['Question'] == question)]
        #Get x and y for linear model
        year_feature = df_tq[['YearStart']]
        mortality_target = df_tq['DataValue']

        lin_model = linear_model.LinearRegression()
        lin_model.fit(year_feature, mortality_target)
        print(lin_model.score(year_feature, mortality_target))

0.0012485368972926514
0.0009986180087266927
0.006619689014568997
0.00011339189509695924
0.0023702536518674933
0.002179636847547628
0.0026209932793876956
0.0030513059706238632
0.007717339576744475
0.017213170072046036
