# In this .ipynb notebook, we automatically traverse the data tables that rank causes of death by gender and age group for the United States as a whole and for each of its 50 states plus D.C.

##  1. Import required packages

In [1]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
from collections import defaultdict
from pandas import ExcelWriter

## 2. State Data and State Index

In [2]:
def state_list(timeout=30):
    '''
    Scrape the region names and their indices from worldlifeexpectancy.com.

    The regions are the anchor tags inside the first
    div.scrolling-content-wrapper element of the page, taken in document
    order. The first anchor is mapped to the empty string; every following
    anchor is mapped to its position as a string ('1', '2', ...).
    The resulting mapping is also printed.

    timeout: seconds to wait for the server before giving up
    rtype: dict{state name: state index}

    raises requests.HTTPError: the server answered with an error status
    raises IndexError: the page contains no region list container
    '''
    url = "https://www.worldlifeexpectancy.com/usa-cause-of-death-by-age-and-gender"
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail here instead of parsing an error page as if it were the region list.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    wrappers = soup.find_all("div", class_="scrolling-content-wrapper")
    if not wrappers:
        raise IndexError(
            'no div.scrolling-content-wrapper found at %s; '
            'the page layout may have changed' % url
        )

    # Position 0 maps to '' and all later positions to their number.
    states = {
        link.text: str(position) if position else ''
        for position, link in enumerate(wrappers[0].find_all('a'))
    }
    print(states)

    return states

In [3]:
# Run the scraper once to inspect the {state name: state index} mapping (it is printed by the function and echoed as the cell result).
state_list()

{'United States': '', 'Alabama': '1', 'Alaska': '2', 'Arizona': '3', 'Arkansas': '4', 'California': '5', 'Colorado': '6', 'Connecticut': '7', 'Delaware': '8', 'D C': '9', 'Florida': '10', 'Georgia': '11', 'Hawaii': '12', 'Idaho': '13', 'Illinois': '14', 'Indiana': '15', 'Iowa': '16', 'Kansas': '17', 'Kentucky': '18', 'Louisiana': '19', 'Maine': '20', 'Maryland': '21', 'Massachusetts': '22', 'Michigan': '23', 'Minnesota': '24', 'Mississippi': '25', 'Missouri': '26', 'Montana': '27', 'Nebraska': '28', 'Nevada': '29', 'New Hampshire': '30', 'New Jersey': '31', 'New Mexico': '32', 'New York': '33', 'North Carolina': '34', 'North Dakota': '35', 'Ohio': '36', 'Oklahoma': '37', 'Oregon': '38', 'Pennsylvania': '39', 'Rhode Island': '40', 'South Carolina': '41', 'South Dakota': '42', 'Tennessee': '43', 'Texas': '44', 'Utah': '45', 'Vermont': '46', 'Virginia': '47', 'Washington': '48', 'West Virginia': '49', 'Wisconsin': '50', 'Wyoming': '51'}


{'United States': '',
 'Alabama': '1',
 'Alaska': '2',
 'Arizona': '3',
 'Arkansas': '4',
 'California': '5',
 'Colorado': '6',
 'Connecticut': '7',
 'Delaware': '8',
 'D C': '9',
 'Florida': '10',
 'Georgia': '11',
 'Hawaii': '12',
 'Idaho': '13',
 'Illinois': '14',
 'Indiana': '15',
 'Iowa': '16',
 'Kansas': '17',
 'Kentucky': '18',
 'Louisiana': '19',
 'Maine': '20',
 'Maryland': '21',
 'Massachusetts': '22',
 'Michigan': '23',
 'Minnesota': '24',
 'Mississippi': '25',
 'Missouri': '26',
 'Montana': '27',
 'Nebraska': '28',
 'Nevada': '29',
 'New Hampshire': '30',
 'New Jersey': '31',
 'New Mexico': '32',
 'New York': '33',
 'North Carolina': '34',
 'North Dakota': '35',
 'Ohio': '36',
 'Oklahoma': '37',
 'Oregon': '38',
 'Pennsylvania': '39',
 'Rhode Island': '40',
 'South Carolina': '41',
 'South Dakota': '42',
 'Tennessee': '43',
 'Texas': '44',
 'Utah': '45',
 'Vermont': '46',
 'Virginia': '47',
 'Washington': '48',
 'West Virginia': '49',
 'Wisconsin': '50',
 'Wyoming': '51'}

## 3. Collect Data according to state and gender

In [9]:
# Download the cause-of-death ranking table of every region for each gender
# selection and save it as one Excel workbook per region
# (data/States/<region name>.xlsx) holding one sheet per gender selection.
import os

gender = ['female', 'male', 'both']
age_groups = ['0_14', '15_24', '25_34', '35_44', '45_54', '55_64', '65_74', '75']

# Output column name -> field name in the JSON records returned by the site.
column_sources = {'name': 'name', 'rank': 'r_d', 'death': 'd'}
for age in age_groups:
    column_sources[f'rank_{age}'] = f'r_d_{age}'
    column_sources[f'death_{age}'] = f'd_{age}'

# The workbook writer does not create missing directories.
os.makedirs('data/States', exist_ok=True)

# state_list is a function, so it has to be called to obtain the
# {region name: state index} mapping.
for region, state in state_list().items():
    # The context manager saves and closes the workbook on exit, also when an
    # error interrupts the loop (ExcelWriter.save() was removed in pandas 2.0).
    with pd.ExcelWriter(f'data/States/{region}.xlsx') as writer:

        for g in gender:
            # change url to the table we want to collect data now
            # NOTE(review): 'sel=d_35_44' is fixed, yet every age group is read
            # from the response below; presumably 'sel' only selects the
            # displayed/sorted column -- confirm against the site.
            url = 'https://www.worldlifeexpectancy.com/j/state-gbd-cause-age?sel=d_35_44&sex={}&state={}'.format(g,state)
            # Bound the wait and stop on HTTP errors instead of trying to
            # decode an error page as JSON.
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            json_parsed = json.loads(response.content)
            records = json_parsed['chart']['countries']['countryitem']

            # Build every column from the same record sequence so rows stay
            # aligned; a record lacking a field raises a KeyError naming it.
            data_contain = {
                column: [record[source] for record in records]
                for column, source in column_sources.items()
            }

            # change format to panda Dataframe
            frame = pd.DataFrame(data_contain)

            # write into file; the workbook engine is owned by the ExcelWriter,
            # so no engine= argument is passed here
            frame.to_excel(writer, sheet_name=g, index=False)
            print('finish --> %s   %s' % (g,region))

finish --> female   United States
finish --> male   United States
finish --> both   United States
finish --> female   Alabama
finish --> male   Alabama
finish --> both   Alabama
finish --> female   Alaska
finish --> male   Alaska
finish --> both   Alaska
finish --> female   Arizona
finish --> male   Arizona
finish --> both   Arizona
finish --> female   Arkansas
finish --> male   Arkansas
finish --> both   Arkansas
finish --> female   California
finish --> male   California
finish --> both   California
finish --> female   Colorado
finish --> male   Colorado
finish --> both   Colorado
finish --> female   Connecticut
finish --> male   Connecticut
finish --> both   Connecticut
finish --> female   Delaware
finish --> male   Delaware
finish --> both   Delaware
finish --> female   D C
finish --> male   D C
finish --> both   D C
finish --> female   Florida
finish --> male   Florida
finish --> both   Florida
finish --> female   Georgia
finish --> male   Georgia
finish --> both   Georgia
finish 