# Group 3 - Project 1
# Impact of Infectious Diseases on Flights

In [8]:
# Dependencies
import csv
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import requests
from scipy import stats
import numpy as np
from scipy.stats import linregress
import matplotlib
import time
import calendar
import datetime
import gmaps

from config import g_key

## Analyze Total US Airline Data Domestic and International

In [9]:
# https://www.transtats.bts.gov/Data_Elements.aspx?Data=2
# US flight data for domestic and international
# *Most recent three months of international data by airport and by carrier withheld because of confidentiality agreements for individual routes. 
# Summary totals are shown for all airports and all carriers. 

flight_data_df = pd.read_csv('RawData/Flights_3_12_2020.csv')

In [10]:
flight_data_df

Unnamed: 0,Year,Month,DOMESTIC,INTERNATIONAL,TOTAL
0,2002,10,815489,92565,908054
1,2002,11,766775,91342,858117
2,2002,12,782175,96881,879056
3,2002,TOTAL,8085083,1023994,9109077
4,2003,1,785651,98053,883704
...,...,...,...,...,...
224,2019,TOTAL,8591641,1620275,10211916
225,,,,,
226,Note: All numbers are for scheduled services.,,,,
227,,,,,


In [11]:
# Drop the last few rows which have null values
flight_data_df = flight_data_df.dropna(how='any')

# Drop the rows which have totals for each year
flight_data_df = flight_data_df[flight_data_df['Month'] != 'TOTAL']

In [12]:
# Make all columns as float
columns = flight_data_df.columns
for column in columns:
    flight_data_df[column] = flight_data_df[column].str.replace(",","").astype(float)

In [13]:
# Make Year and Month into integers
flight_data_df['Year'] = flight_data_df['Year'].astype(int)
flight_data_df['Month'] = flight_data_df['Month'].astype(int)

In [14]:
# Make a date column that uses the month and year and assumes the 1st of every month
Date = []
for y, m in zip(flight_data_df.Year, flight_data_df.Month):
    Date.append(datetime(y, m, 1))

flight_data_df['Date'] = Date

TypeError: 'module' object is not callable

In [None]:
flight_data_df

In [None]:
ax = plt.gca()
flight_data_df.plot.line(x = 'Date', y='DOMESTIC',ax=ax)
flight_data_df.plot.line(x = 'Date', y='INTERNATIONAL',ax=ax)
flight_data_df.plot.line(x = 'Date', y='TOTAL',ax=ax)
ax.grid()
plt.show()

## Ebola Data Analysis (Vikash Bhakta)

In [None]:
#https://data.humdata.org/dataset/ebola-cases-2014
ebola_df = pd.read_csv('RawData/ebola_data_db_format.csv')
ebola_df.head()

In [None]:
ebola_df['Indicator'].value_counts()

In [None]:
# Keep the only two indicators I care about
ebola_df = ebola_df[(ebola_df['Indicator']=='Cumulative number of confirmed, probable and suspected Ebola cases')|
        (ebola_df['Indicator']=='Cumulative number of confirmed, probable and suspected Ebola deaths')]

# Pivot table on values and make new columns as indicators
ebola_df = ebola_df.pivot_table(values='value', index=['Country', 'Date'], columns=['Indicator'])
ebola_df= ebola_df.reset_index()
ebola_df

In [None]:
ebola_df['Country'].value_counts()

In [None]:
# Fix country names
ebola_df['Country'] = ebola_df['Country'].replace({'Liberia 2': 'Liberia', 'Guinea 2': 'Liberia'})

# Rename columns
ebola_df = ebola_df.rename(columns=
                    {'Cumulative number of confirmed, probable and suspected Ebola deaths': 'Cumulative Number of Ebola Deaths',
                    'Cumulative number of confirmed, probable and suspected Ebola cases': 'Cumulative Number of Ebola Cases'})

# Sort by date and country
ebola_df = ebola_df.sort_values(by=['Date', 'Country'])
ebola_df

In [None]:
# Get the last date for each county so we can get final data
last_ebola_data = ebola_df[ebola_df['Date'] == max(ebola_df['Date'])].reset_index()
last_ebola_data


### Ebola Heat Map

In [None]:
# Create and clean lat/lng csv
lat_lng_csv = "../group3/citycoords.csv"
lat_lng_df = pd.read_csv(lat_lng_csv)
lat_lng_df = lat_lng_df.rename(columns={"name_long": "Country"})
# lat_lng_df = lat_lng_df.replace(to_replace=['Ireland'],
#                 value='Republic of Ireland')

countries_lat_lng = pd.merge(last_ebola_data, lat_lng_df, on='Country')

countries_lat_lng = countries_lat_lng[['Country', 'Cumulative Number of Ebola Cases', 'Cumulative Number of Ebola Deaths', 'Longitude', 'Latitude']]

# Access maps with unique API key
gmaps.configure(api_key=g_key)

locations = countries_lat_lng[["Latitude", "Longitude"]]

# Plot Heatmap
fig = gmaps.figure()

# Create heat layer
heat_layer = gmaps.heatmap_layer(locations, weights=countries_lat_lng['Cumulative Number of Ebola Cases'], 
                                 dissipating=False, max_intensity=70,
                                 point_radius=5)

# Add layer
fig.add_layer(heat_layer)

# Display figure
fig

In [None]:
# Get final counts for cases, deaths, and mortality rate
final_ebola_cases = last_ebola_data['Cumulative Number of Ebola Cases'].sum()
final_ebola_deaths = last_ebola_data['Cumulative Number of Ebola Deaths'].sum()
ebola_mortality_rate = final_ebola_deaths/final_ebola_deaths

In [None]:
# Get list of dates sorted
dates = ebola_df['Date'].unique().tolist()
dates.sort()

# Group by dates and sum of cases and deaths for each country on date
grouped_date_ebola = ebola_df.groupby('Date')
total_cases = grouped_date_ebola['Cumulative Number of Ebola Cases'].sum().tolist()
total_deathes = grouped_date_ebola['Cumulative Number of Ebola Deaths'].sum().tolist()

In [None]:
# Get cumulative number of cases and deaths across date
grouped_date_ebola =  pd.DataFrame({'Date': dates,
                                   'Cumulative Number of Ebola Cases': total_cases,
                                   'Cumulative Number of Ebola Deaths': total_deathes})

In [None]:
# Create columns that have the change of cases and deaths over the dates
daily_ebola_cases = [grouped_date_ebola.iloc[0,1]]
daily_ebola_deaths = [grouped_date_ebola.iloc[0,2]]

for x in range(1,len(grouped_date_ebola)):
    daily_case = grouped_date_ebola.iloc[x,1] - grouped_date_ebola.iloc[x-1,1]
    daily_ebola_cases.append(daily_case)
    daily_deaths = grouped_date_ebola.iloc[x,2] - grouped_date_ebola.iloc[x-1,2]
    daily_ebola_deaths.append(daily_deaths)
    
grouped_date_ebola['Change in Number of Ebola Cases'] = daily_ebola_cases
grouped_date_ebola['Change in Number of Ebola Deaths'] = daily_ebola_deaths
grouped_date_ebola

In [None]:
summed_dates = []
for row in grouped_date_ebola['Date']:
    summed_dates.append(matplotlib.dates.date2num(datetime.strptime(row, '%Y-%m-%d')))

cases_over_time, = plt.plot(summed_dates, grouped_date_ebola['Cumulative Number of Ebola Cases'], label="Cases")
deaths_over_time, = plt.plot(summed_dates, grouped_date_ebola['Cumulative Number of Ebola Deaths'], label="Deaths")

ax = plt.gca()
ax.xaxis.set_minor_locator(matplotlib.dates.MonthLocator())
ax.xaxis.set_minor_formatter(matplotlib.dates.DateFormatter('%b'))
ax.xaxis.set_major_locator(matplotlib.dates.YearLocator())
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%Y'))
ax.tick_params(pad=5)
plt.xticks(rotation=90)
plt.setp(ax.xaxis.get_minorticklabels(), rotation=90)

plt.title("Ebola Outbreak Over Time")
plt.xlabel("Date")
plt.ylabel("Cumulative Number of Cases/Deaths")
plt.legend(handles=[cases_over_time, deaths_over_time], loc="best")
plt.grid(which='both')
plt.show()

In [None]:
# Create new column that has month and year so we can compare with flight data later
grouped_date_ebola['Month and Year'] = pd.to_datetime(grouped_date_ebola['Date']).dt.to_period('M')
grouped_ebola = grouped_date_ebola.groupby('Month and Year')
grouped_ebola = grouped_ebola.sum()

In [None]:
# Create new dataframe that has it monthly cases and years
grouped_monthly_ebola =  pd.DataFrame({'Monthly Number of Ebola Cases': grouped_ebola['Change in Number of Ebola Cases'],
                                   'Monthly Number of Ebola Deaths': grouped_ebola['Change in Number of Ebola Deaths']})

grouped_monthly_ebola = grouped_monthly_ebola.reset_index()
# Make new columns for each month and year so we can merge with flight data
grouped_monthly_ebola['Month'] = grouped_monthly_ebola['Month and Year'].dt.month
grouped_monthly_ebola['Year'] = grouped_monthly_ebola['Month and Year'].dt.year

grouped_monthly_ebola

In [None]:
ebola_flight_data = pd.read_csv('RawData/Ebola_Top3Countries_2013-2017.csv')
ebola_flight_data

In [None]:
# Make a Month and Year column
Date = []
for y, m in zip(ebola_flight_data.YEAR, ebola_flight_data.MONTH):
    Date.append(datetime(y, m,1))
ebola_flight_data['Date'] = Date 
ebola_flight_data['Month and Year'] = pd.to_datetime(ebola_flight_data['Date']).dt.to_period('M')

# Get number of flights per month/year
monthly_flights_ebola = ebola_flight_data['Month and Year'].value_counts().rename_axis('Month and Year').reset_index(name='Flights')
monthly_flights_ebola = monthly_flights_ebola.sort_values(by=['Month and Year'])
monthly_flights_ebola


In [None]:
ax = plt.gca()
grouped_monthly_ebola.plot.line(x = 'Month and Year', y='Monthly Number of Ebola Cases',ax=ax)
grouped_monthly_ebola.plot.line(x = 'Month and Year', y='Monthly Number of Ebola Deaths',ax=ax)
monthly_flights_ebola.plot.line(x = 'Month and Year', y='Flights',ax=ax, secondary_y = True)
plt.title(" African International Flights vs Ebola cases/deaths over Time")
ax.set_ylabel('Number of cases/deaths')
ax.right_ax.set_ylabel('Number of flights')
plt.grid()
plt.show()

In [None]:
merged_ebola_flight_data = pd.merge(monthly_flights_ebola, grouped_monthly_ebola, on=['Month and Year'])
merged_ebola_flight_data

In [None]:
# Build a scatter plot
plt.scatter(merged_ebola_flight_data["Monthly Number of Ebola Cases"], merged_ebola_flight_data["Flights"], marker="o", edgecolors='black')

# Incorporate the other graph properties
plt.title("African International Flights vs Ebola Cases")
plt.ylabel("Number of Flights")
plt.xlabel("Number of Ebola Cases")
plt.grid(True)

x_values = merged_ebola_flight_data["Monthly Number of Ebola Cases"]
y_values = merged_ebola_flight_data["Flights"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,6)) + "x + " + str(round(intercept,2))
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(3000,12),fontsize=15,color="red")
print(f"The r-squared is: {rvalue}")
plt.show()


## SARS Data Analysis (Ryan Ashcraft)

In [None]:
# https://www.kaggle.com/imdevskp/sars-outbreak-2003-complete-dataset
# Create a reference the CSV file desired
SARS_csv = "../group3/RawData/sars_2003_complete_dataset_clean.csv"

# Read the CSV into a Pandas DataFrame
SARS_df = pd.read_csv(SARS_csv)

SARS_df

In [None]:
grouped_SARS = SARS_df.groupby('Date')
cumulative_SARS = grouped_SARS.sum()

SARS_summary = pd.DataFrame({
                             "Case Count": cumulative_SARS['Cumulative number of case(s)'],
                             "Death Count": cumulative_SARS['Number of deaths'],
                             "Recovered Count": cumulative_SARS['Number recovered']
                            })

SARS_summary = SARS_summary.reset_index()
SARS_summary

In [None]:
# Combine Chinese Data
SARS_china = SARS_df.replace(to_replace=['Hong Kong SAR, China', 'Macao SAR, China', 'Taiwan, China'],
                value='China')
SARS_final = SARS_china.replace(to_replace=['Viet Nam'],
                value='Vietnam')

country_totals = SARS_final[SARS_final['Date'] == max(SARS_final['Date'])].reset_index()
country_totals

In [None]:
grouped_countries = country_totals.groupby('Country')
grouped_countries = grouped_countries.sum()
grouped_countries = grouped_countries.drop(['index'], axis=1)

country_summary = pd.DataFrame({
                             "Case Count": grouped_countries['Cumulative number of case(s)'],
                             "Death Count": grouped_countries['Number of deaths'],
                             "Recovered Count": grouped_countries['Number recovered'],
                             "Mortality Rate": 100*(grouped_countries['Number of deaths']/grouped_countries['Cumulative number of case(s)'])
                            })

country_summary['Mortality Rate'] = country_summary['Mortality Rate'].map("{:.2f}%".format)

country_summary

In [None]:
summed_dates = []
for row in SARS_summary['Date']:
    summed_dates.append(matplotlib.dates.date2num(datetime.strptime(row, '%Y-%m-%d')))

cases_over_time, = plt.plot(summed_dates, SARS_summary['Case Count'], label="Cases")
deaths_over_time, = plt.plot(summed_dates, SARS_summary['Death Count'], label="Deaths")

# x_axis = dates
# y_axis = SARS_summary['Case Count']

ax = plt.gca()
ax.xaxis.set_minor_locator(matplotlib.dates.MonthLocator())
ax.xaxis.set_minor_formatter(matplotlib.dates.DateFormatter('%b'))
ax.xaxis.set_major_locator(matplotlib.dates.YearLocator())
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%Y'))
ax.tick_params(pad=5)

# ax.plot(x_axis, y_axis)
plt.title("SARS Outbreak Over Time")
plt.xlabel("Months in 2003")
plt.ylabel("Cumulative Number of Cases/Deaths")
plt.legend(handles=[cases_over_time, deaths_over_time], loc="best")
plt.grid(which='both')
plt.savefig('Plots/SARS_Over_Time.png')
plt.show()

### SARS Heatmap

In [None]:
# Create and clean lat/lng csv
lat_lng_csv = "../group3/citycoords.csv"
lat_lng_df = pd.read_csv(lat_lng_csv)
lat_lng_df = lat_lng_df.rename(columns={"name_long": "Country"})
lat_lng_df = lat_lng_df.replace(to_replace=['Ireland'],
                value='Republic of Ireland')

countries_lat_lng = pd.merge(country_summary, lat_lng_df, on='Country')

countries_lat_lng = countries_lat_lng[['Country', 'Case Count', 'Death Count', 'Recovered Count', 'Mortality Rate', 'Longitude', 'Latitude']]

# Access maps with unique API key
gmaps.configure(api_key=g_key)

locations = countries_lat_lng[["Latitude", "Longitude"]]

# Plot Heatmap
fig = gmaps.figure()

# Create heat layer
heat_layer = gmaps.heatmap_layer(locations, weights=countries_lat_lng['Case Count'], 
                                 dissipating=False, max_intensity=70,
                                 point_radius=5)

# Add layer
fig.add_layer(heat_layer)

# Display figure
fig

In [None]:
SARS_cases = country_summary['Case Count'].sum()

SARS_deaths = country_summary['Death Count'].sum()

s_final_summary = pd.DataFrame({'Cases': [SARS_cases],
                              'Deaths': [SARS_deaths]
                            })

s_final_summary['Mortality Rate'] = 100*(s_final_summary['Deaths']/s_final_summary['Cases'])
s_final_summary['Mortality Rate'] = s_final_summary['Mortality Rate'].map("{:.2f}%".format)


s_final_summary

# SARS Flight Data

In [None]:
daily_SARS_cases = [SARS_summary.iloc[0,1]]
daily_SARS_deaths = [SARS_summary.iloc[0,2]]
for x in range(1,len(SARS_summary)):
    daily_case = SARS_summary.iloc[x,1] - SARS_summary.iloc[x-1,1]
    daily_SARS_cases.append(daily_case)
    daily_deaths = SARS_summary.iloc[x,2] - SARS_summary.iloc[x-1,2]
    daily_SARS_deaths.append(daily_deaths)

daily_SARS_cases
daily_SARS_deaths

SARS_summary['Daily Cases'] = daily_SARS_cases
SARS_summary['Daily Deaths'] = daily_SARS_deaths

SARS_summary

In [None]:
SARS_summary['Month and Year'] = pd.to_datetime(SARS_summary['Date']).dt.to_period('M')

SARS_summary

In [None]:
monthly_grouping = SARS_summary.groupby('Month and Year')
monthly_grouping = monthly_grouping.sum()
monthly_grouping

monthly_sars_summary = pd.DataFrame({'Monthly Cases': monthly_grouping['Daily Cases'],
                                     'Monthly Deaths': monthly_grouping['Daily Deaths'],
                                     })

monthly_sars_summary = monthly_sars_summary.reset_index()
monthly_sars_summary['Month'] = monthly_sars_summary['Month and Year'].dt.month
monthly_sars_summary['Year'] = monthly_sars_summary['Month and Year'].dt.year

monthly_sars_summary

In [None]:
sars_flight_df = pd.read_csv('RawData/sars_flight_data_2002-2004.csv')

In [None]:
sars_flight_df

In [None]:
# Drop the last few rows which have null values
sars_flight_df = sars_flight_df.dropna(how='any')
sars_flight_df

In [None]:
# Drop the rows which have totals for each year
sars_flight_df = sars_flight_df[sars_flight_df['PASSENGERS'] != 0]
sars_flight_df

In [None]:
Date = []
for y, m in zip(sars_flight_df.YEAR, sars_flight_df.MONTH):
    Date.append(datetime(y, m, 1))

sars_flight_df['Date'] = Date

sars_flight_df

In [None]:
grouped_sars_flights = sars_flight_df.groupby('Date')
sars_flights_by_date = grouped_sars_flights.count()
sars_flights_by_date = sars_flights_by_date.reset_index()
sars_flights_by_date['MONTH'] = sars_flights_by_date['Date'].dt.month
sars_flights_by_date['YEAR'] = sars_flights_by_date['Date'].dt.year

sars_flights_summary = pd.DataFrame({'Date': sars_flights_by_date['Date'],
                                     'Flights per Month': sars_flights_by_date['PASSENGERS'],
                                     'Month': sars_flights_by_date['MONTH'],
                                     'Year': sars_flights_by_date['YEAR']
                                    })

sars_flights_summary

In [None]:
ax = plt.gca()
sars_flights_summary.plot.line(x ='Date', y='Flights per Month',ax=ax)
plt.title('Flights per Month between the USA and China, Canada, and Singapore')
plt.show()

In [None]:
sars_flight_data = pd.merge(sars_flights_summary, monthly_sars_summary, on=['Month', 'Year'])

sars_flight_data

In [None]:
#run scatter plot and calculate r square
x_values = sars_flight_data['Monthly Cases']
y_values = sars_flight_data['Flights per Month']
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(2000,750),fontsize=15,color="red")
plt.xlabel('Cases per Month')
plt.ylabel('Flights per Month')
plt.title("Flights to Most Impacted Countries vs SARS Cases")
print(f"The r-squared is: {rvalue}")
plt.savefig('Plots/SARS_Flight_Scatter.png')
plt.grid(which='both')
plt.show()


In [None]:
#create average dataframe
avg_flight_df = sars_flights_summary.groupby(['Month']).mean()

#drop Year columns
del avg_flight_df['Year']

avg_flight_df.head()

In [None]:
#do a second merge to add average data frame
flights_w_avg = pd.merge(sars_flight_data, avg_flight_df, on = 'Month', how = "inner", suffixes = ("", "_AVG"))

flights_w_avg

In [None]:
plt.figure(figsize = (10,5))
ax = plt.gca()

flights_w_avg.plot.line(x='Date', y='Monthly Cases', ax=ax, secondary_y = True)
flights_w_avg.plot.line(x='Date', y='Flights per Month', ax=ax, label="Flights per Month during SARS")
flights_w_avg.plot.line(x='Date', y='Flights per Month_AVG', ax=ax, label="Historical Flights per Month", linestyle = '-.')

plt.title("Impact on Flights to and from Most Impacted Countries")
plt.xlabel("Date")
plt.savefig('Plots/Historical_SARS.png')
plt.show()

# H1N1 Data Analysis (Luis Olguin)

In [None]:
# Create a reference the CSV file desired
#Source: http://apps.who.int/flumart/Default?ReportNo=12

flu_file = "../group3/RawData/flu_data.csv"

#read CSV input files
flu_data = pd.read_csv(flu_file, encoding = 'ISO-8859-1')

#convert time values from flu data
flu_data['SDATE'] = pd.to_datetime(flu_data["SDATE"])
flu_data['EDATE'] = pd.to_datetime(flu_data["EDATE"])

#rename columns in flu data
flu_data = flu_data.rename(columns = {'AH1N12009':'H1N1_Cases'})

#shorten time-frame in flu data
trimmed_flu_data = flu_data.loc[(flu_data["EDATE"] >= "2009-05-01") & (flu_data["EDATE"] < "2010-04-01") ]

#remove unecessary columns  in flu data
trimmed_flu_data = trimmed_flu_data[['WHOREGION', 'Country', 'Year', 'Month', 'Week', 'SDATE', 'EDATE', 'H1N1_Cases', 'TITLE' ]]

#preview data
trimmed_flu_data.head()

In [None]:
# replace NaN values with 0's
trimmed_flu_data = trimmed_flu_data.fillna(0)

trimmed_flu_data

# H1N1 Line Plots: Total and By Region

In [None]:
#Code here is used to plot Total Observed Weekly H1N1 Cases:

#group trimmed flu data by EDATE
who_total = trimmed_flu_data.groupby(["EDATE"])

#grab sum
who_vol = who_total["H1N1_Cases"].sum()

#reset index
who_vol.sort_index(inplace = True)

#plot data
plt.figure(figsize = (15,5))
main_plot = plt.plot( who_vol, color = "blue", label = "Cases")
plt.tick_params(axis = 'both', labelsize = 10)

ax = plt.axes()

#set titles and axis labels
plt.title("Observed Weekly H1N1 Cases", fontsize = 20)
plt.ylabel("Number of Cases", fontsize = 15)
plt.xlabel("Date", fontsize = 15)
plt.grid(axis = 'y')
plt.savefig("plots/total_observed_h1n1_cases.png")
plt.show()

In [None]:
#Code here is used to plot Total Observed Weekly H1N1 Cases BY REGION:

#create filtered dataframes
african = trimmed_flu_data.loc[trimmed_flu_data["WHOREGION"] == 'African Region of WHO']
eastern = trimmed_flu_data.loc[trimmed_flu_data["WHOREGION"] == 'Eastern Mediterranean Region of WHO']
european = trimmed_flu_data.loc[trimmed_flu_data["WHOREGION"] == 'European Region of WHO']
americas = trimmed_flu_data.loc[trimmed_flu_data["WHOREGION"] == 'Region of the Americas of WHO']
southeast = trimmed_flu_data.loc[trimmed_flu_data["WHOREGION"] == 'South-East Asia Region of WHO']
western_pacific = trimmed_flu_data.loc[trimmed_flu_data["WHOREGION"] == 'Western Pacific Region of WHO']

#group by EDATE
african_grouped = african.groupby(['EDATE'])
eastern_grouped = eastern.groupby(['EDATE'])
european_grouped = european.groupby(['EDATE'])
americas_grouped = americas.groupby(['EDATE'])
southeast_grouped = southeast.groupby(['EDATE'])
western_pacific_grouped = western_pacific.groupby(['EDATE'])

#grab summed cases
african_vol = african_grouped["H1N1_Cases"].sum()
eastern_vol = eastern_grouped["H1N1_Cases"].sum()
european_vol = european_grouped["H1N1_Cases"].sum()
americas_vol = americas_grouped["H1N1_Cases"].sum()
southeast_vol = southeast_grouped["H1N1_Cases"].sum()
western_pacific_vol = western_pacific_grouped["H1N1_Cases"].sum()

#plot lines
plt.figure(figsize = (15,5))
african_plot = plt.plot(african_vol, color="blue", label = "African Region")
eastern_plot, = plt.plot(eastern_vol , color="gray", label = "Eastern Mediterranean Region")
european_plot, = plt.plot(european_vol , color="green", label = "European Region")
americas_plot, = plt.plot(americas_vol , color="purple", label = "Americas Region")
southeast_plot, = plt.plot(southeast_vol , color="yellow", label = "South-East Asia Region")
western_pacific_plot, = plt.plot(western_pacific_vol , color="red", label = "Western Pacific Region")

#set titles and axis labels
plt.title("Observed Weekly H1N1 Cases By WHO Region", fontsize = 20)
plt.ylabel("Number of Cases", fontsize = 15)
plt.xlabel("Date", fontsize = 15)
plt.legend(loc = "best")
plt.grid(axis = 'y')
plt.savefig("plots/region_observed_h1n1_cases.png")
plt.show()

# H1N1 Flight Data Portion

In [None]:
#create average flight dataframe
avg_flight_df = flight_data_df.groupby(['Month']).mean()

#drop Year columns in average flight df
del avg_flight_df['Year']

#preview flight df
avg_flight_df.head()

In [None]:
#create USA specific df
usa_df = trimmed_flu_data.loc[trimmed_flu_data["Country"] == 'United States of America']

#group by Year and Month
usa_df = usa_df.groupby(['Year', 'Month']).sum()

#remove columns
del usa_df['Week']

#merge usa flight and flu data
merged_df = pd.merge(flight_data_df, usa_df, on = ['Year', 'Month'], how="inner")

#do a second merge to add average flight df
merged_df = pd.merge(merged_df, avg_flight_df, on = 'Month', how = "inner", suffixes = ("", "_AVG"))


merged_df.sort_values(by = ['Date'], inplace = True, ascending = True, axis = 0)

merged_df

In [None]:
#plotting the USA data

plt.figure(figsize = (10,5))
ax = plt.gca()


merged_df.plot.line(x = 'Date', y='DOMESTIC', ax=ax)
merged_df.plot.line(x = 'Date', y = 'DOMESTIC_AVG', ax=ax, linestyle = '-.')

#create a second axis that contains the H1N1 Case data
ax2 = merged_df.plot.line(x = 'Date', y='H1N1_Cases', ax=ax, secondary_y = True, label = 'US H1N1 Cases')

#set titles and labels
plt.title("Domestic Flights vs USA H1N1 Cases")
ax.set_ylabel('# of Domestic Flights')
ax2.set_ylabel('# of H1N1 Cases')

#save and show chart
plt.savefig("plots/line_domestic_flights_vs_h1n1.png")
plt.show()

In [None]:
#run scatter plot and calculate r square
x_values = merged_df['H1N1_Cases']
y_values = merged_df['DOMESTIC']
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(0,675000),fontsize=15,color="red")
plt.xlabel('USA H1N1 Cases')
plt.ylabel('US DOMESTIC Flights')
plt.title("Domestic Flights vs USA H1N1 Cases")
print(f"The r-squared is: {rvalue}")
plt.savefig("plots/scatter_domestic_flights_vs_h1n1.png")
plt.show()

In [None]:
#create International specific df
intl_df = trimmed_flu_data.loc[trimmed_flu_data["Country"] != 'United States of America']

#group by Year and Month
intl_df = intl_df.groupby(['Year', 'Month']).sum()

#remove columns
del intl_df['Week']

#merge International flight and flu data
intl_merged_df = pd.merge(flight_data_df, intl_df, on = ['Year', 'Month'], how="inner")

#do a second merge to add average flight df
intl_merged_df = pd.merge(intl_merged_df, avg_flight_df, on = 'Month', how = "inner", suffixes = ("", "_AVG"))


intl_merged_df.sort_values(by = ['Date'], inplace = True, ascending = True, axis = 0)

intl_merged_df

In [None]:
#plotting the International data

plt.figure(figsize = (10,5))
ax = plt.gca()

intl_merged_df.plot.line(x = 'Date', y='INTERNATIONAL', ax=ax)
intl_merged_df.plot.line(x = 'Date', y = 'INTERNATIONAL_AVG', ax=ax, linestyle = '-.')

#create a second axis that contains the H1N1 Case data
ax3 = intl_merged_df.plot.line(x = 'Date', y='H1N1_Cases', ax=ax, secondary_y = True, label = 'INTL H1N1 Cases')

#set titles and labels
plt.title("International Flights vs International H1N1 Cases")
ax.set_ylabel('# of International Flights')
ax3.set_ylabel('# of H1N1 Cases')

#show chart
plt.savefig("plots/line_intl_flights_vs_h1n1.png")
plt.show()

In [None]:
#run scatter plot and calculate r square
x_values = intl_merged_df['H1N1_Cases']
y_values = intl_merged_df['DOMESTIC']

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(0,675000),fontsize=15,color="red")
plt.xlabel('INTL H1N1 Cases')
plt.ylabel('International Flights')
plt.title("International Flights vs International H1N1 Cases")
print(f"The r-squared is: {rvalue}")
plt.savefig("plots/scatter_intl_flights_vs_h1n1.png")
plt.show()

# H1N1 HeatMap Portion

In [None]:
lat_lng_df = lat_lng_df.replace(to_replace=['United States'],
                value='United States of America')

In [None]:
#create summary dataframe
who_country = flu_data.groupby('Country').sum()
who_summary = pd.DataFrame ({
                "H1N1 Case Count": who_country['H1N1_Cases']
})

who_summary.sort_values(by = ['H1N1 Case Count'], inplace = True, ascending = False, axis = 0)

who_summary = who_summary.reset_index()

max_cases = who_summary["H1N1 Case Count"].max()

h1n1_countries_latlng = pd.merge(who_summary, lat_lng_df, on = 'Country')

h1n1_countries_latlng.head()

who_summary


In [None]:
# Access maps with unique API key
gmaps.configure(api_key=g_key)

locations = h1n1_countries_latlng[["Latitude", "Longitude"]]

# Plot Heatmap
fig = gmaps.figure()

# Create heat layer
heat_layer = gmaps.heatmap_layer(locations, weights=h1n1_countries_latlng['H1N1 Case Count'], 
                                 dissipating=False, max_intensity=max_cases)

# Add layer
fig.add_layer(heat_layer)

fig

In [None]:
#H1N1 Source: https://www.cdc.gov/flu/pandemic-resources/2009-h1n1-pandemic.html

deaths = (s_final_summary['Deaths'], 12469, 2000, final_ebola_deaths)

x = np.arange(4) 
width = 0.35       

plt.bar(x, deaths, width, label='Deaths')

#plt.bar(x + width, cases, width, label='Cases')
plt.xlabel('Epidemics')
plt.ylabel('Number of Deaths')
plt.title('Summary of Epidemic Final Data')
plt.xticks(x + width / 2, ('SARS', 'H1N1', 'MERS', 'Ebola'))
plt.legend(loc='best')
plt.show()


## MERS Analysis (Maria Soto)

### Project MERS dataset
* Source: https://www.kaggle.com/imdevskp
##### Note: data has already been cleaned by author

In [None]:
# Study data files
country_count = "MERS Resources/country_count_latest.csv"
mers_weekly = "MERS Resources/weekly_clean.csv"

# Read the MERS data
mers_ByCountry = pd.read_csv(country_count) # Number of cases by Country
mers_PerWeek = pd.read_csv(mers_weekly) # Number of cases per week by Region

#Display of MERS data by Country
mers_PerWeek

In [None]:
# Number of affected countries
mers_count1 = len(mers_PerWeek["Region"].unique())
print(f"{mers_count1} regions were affected by MERS")

In [None]:
# Sum all confirmed cases
mers_sum1 = mers_PerWeek["New Cases"].sum()
print(f"There are {mers_sum1} confirmed cases")

In [None]:
# Group by Date

# Merging format Year-Week into format of Date (Year-Month_Day)

# Splitting Year and Week columns
YearWeeks = mers_PerWeek.loc[:,"Year":"Week"] 

from datetime import datetime, timedelta, date 

def tofirstdayinisoweek(year, week): 
    ret = datetime.strptime('1-%02d-%04d' % (week, year), '%w-%W-%Y').date() 
    if date(year, 1, 4).isoweekday() > 4: 
        ret -= timedelta(days=7) 
    return ret 

def tomonthisoweek(year, week): 
    ret = datetime.strptime('1-%01d-%04d' % (week, year), '%w-%W-%Y').date() 
    ret = ret.strftime("%Y-%m")
    return ret 

# Creating lists to save the formats
Days = [] 
DayAbrev = []

# This loop will change the format from weeks to months/days and from weeks to months
for i in YearWeeks.index: 
    day = tofirstdayinisoweek(YearWeeks["Year"][i],YearWeeks["Week"][i]) 
    DayAbrev.append(day.strftime("%Y-%b-%d"))
    Days.append(day) 
    
Month = [] 
for i in YearWeeks.index: 
    month = tomonthisoweek(YearWeeks["Year"][i],YearWeeks["Week"][i]) 
    Month.append(month) 

mers_PerWeek['Date'] = Days 
mers_PerWeek['DateAbrev'] = DayAbrev
mers_PerWeek['Year and Month'] = Month 

mers_PerWeek.head()

In [None]:
### Plotting MERS cases vs Time

In [None]:
# Plotting MERS cases over Time
figure(num=None, figsize=(20,6))
ax = plt.gca()

mers_PerWeekTotal.reset_index().plot(kind='line',x='Date',y='New Cases',ax=ax, color='b')
plt.setp(ax.get_xticklabels(), rotation=90, ha="right") #Change orientation of tick labels
ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y')) #Defining format
ax.set_xlabel("Date", fontsize=16)
ax.set_ylabel("Number of cases ", fontsize=14)
ax.set_title("Total MERS Cases vs Time", fontsize=18)
ax.get_legend().remove()
ax.set_xlim(mers_PerWeekTotal.index[0], mers_PerWeekTotal.index[-1])

plt.show()

In [None]:
#Defining x, y variable to plott by Region
saudi_date = mers_PerWeek.loc[mers_PerWeek["Region"]=="Saudi Arabia",["Date"]]
saudi_cases = mers_PerWeek.loc[mers_PerWeek["Region"]=="Saudi Arabia",["New Cases"]]
saudi_month = mers_PerWeek.loc[mers_PerWeek["Region"]=="Saudi Arabia",["Year and Month"]]
saudi_year = mers_PerWeek.loc[mers_PerWeek["Region"]=="Saudi Arabia",["Year"]]
saudi = pd.DataFrame({"Date":saudi_date["Date"],"Year and Month":saudi_month["Year and Month"],"Year":saudi_year["Year"],"Cases":saudi_cases["New Cases"]})

korea_date = mers_PerWeek.loc[mers_PerWeek["Region"]=="Republic of Korea",["Date"]]
korea_cases = mers_PerWeek.loc[mers_PerWeek["Region"]=="Republic of Korea",["New Cases"]]
korea = pd.DataFrame({"Date":korea_date["Date"],"Cases":korea_cases["New Cases"]})

other_date = mers_PerWeek.loc[mers_PerWeek["Region"]=="Other Countries",["Date"]]
other_cases = mers_PerWeek.loc[mers_PerWeek["Region"]=="Other Countries",["New Cases"]]
other = pd.DataFrame({"Date":other_date["Date"],"Cases":other_cases["New Cases"]})

saudi

In [None]:
# Plotting MERS cases over Time
plt.figure(figsize=(20,6))
ax = plt.gca()

saudi.plot.line(x='Date',y='Cases',ax=ax, color='b', secondary_y=True, label="Saudi Arabia")
korea.plot.line(x='Date',y='Cases',ax=ax, color='r', label="Rep. of Korea")
other.plot.line(x='Date',y='Cases',ax=ax, color='k', label="Other Countries")
ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y')) #Defining format
plt.setp(ax.get_xticklabels(), rotation=90, ha="right", fontsize=14) #Change orientation of tick labels
ax.set_xlabel("Date", fontsize=14)
ax.set_ylabel("Number of cases ", fontsize=14)
ax.set_title("MERS Cases By Region vs Time", fontsize=16)
ax.get_legend()
ax.set_ylim(0,)
plt.show()

In [None]:
mers_flight_data = "MERS Resources/MERS_SaudiArabia_2012-2019.csv"

mers_flights = pd.read_csv(mers_flight_data)
mers_flights

In [None]:
#Grouping, counting and creating a new data set with the data to plot Flight per Month
grouped_mers_flights = mers_flights.groupby('YEAR')
mers_flight_date = grouped_mers_flights.count()
mers_flight_date = mers_flight_date.reset_index()

mers_flight_df = pd.DataFrame({"Year":mers_flight_date['YEAR'], "Flights per Month": mers_flight_date["PASSENGERS"]})

mers_flight_df

In [None]:
ax = plt.gca()
mers_flight_df.plot.line(x ='Year', y='Flights per Month',ax=ax, color='b', linewidth=1)
plt.title('Flights between the USA and Saudi Arabia', fontsize=14)
plt.ylabel("Number of Flights", fontsize=13)
plt.xlabel("Date", fontsize=13)
plt.xlim()

plt.show()

In [None]:
saudi_group = saudi.groupby(['Year']).sum()

merged_saudi_mers = pd.merge(saudi_group, mers_flight_df, on = ["Year"], how="inner")
merged_saudi_mers

In [None]:
# Plotting the MERS cases vs Flights between US and Saudi Arabia (per month)
figure(num=None, figsize=(15,6))
ax = plt.gca()

merged_saudi_mers.plot.line(x='Year',y ='Flights per Month',ax=ax, color='green', secondary_y=True, label="Saudi Arabia Flights")
merged_saudi_mers.plot.line(x='Year',y ='Cases',ax=ax, color='b', label="Saudi Arabia MERS Cases")


#Aesthetics
plt.setp(ax.get_xticklabels(), ha="center") #Change orientation of tick labels
#ax.xaxis.set_major_formatter(mdates.DateFormatter('%m-%y')) #Define date format
ax.set_xlabel("Date", fontsize=14)
ax.set_ylabel("Number of cases ", fontsize=14)
ax.set_title("MERS Impact on Flights to and from Saudi Arabia", fontsize=16)
plt.show()

In [None]:
#Defining x and y
figure(num=None, figsize=(7,5))
x_values = merged_saudi_mers["Cases"]
y_values = merged_saudi_mers["Flights per Month"]

# Get slope, intercept from linregress() to plot y' = intercept + slope*x
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
# Plot linear regression line.
y_pred = x_values * slope + intercept
equation = "y =" + str(round(slope,2)) + "x + " + str(round(intercept,2))
r2value = round(rvalue, 6)
print(f"The r-squared is: {rvalue}")
print(f'y = ' + str(round(slope,2)) + 'x + ' + str(round(intercept,2)))

#Plotting the scatter chart and line regression

plt.scatter(x_values,y_values, marker="o", color = "blue")
plt.plot(x_values,y_pred,"r-")

#Aaesthetics
plt.annotate(equation,(400,140),fontsize=13,color="red")
plt.annotate(r2value,(420,130),fontsize=13,color="red")
plt.title("Flights to Saudi Arabia vs MERS Cases", fontsize=14)
plt.xlabel("Number of Cases", fontsize=12)
plt.ylabel("Flights per Month", fontsize=13)

plt.show()

### MERS HeatMap

In [None]:
mers_ByCountry

In [None]:
lat_lng_csv = "citycoords.csv"
lat_lng_df = pd.read_csv(lat_lng_csv)
lat_lng_df = lat_lng_df.rename(columns={"name_long": "Country"})

countries_lat_lng = pd.merge(mers_ByCountry, lat_lng_df, on='Country')

# Access maps with unique API key
gmaps.configure(api_key=g_key)

locations = countries_lat_lng[["Latitude", "Longitude"]]

# Plot Heatmap
fig = gmaps.figure()

# Create heat layer
heat_layer = gmaps.heatmap_layer(locations, weights=countries_lat_lng['Confirmed'], 
                                 dissipating=False, max_intensity=70,
                                 point_radius=5)

# Add layer
fig.add_layer(heat_layer)

# Display figure
fig
