# UEP-0239: Python for Data Analysis and Visualization

---

## Importing Packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas
import plotly.express as px

---

## Getting Started with Pandas

In [None]:
# Load the population table. skiprows=4 skips the first four lines of the
# file, so the fifth line is read as the header row.
pop = pd.read_csv('data/population.csv', skiprows=4)

In [None]:
# Display the DataFrame itself (last expression of the cell).
pop

In [None]:
# List the column labels.
pop.columns

In [None]:
# Show the dtype pandas inferred for each column.
pop.dtypes

In [None]:
# Remove the two indicator columns. inplace=True mutates `pop` and returns
# None, so this cell shows no output.
pop.drop(columns=['Indicator Name', 'Indicator Code'], inplace=True)

In [None]:
# Preview the first rows to confirm the columns are gone.
pop.head()

In [None]:
gdp = (pd.read_csv('data/gdp.csv', skiprows=4)
         .drop(columns=['Indicator Name', 'Indicator Code']))

In [None]:
gdp.head()

---

## Long vs Wide Data

In [None]:
# Reshape from wide to long: the two country columns stay as identifiers,
# every other column label goes into 'year' and its cell value into
# 'population'.
pop_long = pop.melt(id_vars=['Country Name', 'Country Code'],
                    var_name='year',
                    value_name='population')

In [None]:
pop_long

In [None]:
# Check the dtypes before converting 'year'.
pop_long.dtypes

In [None]:
# Cast the year labels to integers.
pop_long['year'] = pop_long.year.astype(int)

In [None]:
# Check the dtypes again after the cast.
pop_long.dtypes

In [None]:
gdp_long = (gdp.melt(id_vars=['Country Name', 'Country Code'],
                    var_name='year',
                    value_name='gdp')
               .astype({'year': int}))

In [None]:
gdp_long

In [None]:
gdp_long.dtypes

---

## Joining Datasets

In [None]:
data = pop_long.merge(gdp_long,
                      on=['Country Name', 'Country Code', 'year'],
                      how='inner')

In [None]:
data.head()

In [None]:
data['gdp_per_capita'] = data.gdp / data.population

In [None]:
data.head()

In [None]:
def read_world_bank_data(file_name, value_name):
    """Load a World Bank indicator CSV and return it in long format.

    Parameters
    ----------
    file_name : str, path object or file-like object
        Passed straight to ``pd.read_csv``. The first four lines are
        skipped, so the fifth line of the file is used as the header row.
    value_name : str
        Name given to the column that holds the indicator values.

    Returns
    -------
    pandas.DataFrame
        One row per country and year with the columns ``'Country Name'``,
        ``'Country Code'``, ``'year'`` (integer) and ``value_name``.

    Raises
    ------
    KeyError
        If the indicator or country columns are missing from the file.
    ValueError
        If a remaining column label cannot be converted to an integer year.
    """
    wide = (pd.read_csv(file_name, skiprows=4)
              .drop(columns=['Indicator Name', 'Indicator Code']))

    # A trailing comma at the end of each row makes pandas create an empty
    # 'Unnamed: N' column. Melting it would yield a non-numeric "year" and
    # make the integer cast below fail, so fully empty unnamed columns are
    # dropped first. Unnamed columns that hold data are left alone so the
    # cast still reports them.
    empty_unnamed = [col for col in wide.columns
                     if str(col).startswith('Unnamed:')
                     and wide[col].isna().all()]

    return (wide.drop(columns=empty_unnamed)
                .melt(id_vars=['Country Name', 'Country Code'],
                      var_name='year',
                      value_name=value_name)
                .astype({'year': int}))

In [None]:
life_exp = read_world_bank_data(file_name = 'data/life-expectancy.csv',
                                value_name = 'life_expectancy')

In [None]:
life_exp.head()

In [None]:
data = data.merge(life_exp,
                  on=['Country Name', 'Country Code', 'year'],
                  how='inner')

In [None]:
data.head()

In [None]:
data.rename(columns={'Country Name': 'country_name',
                     'Country Code': 'country_code'},
            inplace=True)

In [None]:
data.head()

In [None]:
m49 = pd.read_csv('data/m49.csv')

In [None]:
m49.head()

In [None]:
regions = m49[['Region Name', 'ISO-alpha3 Code']].copy()

In [None]:
regions.head()

In [None]:
regions.rename(columns={'Region Name': 'region_name',
                        'ISO-alpha3 Code': 'country_code'},
               inplace=True)

In [None]:
regions.head()

In [None]:
data = data.merge(regions,
                  on='country_code',
                  how='inner')

In [None]:
data.head()

---

## Boolean Indexing

In [None]:
usa_data = data[data.country_code == 'USA']

In [None]:
usa_data.head()

In [None]:
usa_data.country_name.unique()

In [None]:
usa_data.country_name.unique()[0]

---

## Creating Visualizations

In [None]:
# pyplot interface: pass the x and y Series directly.
plt.plot(usa_data.year, usa_data.population)
plt.show()

In [None]:
# pyplot interface: pass column names and let `data=` look them up.
plt.plot('year', 'gdp', data=usa_data)
plt.show()

In [None]:
# pandas interface: DataFrame.plot with column names for x and y.
usa_data.plot(x='year', y='life_expectancy')
plt.show()

In [None]:
# Two countries on one chart. The first call is given no `ax`; the second
# call passes ax=plt.gca() so it draws on the current axes instead of a new
# one.
data[data.country_code == 'USA'].plot(x='year',
                                      y='gdp_per_capita',
                                      color='blue',
                                      label='USA')
data[data.country_code == 'CAN'].plot(x='year',
                                      y='gdp_per_capita',
                                      color='red',
                                      label='Canada',
                                      ax=plt.gca())
plt.ylabel('GDP per capita')
plt.show()

In [None]:
# Two y-axes over a shared x-axis: GDP per capita on `ax`, life expectancy on
# the twin `ax2`. Labels and title are set on the axes objects explicitly.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(usa_data.year, usa_data.gdp_per_capita, 'g--', label='GDP per capita')
ax.set_ylabel('GDP per capita', color='g')
ax.set_xlabel('year')
ax2 = ax.twinx()
ax2.plot(usa_data.year, usa_data.life_expectancy, 'mx', label='life expectancy')
ax2.set_ylabel('life expectancy', color='m')
ax2.set_title('USA', size=20)
# Figure-level legend collects the labelled lines from both axes.
fig.legend()
plt.show()

---

## Recreating Gapminder

In [None]:
data2019 = data[data.year == 2019]

In [None]:
data2019

In [None]:
plt.hist(data2019.gdp_per_capita)
plt.xlabel('GDP')
plt.show()

In [None]:
# seaborn histogram of life expectancy; kde=True requests a kernel density
# estimate curve on top of the bars.
sns.histplot(data2019.life_expectancy, kde=True)
plt.show()

In [None]:
# pandas scatter plot of life expectancy against GDP per capita.
data2019.plot(x='gdp_per_capita', y='life_expectancy', kind='scatter')
plt.show()

In [None]:
# seaborn joint plot of the same two columns, drawn as filled KDE contours.
sns.jointplot(data=data2019,
              x='gdp_per_capita',
              y='life_expectancy',
              kind='kde',
              fill=True)
plt.show()

In [None]:
# Same scatter with matplotlib, using a logarithmic x-axis.
plt.scatter(data2019.gdp_per_capita, data2019.life_expectancy)
plt.xscale('log')
plt.show()

In [None]:
# Bubble chart: marker size `s` is each population divided by the largest
# population, times 5000, so the largest value gets s=5000.
fig, ax = plt.subplots(figsize=(10, 10))
plt.scatter(data2019.gdp_per_capita, data2019.life_expectancy,
            s=data2019.population/data2019.population.max()*5000,
            alpha=0.5)
plt.xscale('log')
plt.show()

In [None]:
# One scatter call per region so each group is drawn as its own series.
# Sizes are still scaled by the maximum over all of data2019, not per group.
# NOTE(review): `label=key` is set but no legend is drawn in this cell.
fig, ax = plt.subplots(figsize=(10, 10))
for key, group in data2019.groupby('region_name'):
    plt.scatter(group.gdp_per_capita, group.life_expectancy,
                s=group.population/data2019.population.max()*5000,
                label=key,
                alpha=0.5)
plt.xscale('log')
plt.show()

In [None]:
# seaborn version: size and hue are taken from columns, marker sizes are
# mapped into the range (10, 5000), and the legend is turned off.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=data2019,
                x='gdp_per_capita',
                y='life_expectancy',
                size='population',
                sizes=(10, 5000),
                hue='region_name',
                alpha=0.5,
                legend=False)
plt.xscale('log')
plt.show()

---

## Interactive Visualizations

In [None]:
# hvplot scatter of life expectancy against GDP per capita with a log x-axis.
# Marker size comes from 'population' and colour from 'region_name'; the
# country name and code are added to the hover tooltip.
# NOTE(review): `scale` is computed from the maximum population so marker
# sizes stay bounded - confirm the exact sizing rule against the hvplot docs.
data2019.hvplot.scatter(x='gdp_per_capita',
                        y='life_expectancy',
                        s='population',
                        c='region_name',
                        scale=1/data2019.population.max()*2000000,
                        hover_cols=['country_name', 'country_code'],
                        alpha=0.5,
                        logx=True,
                        width=650,
                        height=500)

In [None]:
# plotly express version of the same chart. Rows containing any missing
# value are removed with dropna() before plotting.
px.scatter(data_frame=data2019.dropna(),
           x='gdp_per_capita',
           y='life_expectancy',
           size='population',
           color='region_name',
           hover_name='country_name',
           hover_data=['country_code'],
           size_max=40,
           opacity=0.5,
           log_x=True,
           width=650,
           height=600)

---

## Working with Timeseries

In [None]:
mbta = pd.read_csv('data/mbta-gated-entries-2021.csv')

In [None]:
mbta.head()

In [None]:
mbta.dtypes

In [None]:
mbta['time_period'] = mbta.time_period.str.strip('()')

In [None]:
mbta.head()

In [None]:
mbta['timestamp'] = pd.to_datetime(mbta.service_date + ' ' + mbta.time_period)

In [None]:
mbta

In [None]:
mbta.dtypes

In [None]:
mbta = mbta[['timestamp', 'station_name', 'route_or_line', 'gated_entries']].copy()

In [None]:
mbta

In [None]:
mbta.gated_entries.sum()

In [None]:
mbta.gated_entries[mbta.timestamp == '2020-02-24'].sum()

In [None]:
mbta.gated_entries[
    (mbta.timestamp >= '2020-02-01') & (mbta.timestamp < '2020-03-01')].sum()

In [None]:
mbta.gated_entries[mbta.timestamp.dt.month == 2].sum()

---

## Grouping and Aggregating

In [None]:
mbta['date'] = mbta.timestamp.dt.date

In [None]:
mbta_agg = mbta.groupby('date').gated_entries.sum().to_frame().reset_index()

In [None]:
mbta_agg.head()

In [None]:
mbta_agg.gated_entries.max()

In [None]:
mbta_agg.date[mbta_agg.gated_entries == mbta_agg.gated_entries.max()]

In [None]:
mbta_agg.date[mbta_agg.gated_entries == mbta_agg.gated_entries.max()].values[0]

In [None]:
(mbta.groupby('station_name')
     .gated_entries.sum()
     .sort_values()
     .to_frame()
     .reset_index())

In [None]:
(mbta.groupby('route_or_line')
     .gated_entries.sum()
     .sort_values()
     .to_frame()
     .reset_index())

In [None]:
mbta_bydate = (mbta.groupby(['date', 'station_name', 'route_or_line'])
                   .gated_entries.sum()
                   .to_frame()
                   .reset_index())

In [None]:
mbta_bydate