# Exercise 1.3 | Data Structures

In [None]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Base URL of the course data folder; dataset filenames are appended to it.
file_path = 'https://tayweid.github.io/econ-0150/concepts/concept-1-3/data/'

# Alias of the same base URL, so that any cell referring to it as
# `exercises_url` also resolves on a fresh kernel.
exercises_url = file_path

## Exercise: Households

What's the data structure type?

In [None]:
# Load the household dataset from the course data folder.
# `file_path` is the base URL defined in the setup cell at the top.
household = pd.read_csv(file_path + 'household_size.csv')

# Preview the first rows
household.head()

#### Summarize the data

In [None]:
# Descriptive statistics for the household income column
income_col = 'Household Income (USD)'
household[income_col].describe()

#### Histogram

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))

# Histogram of household income, split into 10 bins
sns.histplot(data=household['Household Income (USD)'], bins=10, ax=ax)

# Title and axis labels
ax.set_title("Household Income")
ax.set_xlabel("Household Income")
ax.set_ylabel("Count")

# Drop the top/right spines; offset and trim the ones that remain
sns.despine(top=True, right=True, left=False, bottom=False, offset=True, trim=True)

#plt.savefig('i/Part_1_3_Time_Series_Histogram')

#### Stripplot and boxplot

In [None]:
plt.figure(figsize=(8, 2))

# Create a stripplot of the individual observations.
# zorder=3 keeps the points above the boxplot drawn next; with the default
# draw order the box is painted over the points that fall inside it.
sns.stripplot(household['Household Income (USD)'], orient="h", color='firebrick', zorder=3)

# Create a boxplot of the same column, underneath the points
sns.boxplot(household['Household Income (USD)'], orient="h", color='lightgrey')

# Add the axis label
plt.xlabel("Household Income")

# Make it pretty (the left spine is removed too: the y-axis carries no information here)
sns.despine(top=True, right=True, left=True, bottom=False, offset=True, trim=True)

#plt.savefig('i/Part_1_3_Time_Series_Box_Scatter')

## Exercise: Yearly GDP

What's the data structure type?

In [None]:
# Load the GDP dataset from the course data folder.
# `file_path` is the base URL defined in the setup cell at the top.
gdp = pd.read_csv(file_path + 'yearly_gdp.csv')

# Preview the first rows
gdp.head()

#### Summary Statistics

In [None]:
# Descriptive statistics for the GDP column
gdp_col = 'GDP (Million USD)'
gdp[gdp_col].describe()

#### Histogram

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))

# Histogram of GDP, split into 10 bins
sns.histplot(data=gdp['GDP (Million USD)'], bins=10, ax=ax)

# Title and axis labels
ax.set_title("Histogram of GDP")
ax.set_xlabel("GDP (Million USD)")
ax.set_ylabel("Count")

# Drop the top/right spines; offset and trim the ones that remain
sns.despine(top=True, right=True, left=False, bottom=False, offset=True, trim=True)

#plt.savefig('i/Part_1_3_Time_Series_Histogram')

#### Stripplot and boxplot

In [None]:
plt.figure(figsize=(8, 2))

# Create a stripplot of the individual observations.
# `color` is seaborn's documented way to set the point colour;
# zorder=3 keeps the points above the boxplot drawn next.
sns.stripplot(gdp['GDP (Million USD)'], orient="h", alpha=0.5, color='firebrick', zorder=3)

# Create a boxplot of the same column, underneath the points
sns.boxplot(gdp['GDP (Million USD)'], orient="h", boxprops=dict(facecolor='lightgrey'))

# Add title and labels (this figure is a strip/box plot, not a histogram)
plt.title("Distribution of GDP")
plt.xlabel("GDP (Million USD)")

# Get the quartile values: minimum, Q1, median, Q3, maximum
quartiles = list(gdp['GDP (Million USD)'].quantile([0, 0.25, 0.50, 0.75, 1]))

# Put the x ticks exactly on those five values, formatted with thousands separators
plt.xticks(quartiles, [f'${x:,.0f}' for x in quartiles])
# Hide the y ticks
plt.yticks([])

# Make it pretty
sns.despine(top=True, right=True, left=True, bottom=False, offset=True, trim=True)

#plt.savefig('i/Part_1_3_Time_Series_Box_Scatter')

#### Line plot

In [None]:
fig, ax = plt.subplots(figsize=(10, 4))

# Parse the YYYY-MM-DD strings into datetimes so matplotlib draws a real time axis
gdp['Month-Year'] = pd.to_datetime(gdp['Month-Year'], format='%Y-%m-%d')

# GDP over time
ax.plot(gdp['Month-Year'], gdp['GDP (Million USD)'])

# Title and axis labels (the x label is left blank)
ax.set_title("Time-Series of GDP")
ax.set_xlabel('')
ax.set_ylabel("GDP (Million USD)")

# Drop the top/right spines; offset and trim the ones that remain
sns.despine(top=True, right=True, left=False, bottom=False, offset=True, trim=True)

In [None]:
plt.figure(figsize=(10, 4))

# Parse the dates here as well so this cell runs on its own
gdp['Month-Year'] = pd.to_datetime(gdp['Month-Year'], format='%Y-%m-%d')

# Integer year rather than a formatted string: a numeric x-axis gets sensible,
# evenly spaced ticks, whereas string years become one categorical label each.
gdp['Year'] = gdp['Month-Year'].dt.year

# GDP by year
sns.lineplot(data=gdp, x='Year', y='GDP (Million USD)')

# Title and axis labels (the x label is left blank)
plt.title("Time-Series of GDP")
plt.xlabel('')
plt.ylabel("GDP (Million USD)")

# Make it pretty
sns.despine(top=True, right=True, left=False, bottom=False, offset=True, trim=True)

#plt.savefig('Figures/Part_1_3_Time_Series_Line')

## Exercise: Household Income

What's the data structure type?

In [None]:
# Load the household income dataset from the course data folder.
# `file_path` is the base URL defined in the setup cell at the top.
income = pd.read_csv(file_path + 'household_income.csv')

# Preview the first rows
income.head()

#### Summarize the dataset

In [None]:
# Descriptive statistics of income, computed separately for each household
income_by_household = income.groupby('Household ID')['Income (USD)']
income_by_household.describe()

#### Lineplot by household

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# One line per household, coloured by household ID
sns.lineplot(data=income, x='Year', y='Income (USD)', hue='Household ID', ax=ax)

# Title and axis labels
ax.set_title("Panel Data: Household Income Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Income (USD)")

# Drop the top/right spines; offset and trim the ones that remain
sns.despine(top=True, right=True, left=False, bottom=False, offset=True, trim=True)

# Save it
#plt.savefig('i/Part_1_3_Panel_Line')

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Background layer: every household drawn in the same light grey
household_ids = income['Household ID'].unique()
grey_palette = ['lightgrey'] * len(household_ids)
sns.lineplot(data=income, x='Year', y='Income (USD)', hue='Household ID', palette=grey_palette, ax=ax)

# Foreground layer: the selected household drawn in red on top
highlight_household = 'H020'
is_highlighted = income['Household ID'] == highlight_household
highlight_data = income[is_highlighted]
sns.lineplot(data=highlight_data, x='Year', y='Income (USD)', hue='Household ID', palette=['firebrick'], ax=ax)

# Title and axis labels
ax.set_title("Panel Data: Household Income Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Income (USD)")

# Drop the top/right spines; offset and trim the ones that remain
sns.despine(top=True, right=True, left=False, bottom=False, offset=True, trim=True)

# Save it
#plt.savefig('i/Part_1_3_Panel_Line_Highlight')

#### Boxplot by household

In [None]:
plt.figure(figsize=(10, 6))

# Fix the category order once and pass it to both boxplot calls, so the
# highlighted box is drawn on its own household's row and not on the first row.
household_order = list(income['Household ID'].unique())

# Boxplots: one grey box per household (the panel data lives in `income`)
sns.boxplot(data=income, y='Household ID', x='Income (USD)', color='lightgrey', order=household_order)

# Redraw the selected household in red on top of its grey box
highlight_household = 'H020'
highlight_data = income[income['Household ID'] == highlight_household]
sns.boxplot(data=highlight_data, y='Household ID', x='Income (USD)', color='firebrick', order=household_order)

# Title and labels
plt.ylabel("Household")
plt.xlabel("Income (USD)")

# Make it pretty
plt.grid(axis='x', linestyle='--', alpha=0.7)
sns.despine(top=True, right=True, left=False, bottom=False, offset=True, trim=True)

# Save it
#plt.savefig('i/Part_1_3_Panel_Boxplot_Household')