# Exercise 1.2 | Numerical Variables

In [None]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Base URL under which the data files for this concept are hosted.
# NOTE(review): no cell visible in this notebook reads `file_path`; the loading
# cells use local relative CSV paths instead — confirm which source is intended.
file_path = 'https://tayweid.github.io/econ-0150/concepts/concept-1-2/data/'

## Discrete Data

### Summary and Visualization

This section summarizes and visualizes the discrete numerical dataset.

### Load the discrete dataset

In [None]:
# Location of the discrete dataset CSV, relative to the working directory.
# Update this path if necessary.
discrete_data_path = "discrete_numerical_dataset.csv"

# Read the CSV into a DataFrame used by the cells that follow.
discrete = pd.read_csv(filepath_or_buffer=discrete_data_path)

### Summarize the dataset

In [None]:
# Frequency table: how many rows report each number of children, ordered by
# the number of children rather than by frequency.
discrete_summary = (
    discrete.loc[:, 'Number of Children']
    .value_counts(sort=False)
    .sort_index()
)
print(discrete_summary)

In [None]:
# Display pandas' default descriptive statistics for the discrete dataset.
# This must stay the last expression in the cell for the notebook to render it.
discrete.describe()

### Visualize the dataset

In [None]:
plt.figure(figsize=(8, 6))

# Build one unit-wide bin centred on every integer from the smallest to the
# largest observed count. Passing the observed values themselves as bin edges
# yields one bin too few, and because the last histogram bin is closed on both
# sides the two largest counts would be merged into a single bar.
# Assumes the column holds integer-valued counts.
children = discrete['Number of Children']
lowest, highest = int(children.min()), int(children.max())
bins = [edge - 0.5 for edge in range(lowest, highest + 2)]
plt.hist(children, bins=bins, edgecolor='white')

plt.title("Distribution of Number of Children")
plt.xlabel("Number of Children")
plt.ylabel("")
# Put a tick under every bar so each one is labelled with its integer count.
plt.xticks(range(lowest, highest + 1), rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Hide the frame around the plot.
for spine in plt.gca().spines.values():
    spine.set_visible(False)

plt.tight_layout()
# NOTE: the 'Figures' directory must already exist; savefig does not create it.
plt.savefig('Figures/Part_1_2_Discrete_Numerical_Exercise')

## Continuous Data

### Summary and Visualization

This section summarizes and visualizes the continuous numerical dataset.

### Load the continuous dataset

In [None]:
# Location of the continuous dataset CSV, relative to the working directory.
# Update this path if necessary.
continuous_data_path = "continuous_numerical_dataset.csv"

# Read the CSV into a DataFrame used by the cells that follow.
continuous = pd.read_csv(filepath_or_buffer=continuous_data_path)

### Summarize the dataset

In [None]:
# Descriptive statistics for the household income column, as computed by
# pandas' `describe`.
continuous_summary = continuous.loc[:, 'Household Income (USD)'].describe()
print(continuous_summary)

### Visualize the dataset - Histogram

In [None]:
# Histogram of household income in 10 equal-width bins.
plt.figure(figsize=(8, 6))
continuous['Household Income (USD)'].plot.hist(
    bins=10,
    color='skyblue',
    edgecolor='black',
)

# Style the axes that the histogram was drawn on.
ax = plt.gca()
ax.set_title("Distribution of Household Income")
ax.set_xlabel("Household Income (USD)")
ax.set_ylabel("Frequency")
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Hide the frame around the plot.
for side in ax.spines:
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.savefig('Figures/Part_1_2_Continuous_Numerical_Exercise')

### Visualize the dataset - Boxplot

In [None]:
plt.figure(figsize=(8, 3))

income = continuous['Household Income (USD)']

# Assigning the data to `x` makes seaborn draw a horizontal box, and `color`
# sets its fill. This replaces passing matplotlib's `vert`/`patch_artist`
# through seaborn, which depends on how a given seaborn version forwards
# keyword arguments and can clash with the values seaborn sets itself.
sns.boxplot(x=income, color='lightgrey')

# Get the quartile values (min, Q1, median, Q3, max)
quartiles = list(income.quantile([0, 0.25, 0.50, 0.75, 1]))

# Place the x ticks exactly at those values, formatted as whole dollars
plt.xticks(quartiles, [f'${x:,.0f}' for x in quartiles])

plt.title("Household Income (USD)")
plt.xlabel('')
plt.ylabel('')
plt.yticks([])

# Make tick marks visible
plt.tick_params(axis='x', width=2, length=8, color='black')

# Hide the frame around the plot.
for spine in plt.gca().spines.values():
    spine.set_visible(False)

plt.tight_layout()

plt.savefig('Figures/Part_1_2_Continuous_Numerical_Boxplot_Exercise')

### Visualize the dataset - Boxplot with horizontal scatter


In [None]:
plt.figure(figsize=(8, 3))

income = continuous['Household Income (USD)']

# Assigning the data to `x` makes seaborn draw a horizontal box, and `color`
# sets its fill. This replaces passing matplotlib's `vert`/`patch_artist`
# through seaborn, which depends on how a given seaborn version forwards
# keyword arguments and can clash with the values seaborn sets itself.
sns.boxplot(x=income, color='lightgrey')

# Overlay every observation on the box's centre line (y = 0); zorder keeps the
# points drawn on top of the box.
plt.scatter(income, [0] * len(continuous),
            alpha=0.5, color='firebrick', zorder=3)

# Get the quartile values (min, Q1, median, Q3, max)
quartiles = list(income.quantile([0, 0.25, 0.50, 0.75, 1]))

# Place the x ticks exactly at those values, formatted as whole dollars
plt.xticks(quartiles, [f'${x:,.0f}' for x in quartiles])

plt.title("Household Income (USD)")
plt.xlabel('')
plt.ylabel('')
plt.yticks([])

# Make tick marks visible
plt.tick_params(axis='x', width=2, length=8, color='black')

# Hide the frame around the plot.
for spine in plt.gca().spines.values():
    spine.set_visible(False)

plt.tight_layout()

plt.savefig('Figures/Part_1_2_Continuous_Numerical_Scatter_Boxplot_Exercise')

### Visualize the dataset - Boxplot with jittered horizontal scatter

In [None]:
# Random vertical offsets so overlapping observations spread out around the
# box's centre line instead of stacking on top of one another.
jitter = np.random.uniform(-0.1, 0.1, len(continuous))

plt.figure(figsize=(8, 3))

income = continuous['Household Income (USD)']

# Assigning the data to `x` makes seaborn draw a horizontal box, and `color`
# sets its fill. This replaces passing matplotlib's `vert`/`patch_artist`
# through seaborn, which depends on how a given seaborn version forwards
# keyword arguments and can clash with the values seaborn sets itself.
sns.boxplot(x=income, color='lightgrey')

# Overlay every observation at its jittered height; zorder keeps the points
# drawn on top of the box.
plt.scatter(income, jitter,
            alpha=0.5, color='firebrick', zorder=3)

# Get the quartile values (min, Q1, median, Q3, max)
quartiles = list(income.quantile([0, 0.25, 0.50, 0.75, 1]))

# Place the x ticks exactly at those values, formatted as whole dollars
plt.xticks(quartiles, [f'${x:,.0f}' for x in quartiles])

plt.title("Household Income (USD)")
plt.xlabel('')
plt.ylabel('')
plt.yticks([])

# Make tick marks visible
plt.tick_params(axis='x', width=2, length=8, color='black')

# Hide the frame around the plot.
for spine in plt.gca().spines.values():
    spine.set_visible(False)

plt.tight_layout()

plt.savefig('Figures/Part_1_2_Continuous_Numerical_Jitter_Boxplot_Exercise')