In [1]:
import pandas as pd
import numpy as np

# Read the CSV file
def clean_unemployment_data():
    print("Starting data cleaning process...")
    
    # Read the data with the first column as index
    df = pd.read_csv('Unemployment_Rate01.csv', index_col=0)
    
    print(f"Original data shape: {df.shape}")
    
    # Step 1: Clean column names
    # Remove any leading/trailing whitespace from column names
    df.columns = df.columns.str.strip()
    
    # Step 2: Handle missing values
    # Replace various forms of missing values with NaN
    df = df.replace(['', 'NA', 'NaN', 'nan'], np.nan)
    
    # Step 3: Convert all data to numeric
    # Convert all columns to numeric, forcing errors to NaN
    for column in df.columns:
        df[column] = pd.to_numeric(df[column], errors='coerce')
    
    print(f"\nMissing values before interpolation:")
    print(df.isnull().sum().sum())
    
    # Step 4: Interpolate missing values
    # Forward fill followed by backward fill for any remaining NaN
    df = df.interpolate(method='linear', axis=0).ffill().bfill()
    
    print(f"\nMissing values after interpolation:")
    print(df.isnull().sum().sum())
    
    # Step 5: Basic statistics for verification
    print("\nBasic statistics after cleaning:")
    print(df.describe().round(2))
    
    # Step 6: Save cleaned data
    cleaned_file_name = 'unemployment_rate_cleaned.csv'
    df.to_csv(cleaned_file_name)
    print(f"\nCleaned data saved to {cleaned_file_name}")
    
    return df

# Run the cleaning function
cleaned_data = clean_unemployment_data()

# Display first few rows of cleaned data
print("\nFirst few rows of cleaned data:")
print(cleaned_data.head())

# Generate some basic insights
print("\nBasic insights about the data:")
print(f"Number of countries: {cleaned_data.shape[1]}")
print(f"Time period: {cleaned_data.index[0]} to {cleaned_data.index[-1]}")

# Calculate average unemployment rates by year
yearly_avg = cleaned_data.mean(axis=1)
print("\nAverage unemployment rates by year (first 5 years):")
print(yearly_avg.head())

# Find countries with highest and lowest average unemployment
country_avg = cleaned_data.mean()
print("\nTop 5 countries with highest average unemployment:")
print(country_avg.nlargest(5))
print("\nTop 5 countries with lowest average unemployment:")
print(country_avg.nsmallest(5))

Starting data cleaning process...
Original data shape: (31, 78)

Missing values before interpolation:
565

Missing values after interpolation:
31

Basic statistics after cleaning:
       Advanced Economies  Argentina  Australia  Austria  Belgium  Bulgaria  \
count               31.00      31.00      31.00    31.00    31.00     31.00   
mean                 6.52      12.82       5.84     7.21     7.62     10.16   
std                  1.12       6.63       1.36     0.97     1.34      3.73   
min                  4.56       6.13       3.69     5.80     5.37      4.50   
25%                  5.84       7.51       5.11     6.62     6.73      7.28   
50%                  6.53       9.21       5.59     7.01     7.85     10.06   
75%                  7.34      22.43       6.41     7.48     8.50     11.83   
max                  8.45      22.43       8.52    10.00     9.67     18.13   

       Bahrain  Belarus  Brazil  Canada  ...  Thailand  Tunisia  Turkey  \
count    31.00    31.00   31.00  

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

# First, let's load our cleaned data
df = pd.read_csv('unemployment_rate_cleaned.csv', index_col=0)

# Convert index to datetime for better time series plotting
df.index = pd.to_datetime(df.index, format='%Y')

def create_unemployment_visualizations():
    # 1. Time Series Plot for Selected Major Economies
    plt.figure(figsize=(15, 8))
    major_economies = ['United States', 'United Kingdom', 'Germany', 'Japan', 'Australia']
    
    for country in major_economies:
        if country in df.columns:
            plt.plot(df.index.year, df[country], label=country, linewidth=2, marker='o', markersize=4)
    
    plt.title('Unemployment Rates in Major Economies (1995-2024)', fontsize=14)
    plt.xlabel('Year', fontsize=12)
    plt.ylabel('Unemployment Rate (%)', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(fontsize=10)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('unemployment_major_economies.png')
    plt.close()

    # 2. Heatmap of Unemployment Rates Over Time
    # Select every 5th year to make the heatmap more readable
    years_to_plot = df.index[::5].year
    selected_data = df.loc[df.index.year.isin(years_to_plot)]
    
    plt.figure(figsize=(15, 10))
    sns.heatmap(selected_data.T, cmap='YlOrRd', center=df.mean().mean(),
                xticklabels=years_to_plot, yticklabels=True)
    plt.title('Unemployment Rates Heatmap (5-year intervals)', fontsize=14)
    plt.xlabel('Year', fontsize=12)
    plt.ylabel('Country', fontsize=12)
    plt.tight_layout()
    plt.savefig('unemployment_heatmap.png')
    plt.close()

    # 3. Box Plot Distribution by Decade
    plt.figure(figsize=(12, 6))
    decades = []
    decade_data = []
    
    for decade_start in range(1990, 2030, 10):
        mask = (df.index.year >= decade_start) & (df.index.year < decade_start + 10)
        if mask.any():
            decade_data.append(df[mask].values.flatten())
            decades.append(f'{decade_start}s')
    
    plt.boxplot(decade_data, labels=decades)
    plt.title('Distribution of Unemployment Rates by Decade', fontsize=14)
    plt.ylabel('Unemployment Rate (%)', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('unemployment_distribution_by_decade.png')
    plt.close()

    # 4. Interactive Time Series with Plotly
    fig = go.Figure()
    
    for country in major_economies:
        if country in df.columns:
            fig.add_trace(go.Scatter(
                x=df.index,
                y=df[country],
                name=country,
                mode='lines+markers'
            ))
    
    fig.update_layout(
        title='Interactive Unemployment Rates Timeline',
        xaxis_title='Year',
        yaxis_title='Unemployment Rate (%)',
        hovermode='x unified'
    )
    fig.write_html('interactive_unemployment.html')

    # 5. Statistical Summary
    summary_stats = pd.DataFrame({
        'Mean': df.mean(),
        'Median': df.median(),
        'Std Dev': df.std(),
        'Min': df.min(),
        'Max': df.max()
    }).round(2)
    
    summary_stats.to_csv('unemployment_statistics.csv')

    return summary_stats

# Create visualizations and get summary statistics
summary_stats = create_unemployment_visualizations()

# Print some insights from the visualizations
print("\nKey Insights from the Unemployment Data Analysis:")
print("-" * 50)
print(f"Average unemployment rate across all countries: {df.mean().mean():.2f}%")
print(f"Country with highest average unemployment: {summary_stats['Mean'].idxmax()}")
print(f"Country with lowest average unemployment: {summary_stats['Mean'].idxmin()}")
print(f"Most volatile country (highest std dev): {summary_stats['Std Dev'].idxmax()}")


Key Insights from the Unemployment Data Analysis:
--------------------------------------------------
Average unemployment rate across all countries: 8.34%
Country with highest average unemployment: North Macedonia
Country with lowest average unemployment: Thailand
Most volatile country (highest std dev): North Macedonia


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Read the cleaned data
df = pd.read_csv('unemployment_rate_cleaned.csv', index_col=0)
df.index = pd.to_datetime(df.index, format='%Y')

# Define regions dictionary first - this was missing in the original code
regions = {
    'Europe': ['Germany', 'France', 'United Kingdom', 'Italy', 'Spain'],
    'Asia': ['Japan', 'China', 'Korea, Rep.', 'Singapore'],
    'Americas': ['United States', 'Canada', 'Brazil', 'Mexico'],
    'Oceania': ['Australia', 'New Zealand']
}

# Create visualizations
def create_unemployment_analysis():
    # 1. Major Economies Trend Analysis
    fig1 = go.Figure()
    major_economies = ['United States', 'United Kingdom', 'Germany', 'Japan', 'Australia']
    
    for country in major_economies:
        if country in df.columns:
            fig1.add_trace(go.Scatter(
                x=df.index,
                y=df[country],
                name=country,
                mode='lines+markers'
            ))
    
    fig1.update_layout(
        title='Unemployment Trends in Major Economies (1995-2024)',
        xaxis_title='Year',
        yaxis_title='Unemployment Rate (%)',
        template='plotly_white'
    )
    fig1.write_html('unemployment_trends.html')

    # 2. Regional Comparison Box Plot
    fig2 = go.Figure()
    for region, countries in regions.items():
        valid_countries = [c for c in countries if c in df.columns]
        if valid_countries:  # Only process region if we have data for its countries
            data = df[valid_countries].values.flatten()
            fig2.add_trace(go.Box(
                y=data,
                name=region,
                boxpoints='outliers'
            ))
    
    fig2.update_layout(
        title='Unemployment Distribution by Region',
        yaxis_title='Unemployment Rate (%)',
        template='plotly_white'
    )
    fig2.write_html('regional_comparison.html')

    # 3. Yearly Average Heatmap
    yearly_avg = pd.DataFrame()
    for region, countries in regions.items():
        valid_countries = [c for c in countries if c in df.columns]
        if valid_countries:
            yearly_avg[region] = df[valid_countries].mean(axis=1)
    
    fig3 = px.imshow(
        yearly_avg.T,
        aspect='auto',
        title='Regional Average Unemployment Rates Over Time',
        labels={'x': 'Year', 'y': 'Region', 'color': 'Unemployment Rate (%)'}
    )
    fig3.write_html('regional_heatmap.html')

    # Print insights from the analysis
    print("\nKey Insights from Unemployment Analysis:")
    print("-" * 50)
    
    # Major Economies Analysis
    major_avgs = df[major_economies].mean()
    print(f"\n1. Major Economies Trends:")
    print(f"- Highest average unemployment: {major_avgs.idxmax()} ({major_avgs.max():.2f}%)")
    print(f"- Lowest average unemployment: {major_avgs.idxmin()} ({major_avgs.min():.2f}%)")
    
    # Regional Analysis
    print("\n2. Regional Patterns:")
    for region, countries in regions.items():
        valid_countries = [c for c in countries if c in df.columns]
        if valid_countries:
            avg = df[valid_countries].mean().mean()
            print(f"- {region} average unemployment: {avg:.2f}%")

# Run the analysis
create_unemployment_analysis()


Key Insights from Unemployment Analysis:
--------------------------------------------------

1. Major Economies Trends:
- Highest average unemployment: Germany (6.03%)
- Lowest average unemployment: Japan (3.79%)

2. Regional Patterns:
- Europe average unemployment: 9.07%
- Asia average unemployment: 3.28%
- Americas average unemployment: 6.30%
- Oceania average unemployment: 5.51%
