In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

def clean_unemployment_data(file_path):
    """
    Clean and preprocess the unemployment rate dataset

    The first two lines of the file are skipped and the next line is used as
    the header. The first column is renamed to 'Date' and parsed with the
    format '%YM%m' (e.g. '2020M01'); every other column is coerced to a
    numeric dtype, with unparseable entries becoming NaN.

    Parameters:
    file_path (str): Path to the CSV file

    Returns:
    pd.DataFrame: Cleaned DataFrame. Only columns with strictly less than
    50% missing values are kept. If no rows remain after dropping empty
    rows, the (empty) frame is returned with all of its columns.

    Raises:
    ValueError: If a non-missing value in the first column does not match
    the '%YM%m' format.
    """
    # Read the CSV file (pandas already parses the literal 'NA' as NaN)
    df = pd.read_csv(file_path, skiprows=2)

    # Clean column names: remove surrounding whitespace
    df.columns = df.columns.str.strip()
    df = df.rename(columns={df.columns[0]: 'Date'})

    # Remove rows that are completely empty
    df = df.dropna(how='all')

    # Convert the date column. Assign the whole column by name rather than
    # through .iloc: an .iloc assignment writes into the existing object
    # column, which leaves the dtype as object instead of datetime64.
    df['Date'] = pd.to_datetime(df['Date'], format='%YM%m')

    # Convert all value columns to numeric; anything unparseable
    # (including leftover 'NA'-like strings) becomes NaN
    for col in df.columns[1:]:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # With no rows the missing-value ratio is undefined (0 / 0) and would
    # drop every column, so return the empty frame as-is
    if df.empty:
        return df

    # Calculate the percentage of missing values for each column
    missing_percentages = df.isnull().mean() * 100

    # Keep only columns with less than 50% missing values
    # (a column that is exactly 50% missing is removed)
    columns_to_keep = missing_percentages[missing_percentages < 50].index
    return df[columns_to_keep]

def analyze_unemployment_data(df):
    """
    Perform basic analysis on the cleaned unemployment data

    Only numeric columns are analysed; non-numeric columns such as 'Date'
    are ignored so that timestamps are never averaged, subtracted or ranked
    together with unemployment rates.

    Parameters:
    df (pd.DataFrame): Cleaned DataFrame. Rows are taken in their existing
    order: the first row is treated as the earliest observation and the
    last row as the current one. Must contain at least one row.

    Returns:
    dict: Dictionary containing analysis results with the keys
    'basic_stats', 'average_unemployment', 'unemployment_change',
    'highest_current' and 'lowest_current'.
    """
    analysis_results = {}

    # Restrict every computation to the numeric (per-country) columns
    rates = df.select_dtypes(include=[np.number])

    # Basic statistics for each country
    analysis_results['basic_stats'] = rates.describe()

    # Calculate average unemployment rate for each country
    avg_unemployment = rates.mean()
    analysis_results['average_unemployment'] = avg_unemployment.sort_values(ascending=False)

    # Calculate the change in unemployment rate (last row - first row)
    first_values = rates.iloc[0]
    current_rates = rates.iloc[-1]
    unemployment_change = current_rates - first_values
    analysis_results['unemployment_change'] = unemployment_change.sort_values(ascending=False)

    # Find countries with highest and lowest current unemployment rates
    analysis_results['highest_current'] = current_rates.nlargest(5)
    analysis_results['lowest_current'] = current_rates.nsmallest(5)

    return analysis_results

def generate_visualizations(df):
    """
    Generate visualizations for the unemployment data

    Creates three matplotlib figures: a time series of selected major
    economies, a box plot of all unemployment rates, and a heatmap of the
    correlation between the numeric columns. The figures are created on the
    current pyplot state; this function neither shows nor saves them.

    Parameters:
    df (pd.DataFrame): Cleaned DataFrame. Must contain a 'Date' column; all
    other columns are treated as per-country unemployment rates.
    """
    # Set up the plotting style. The bundled seaborn style was renamed to
    # 'seaborn-v0_8' in Matplotlib 3.6 and the old 'seaborn' name was later
    # removed, so use whichever name this Matplotlib version provides.
    for style_name in ('seaborn-v0_8', 'seaborn'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    # 1. Time series plot for selected major economies
    major_economies = ['United States', 'United Kingdom', 'Japan', 'Germany', 'France']
    plt.figure(figsize=(15, 8))
    for country in major_economies:
        # Countries may have been dropped during cleaning; plot only those present
        if country in df.columns:
            plt.plot(df['Date'], df[country], label=country)
    plt.title('Unemployment Rates in Major Economies Over Time')
    plt.xlabel('Year')
    plt.ylabel('Unemployment Rate (%)')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()

    # 2. Box plot for distribution of unemployment rates
    # (all countries are pooled into a single box)
    plt.figure(figsize=(15, 8))
    df_melted = df.melt(id_vars=['Date'], var_name='Country', value_name='Unemployment Rate')
    sns.boxplot(data=df_melted, y='Unemployment Rate')
    plt.title('Distribution of Unemployment Rates Across All Countries')
    plt.xticks(rotation=45)
    plt.tight_layout()

    # 3. Heatmap of correlation between different countries
    plt.figure(figsize=(12, 10))
    correlation_matrix = df.select_dtypes(include=[np.number]).corr()
    sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
    plt.title('Correlation Between Countries\' Unemployment Rates')
    plt.tight_layout()

# Example usage: clean the CSV, print the key findings, then plot.
if __name__ == "__main__":
    # Clean the data
    cleaned_df = clean_unemployment_data('Unemployment_Rate.csv')

    # Perform analysis
    analysis_results = analyze_unemployment_data(cleaned_df)

    # Print key findings
    print("Basic Statistics:")
    print(analysis_results['basic_stats'])

    print("\nTop 5 Countries with Highest Current Unemployment Rates:")
    print(analysis_results['highest_current'])

    print("\nTop 5 Countries with Lowest Current Unemployment Rates:")
    print(analysis_results['lowest_current'])

    # Generate visualizations
    generate_visualizations(cleaned_df)

    # The figures are only created above; display them so that running this
    # file as a script actually shows the plots
    plt.show()

ModuleNotFoundError: No module named 'matplotlib'