In [3]:
import pandas as pd
import plotly.express as px
import numpy as np
import re # Import Regular Expression library for advanced cleaning

# ====================================================================
# 1. DATA INGRESS AND CLEANSING (START OF NOTEBOOK)
# ====================================================================

# Function to clean and convert complex monetary strings (e.g., '10B', '$50M') to numbers
def clean_monetary_value(value):
    if pd.isna(value):
        return np.nan

    # Ensure value is a string for cleaning operations
    if not isinstance(value, str):
        return pd.to_numeric(value, errors='coerce')

    # Remove commas, dollar signs, and convert to lowercase
    value = value.replace(',', '').replace('$', '').lower()

    scale_factor = 1.0
    if 'b' in value:
        scale_factor = 1_000_000_000
        value = value.replace('b', '')
    elif 'm' in value:
        scale_factor = 1_000_000
        value = value.replace('m', '')

    try:
        return float(value) * scale_factor
    except ValueError:
        return np.nan

try:
    # Use the file name visible in your screenshot as the primary loading path
    df_financial = pd.read_csv('Largest companies in world.csv')
except FileNotFoundError:
    print("Error: 'Largest companies in world.csv' not found. Please verify the file name and path.")
    exit()

# Rename the columns based on your header image
df_financial.columns = ['Rank', 'Organization', 'Country', 'Revenue', 'Profits', 'Assets', 'MarketValue']

# Data Quality Assessment and Advanced Type Conversion
monetary_cols = ['Revenue', 'Profits', 'Assets', 'MarketValue']
for col in monetary_cols:
    # APPLY THE NEW CLEANING FUNCTION
    df_financial[col] = df_financial[col].apply(clean_monetary_value)

# Drop rows where critical financial metrics are still NaN after cleaning
df_financial.dropna(subset=monetary_cols, inplace=True)

# CRITICAL CHECK: Ensure the DataFrame is not empty after cleansing
if df_financial.empty:
    print("Error: DataFrame is empty after data cleaning. The cleaning function may need adjustment for your specific data format.")
    exit()

# ====================================================================
# 2. FEATURE ENGINEERING: KEY PERFORMANCE INDICATORS (KPIs)
# ====================================================================

# Calculate core financial ratios and performance indicators
df_financial['NetProfitMargin'] = (df_financial['Profits'] / df_financial['Revenue']) * 100
df_financial['AssetTurnoverRatio'] = df_financial['Revenue'] / df_financial['Assets']
df_financial['MV_to_Revenue'] = df_financial['MarketValue'] / df_financial['Revenue']

# ====================================================================
# 3. VISUALIZATION AND COMPARATIVE ANALYSIS
# ====================================================================

## 📊 3.1 Comparative Performance: Top 20 by Net Profit Margin
top_margin = df_financial.sort_values(by='NetProfitMargin', ascending=False).head(20)

fig1 = px.bar(
    top_margin,
    x='Organization',
    y='NetProfitMargin',
    color='NetProfitMargin',
    title='Top 20 Corporate Entities by Net Profit Margin (%)',
    labels={'NetProfitMargin': 'Net Profit Margin (%)'},
    color_continuous_scale=px.colors.sequential.Teal
)
fig1.update_layout(xaxis_title="Corporate Entity", yaxis_title="Profitability Ratio (%)")
fig1.update_xaxes(tickangle=45)
print("\n--- Figure 1: Top Entities by Profit Margin ---")
fig1.show()


## 📈 3.2 Scatter Plot for Correlation Analysis: Market Value vs. Revenue
df_financial['Log_Revenue'] = np.log1p(df_financial['Revenue'])
df_financial['Log_MarketValue'] = np.log1p(df_financial['MarketValue'])

fig2 = px.scatter(
    df_financial,
    x='Log_Revenue',
    y='Log_MarketValue',
    color='Country',
    hover_data=['Organization', 'Revenue', 'MarketValue'],
    title='Bivariate Analysis: Correlation between Corporate Revenue and Market Capitalization (Log Transformed)',
)
fig2.update_layout(
    xaxis_title=r"Log($1 + Revenue$)",
    yaxis_title=r"Log($1 + Market Value$)"
)
print("\n--- Figure 2: Revenue vs. Market Value Correlation ---")
fig2.show()


## 🌎 3.3 Geospatial Analysis: Asset Concentration by Country
country_assets = df_financial.groupby('Country')['Assets'].sum().reset_index()

fig3 = px.choropleth(
    country_assets,
    locations='Country',
    locationmode='country names',
    color='Assets',
    hover_name='Country',
    color_continuous_scale=px.colors.sequential.Reds,
    title='Geospatial Distribution of Aggregate Corporate Assets'
)

fig3.update_layout(
    title_text='Global Asset Concentration (Choropleth)',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular'
    )
)
print("\n--- Figure 3: Global Asset Concentration Choropleth ---")
fig3.show()


--- Figure 1: Top Entities by Profit Margin ---



invalid value encountered in log1p




--- Figure 2: Revenue vs. Market Value Correlation ---



--- Figure 3: Global Asset Concentration Choropleth ---
