### 1. 匯入必要套件

In [1]:
import pandas as pd
import sqlite3
from pandasql import sqldf

### 2. 載入資料

In [None]:
# 讀取 CSV 檔案
df = pd.read_csv('data/df_combined.csv')

# 顯示前幾行
df.head()

Unnamed: 0,Category,Category_Type,Education,Salary,Real Salary,Country,Population
0,All Employees,Total,Junior High,875.0,2049.780381,Taiwan,
1,Male,Gender,Junior High,875.0,2049.780381,Taiwan,
2,Female,Gender,Junior High,906.25,2122.986823,Taiwan,
3,Small & Medium Enterprises,Enterprise Size,Junior High,875.0,2049.780381,Taiwan,
4,Large Enterprises & Others,Enterprise Size,Junior High,937.5,2196.193265,Taiwan,


### 3. Calculate All Employees Salary by Country and Education

This query computes the weighted average salary for all employees across different education levels. For Japan, we use population-weighted averages; for Taiwan, we use the original values.

In [33]:
# 建立 SQLite 資料庫連線
conn = sqlite3.connect(':memory:')

# 將 DataFrame 匯入資料庫
df.to_sql('wage_data', conn, index=False, if_exists='replace')


234

In [39]:
query = """
SELECT 
    ae.Country,
    ae.Education,
    ind.min_salary,
    ind.max_salary,
    ae.salary,
    ae.real_salary,
    ae.Population
FROM (
    SELECT 
        Country,
        Education,
        CASE 
            WHEN Population IS NOT NULL AND Population > 0 
            THEN SUM(Salary * Population) / SUM(Population)
            ELSE Salary
        END as salary,
        CASE 
            WHEN Population IS NOT NULL AND Population > 0 
            THEN SUM("Real Salary" * Population) / SUM(Population)
            ELSE "Real Salary"
        END as real_salary,
        SUM(Population) as Population
    FROM wage_data
    WHERE Category = 'All Employees'
    GROUP BY Country, Education
) ae
LEFT JOIN (
    SELECT 
        Country,
        Education,
        MIN(Salary) as min_salary,
        MAX(Salary) as max_salary
    FROM wage_data
    WHERE Category_Type = 'Industry'
    GROUP BY Country, Education
) ind ON ae.Country = ind.Country AND ae.Education = ind.Education
ORDER BY ae.Country,
    CASE ae.Education
        WHEN 'Junior High' THEN 1
        WHEN 'Senior High' THEN 2
        WHEN 'Junior College' THEN 3
        WHEN 'University' THEN 4
        WHEN 'Graduate School' THEN 5
        ELSE 6
    END
"""

all_employees = pd.read_sql_query(query, conn)
all_employees

Unnamed: 0,Country,Education,min_salary,max_salary,salary,real_salary,Population
0,Japan,Junior High,1195.862069,1749.655172,1522.068966,2367.771698,400.0
1,Japan,Senior High,1268.965517,1412.413793,1362.068966,2118.871366,107730.0
2,Japan,Junior College,1330.344828,1831.724138,1538.810605,2393.815446,115260.0
3,Japan,University,1513.103448,2226.896552,1712.413793,2663.877266,292210.0
4,Japan,Graduate School,1644.137931,2297.241379,1982.068966,3083.360154,31170.0
5,Taiwan,Junior High,875.0,906.25,875.0,2049.780381,
6,Taiwan,Senior High,875.0,1062.5,906.25,2122.986823,
7,Taiwan,Junior College,937.5,1187.5,1000.0,2342.606149,
8,Taiwan,University,968.75,1343.75,1062.5,2489.019034,
9,Taiwan,Graduate School,1031.25,1843.75,1625.0,3806.734993,


### 4. Industries with Salaries Above All Employees Average

This query identifies industries where salaries exceed the average for all employees at the same education level and country.

In [35]:
query = """
SELECT 
    i.Country, 
    i.Education, 
    i.Category, 
    i.Salary,
    ae.salary as all_employees_salary
FROM wage_data i
JOIN (
    SELECT 
        Country,
        Education,
        CASE 
            WHEN Population IS NOT NULL AND Population > 0 
            THEN SUM(Salary * Population) / SUM(Population)
            ELSE Salary
        END as salary
    FROM wage_data
    WHERE Category = 'All Employees'
    GROUP BY Country, Education
) ae ON i.Country = ae.Country AND i.Education = ae.Education
WHERE i.Category_Type = 'Industry' 
    AND i.Salary > ae.salary
ORDER BY i.Country, i.Education, i.Salary DESC
"""
above_average = pd.read_sql_query(query, conn)
above_average

Unnamed: 0,Country,Education,Category,Salary,all_employees_salary
0,Japan,Graduate School,Mining & Quarrying,2297.241379,1982.068966
1,Japan,Graduate School,Finance & Insurance,2289.655172,1982.068966
2,Japan,Graduate School,Education,2184.137931,1982.068966
3,Japan,Graduate School,Real Estate,2176.551724,1982.068966
4,Japan,Graduate School,"Publishing, Audio-Visual & Information Communi...",2124.137931,1982.068966
5,Japan,Graduate School,Construction,2048.275862,1982.068966
6,Japan,Graduate School,Combined Services,2033.103448,1982.068966
7,Japan,Graduate School,Wholesale & Retail Trade,1982.758621,1982.068966
8,Japan,Junior College,Mining & Quarrying,1831.724138,1538.810605
9,Japan,Junior College,Construction,1644.137931,1538.810605


### 5. Cross-Country Wage Comparison by Education Level

Comparing salaries between Japan and Taiwan across different education levels, with percentage differences calculated.

In [44]:
query = """
SELECT 
    Education,
    MAX(CASE WHEN Country = 'Japan' THEN salary END) as 'Japan (USD)',
    MAX(CASE WHEN Country = 'Taiwan' THEN salary END) as 'Taiwan (USD)',
    ROUND(
        (MAX(CASE WHEN Country = 'Japan' THEN salary END) - 
         MAX(CASE WHEN Country = 'Taiwan' THEN salary END)) * 100.0 / 
         MAX(CASE WHEN Country = 'Taiwan' THEN salary END), 2
    ) as '% Difference'
FROM (
    SELECT 
        Country,
        Education,
        Salary
    FROM wage_data
    WHERE Category = 'All Employees'
    GROUP BY Country, Education
)
GROUP BY Education
ORDER BY 
    CASE Education
        WHEN 'Junior High' THEN 1
        WHEN 'Senior High' THEN 2
        WHEN 'Junior College' THEN 3
        WHEN 'University' THEN 4
        WHEN 'Graduate School' THEN 5
        ELSE 6
    END
"""
diff = pd.read_sql_query(query, conn)
diff

Unnamed: 0,Education,Japan (USD),Taiwan (USD),% Difference
0,Junior High,1522.068966,875.0,73.95
1,Senior High,1362.068966,906.25,50.3
2,Junior College,1544.137931,1000.0,54.41
3,University,1712.413793,1062.5,61.17
4,Graduate School,1982.068966,1625.0,21.97


### 6. Education Wage Premiums Within Each Country

Analyzing the salary increases when advancing from Junior College to University, and from University to Graduate School in each country.

In [45]:
query = """
WITH base_salaries AS (
    SELECT 
        Country,
        Education,
        CASE 
            WHEN Population IS NOT NULL AND Population > 0 
            THEN SUM(Salary * Population) / SUM(Population)
            ELSE Salary
        END as salary
    FROM wage_data
    WHERE Category = 'All Employees'
    GROUP BY Country, Education
)
SELECT 
    Country,
    MAX(CASE WHEN Education = 'Junior College' THEN salary END) as 'Junior College (USD)',
    MAX(CASE WHEN Education = 'University' THEN salary END) as 'University (USD)',
    MAX(CASE WHEN Education = 'Graduate School' THEN salary END) as 'Graduate School (USD)',
    ROUND(
        (MAX(CASE WHEN Education = 'University' THEN salary END) - 
         MAX(CASE WHEN Education = 'Junior College' THEN salary END)) * 100.0 / 
         MAX(CASE WHEN Education = 'Junior College' THEN salary END), 2
    ) as 'Junior College → University (%)',
    ROUND(
        (MAX(CASE WHEN Education = 'Graduate School' THEN salary END) - 
         MAX(CASE WHEN Education = 'University' THEN salary END)) * 100.0 / 
         MAX(CASE WHEN Education = 'University' THEN salary END), 2
    ) as 'University → Graduate School (%)'
FROM base_salaries
GROUP BY Country
ORDER BY Country
"""

edu_premium = pd.read_sql_query(query, conn)
edu_premium

Unnamed: 0,Country,Junior College (USD),University (USD),Graduate School (USD),Junior College → University (%),University → Graduate School (%)
0,Japan,1538.810605,1712.413793,1982.068966,11.28,15.75
1,Taiwan,1000.0,1062.5,1625.0,6.25,52.94


### 7. Industry Wage Comparison Across Countries

Comparing industry salaries between Japan and Taiwan, sorted by the percentage difference. This shows which industries have the largest wage gaps between the two countries.

In [49]:
query = """
WITH industry_salaries AS (
    SELECT 
        Category,
        Country,
        CASE 
            WHEN Population IS NOT NULL AND Population > 0 
            THEN SUM(Salary * Population) / SUM(Population)
            ELSE AVG(Salary)
        END as salary
    FROM wage_data
    WHERE Category_Type = 'Industry'
    GROUP BY Category, Country
)
SELECT 
    Category,
    MAX(CASE WHEN Country = 'Japan' THEN salary END) as 'Japan',
    MAX(CASE WHEN Country = 'Taiwan' THEN salary END) as 'Taiwan',
    ROUND(
        (MAX(CASE WHEN Country = 'Japan' THEN salary END) - 
         MAX(CASE WHEN Country = 'Taiwan' THEN salary END)) * 100.0 / 
         MAX(CASE WHEN Country = 'Taiwan' THEN salary END), 2
    ) as 'Percentage_Difference_Japan_vs_Taiwan'
FROM industry_salaries
GROUP BY Category
HAVING MAX(CASE WHEN Country = 'Japan' THEN salary END) IS NOT NULL 
   AND MAX(CASE WHEN Country = 'Taiwan' THEN salary END) IS NOT NULL
ORDER BY Percentage_Difference_Japan_vs_Taiwan DESC
"""

industry_comparison = pd.read_sql_query(query, conn)
industry_comparison

Unnamed: 0,Category,Japan,Taiwan,Percentage_Difference_Japan_vs_Taiwan
0,Mining & Quarrying,1939.482759,1031.25,88.07
1,Real Estate,1783.228282,1000.0,78.32
2,Construction,1654.068966,981.25,68.57
3,Wholesale & Retail Trade,1625.776581,1025.0,58.61
4,Other Services,1514.206897,968.75,56.31
5,Accommodation & Food Service,1517.002432,975.0,55.59
6,"Arts, Entertainment & Recreation",1504.842661,968.75,55.34
7,Education,1648.275862,1109.375,48.58
8,"Professional, Scientific & Technical Services",1616.137931,1093.75,47.76
9,Finance & Insurance,1698.190361,1171.875,44.91


### 8. Close Database Connection

Clean up resources by closing the SQLite connection.

In [None]:
conn.close()