In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the source datasets from the working directory.
# sheet_name=None makes read_excel return a dict of {sheet name: DataFrame}.
personality_data = pd.read_csv('./personality_data.csv')
gdsp_data = pd.read_excel('./GDSP.xlsx')
gdsp_historical_data = pd.read_excel(
    './GDSP-and-Median-Income-historical-AM23.xlsx',
    sheet_name=None,
)
population_data = pd.read_csv('./世界各国人口结构数据 2008-2021.csv')

# Preview the first rows of every dataset (one preview per historical sheet).
historical_previews = {
    sheet_name: sheet_frame.head()
    for sheet_name, sheet_frame in gdsp_historical_data.items()
}
data_overview = dict([
    ("GDSP", gdsp_data.head()),
    ("GDSP Historical", historical_previews),
    ("Personality Data", personality_data.head()),
    ("Population Data", population_data.head()),
])

data_overview


In [None]:
def _normalize_column_name(col):
    """Return a lower-case, underscore-separated version of a column label.

    The label is converted to ``str`` first so that non-string headers
    (for example numeric headers read from an Excel sheet) do not raise
    ``AttributeError`` on ``.strip()``. Surrounding whitespace is removed,
    inner spaces become underscores, and the result is lower-cased.
    """
    return str(col).strip().replace(' ', '_').lower()


# Normalize the column names of every dataset (including each sheet of the
# historical workbook) so that later cells can rely on consistent labels.
for frame in (gdsp_data, *gdsp_historical_data.values(), personality_data, population_data):
    frame.columns = [_normalize_column_name(col) for col in frame.columns]

# Show the updated column names to confirm the change
# (only the first sheet of the historical workbook is listed).
updated_column_names = {
    "GDSP": gdsp_data.columns.tolist(),
    "GDSP Historical - Example Sheet": list(gdsp_historical_data.values())[0].columns.tolist(),
    "Personality Data": personality_data.columns.tolist(),
    "Population Data": population_data.columns.tolist()
}

updated_column_names


In [None]:
# Count the missing values per column in each dataset.
# Only the first sheet of the historical workbook is inspected.
first_historical_sheet = list(gdsp_historical_data.values())[0]
frames_to_check = [
    ("GDSP", gdsp_data),
    ("GDSP Historical - Example Sheet", first_historical_sheet),
    ("Personality Data", personality_data),
    ("Population Data", population_data),
]
missing_values = {label: frame.isnull().sum() for label, frame in frames_to_check}

missing_values


## 数据分析
计算人格特质与历史-社会生态因素的匹配模式。
我们将计算大五人格特质与不同历史-社会生态因素之间的匹配关系。

In [None]:
# Cast 'year' to str so that it can be compared with the 'period' key.
population_data['year'] = population_data['year'].astype(str)

# Try the merge again: inner join on country plus period/year.
merged_data = gdsp_data.merge(
    population_data,
    how='inner',
    left_on=['country_name', 'period'],
    right_on=['country_name', 'year'],
)

# Show the first rows and the column names of the merged dataset
merged_data.head(), merged_data.columns.tolist()


In [None]:
# List the distinct values of the two join keys ('period' and 'year')
# so that their formats can be compared side by side.
unique_periods = dict([
    ("GDSP Periods", pd.unique(gdsp_data['period'])),
    ("Population Years", pd.unique(population_data['year'])),
])

unique_periods


In [None]:
# Extract the starting year from the GDSP 'period' field (the text before the
# first '-').  na_action='ignore' leaves missing periods as NaN instead of
# raising AttributeError, str() tolerates non-string values, and strip()
# removes stray whitespace around the year so it can match 'year' exactly.
gdsp_data['start_year'] = gdsp_data['period'].map(
    lambda period: str(period).split('-')[0].strip(),
    na_action='ignore',
)

# Try the merge again, this time joining on start_year
merged_data = pd.merge(
    gdsp_data,
    population_data,
    how='inner',
    left_on=['country_name', 'start_year'],
    right_on=['country_name', 'year'],
)

# Show the first rows and the column names of the merged dataset
merged_data.head(), merged_data.columns.tolist()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling
sns.set(style="whitegrid")

# Scatter plot: population growth vs. growth of the bottom 40%,
# colored by region and with the marker style taken from 'type'.
bottom40_growth_col = 'annualized_growth_in_mean_consumption_or_income_per_capita-bottom_40%'
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=merged_data,
    x='population_growth',
    y=bottom40_growth_col,
    hue='region',
    style='type',
    s=100,
    ax=ax,
)

ax.set_title('Population Growth vs. Economic Growth of Bottom 40%', fontsize=16)
ax.set_xlabel('Population Growth (%)', fontsize=14)
ax.set_ylabel('Economic Growth of Bottom 40% (%)', fontsize=14)
ax.legend(title='Region/Type')
fig.savefig('./人口与底层经济.png')
plt.show()


In [None]:
# Restrict the merged data to the countries being compared
countries_of_interest = ['China', 'Indonesia']
is_selected = merged_data['country_name'].isin(countries_of_interest)
filtered_data = merged_data[is_selected]

# Line plot: one line per country, one marker per observation
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(
    data=filtered_data,
    x='year',
    y='annualized_growth_in_mean_consumption_or_income_per_capita-bottom_40%',
    hue='country_name',
    marker='o',
    ax=ax,
)

ax.set_title('Economic Growth Over Time for Selected Countries', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Annualized Growth in Mean Consumption or Income per Capita - Bottom 40% (%)', fontsize=14)
ax.legend(title='Country')
ax.grid(True)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Build a frame for the heatmap, indexed by (country_name, year).
# NOTE(review): 'age_dependency_ratio' is selected here but is not used by the
# pivot below; the heatmap only shows total-population growth per country and
# year, even though the title and file name mention the age dependency ratio.
total_growth_col = 'annualized_growth_in_mean_consumption_or_income_per_capita-total_population'
heatmap_data = (
    merged_data[['country_name', 'year', 'age_dependency_ratio', total_growth_col]]
    .copy()
    .set_index(['country_name', 'year'])
)

# Reshape to countries (rows) x years (columns)
heatmap_data_pivot = heatmap_data.pivot_table(
    values=total_growth_col,
    index='country_name',
    columns='year',
)

# Draw the annotated heatmap and save it
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(heatmap_data_pivot, annot=True, cmap='coolwarm', fmt=".1f", linewidths=.5, ax=ax)
ax.set_title('Heatmap of Economic Growth vs. Age Dependency Ratio', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Country', fontsize=14)
fig.savefig('./年龄依赖比例与经济增长heatmap.png')
plt.show()


In [None]:
# Two-letter country code -> country name.  This mapping must be defined
# before it is used: without it the cell raises NameError on a fresh kernel.
# NOTE(review): the names have to match the spelling used in
# merged_data['country_name'] for the join below to find them - confirm.
country_code_to_name = {
    'GB': 'United Kingdom', 'MY': 'Malaysia', 'US': 'United States', 'SE': 'Sweden',
    'FI': 'Finland', 'UA': 'Ukraine', 'PH': 'Philippines', 'FR': 'France',
    'AU': 'Australia', 'IN': 'India', 'CA': 'Canada', 'NL': 'Netherlands',
    'ZA': 'South Africa', 'HK': 'Hong Kong', 'CN': 'China', 'JP': 'Japan',
    'DE': 'Germany', 'BR': 'Brazil', 'IT': 'Italy', 'RU': 'Russia', 'ES': 'Spain'
}

# Give personality_data the same 'country_name' key as the other datasets;
# codes that are missing from the mapping become NaN.
personality_data['country_name'] = personality_data['country'].map(country_code_to_name)

# Derive the year as a string from the 'dateload' field.
# NOTE(review): if 'dateload' contains unparseable/missing dates, .dt.year is
# float and the string becomes e.g. '2019.0' - confirm the column is complete.
personality_data['year'] = pd.to_datetime(personality_data['dateload']).dt.year.astype(str)

# Join the personality data onto the previously merged economic/population data
full_merged_data = pd.merge(merged_data, personality_data, how='inner', on=['country_name', 'year'])

# Inspect the structure of the merged data
full_merged_data.head(), full_merged_data.columns.tolist()


In [None]:
# Compare the join keys on both sides: distinct years and country names
# in the personality data and in the already merged dataset.
personality_countries = personality_data['country_name'].dropna()
unique_values_personality = dict([
    ("Personality Data Years", personality_data['year'].unique()),
    ("Personality Data Countries", personality_countries.unique()),
])

unique_values_merged = dict([
    ("Merged Data Years", merged_data['year'].unique()),
    ("Merged Data Countries", merged_data['country_name'].unique()),
])

unique_values_personality, unique_values_merged


In [None]:
# Distinct country codes that occur in the personality data
unique_country_codes = personality_data['country'].unique()

# Collect the codes that have no entry in the code -> name mapping
unmapped_codes = []
for code in unique_country_codes:
    if code in country_code_to_name:
        continue
    unmapped_codes.append(code)

unique_country_codes, unmapped_codes


In [None]:
# Simplified country-code -> country-name lookup, written as (code, name) pairs
country_code_name_map = dict([
    ('GB', 'United Kingdom'),
    ('MY', 'Malaysia'),
    ('US', 'United States'),
    ('SE', 'Sweden'),
    ('FI', 'Finland'),
    ('UA', 'Ukraine'),
    ('PH', 'Philippines'),
    ('FR', 'France'),
    ('AU', 'Australia'),
    ('IN', 'India'),
    ('CA', 'Canada'),
    ('NL', 'Netherlands'),
    ('ZA', 'South Africa'),
    ('HK', 'Hong Kong'),
    ('CN', 'China'),
    ('JP', 'Japan'),
    ('DE', 'Germany'),
    ('BR', 'Brazil'),
    ('IT', 'Italy'),
    ('RU', 'Russia'),
    ('ES', 'Spain'),
])

# Refresh the 'country_name' column of the personality data from the lookup
personality_data['country_name'] = personality_data['country'].map(country_code_name_map)

# Retry the inner join of the personality data with the merged dataset
full_merged_data = merged_data.merge(
    personality_data,
    how='inner',
    on=['country_name', 'year'],
)

# Inspect the merged result
full_merged_data.head(), full_merged_data.columns.tolist()


In [None]:
# Scatter plot: economic growth rate vs. extraversion (EXT),
# colored by region and with the marker style taken from 'type'.
fig, ax = plt.subplots(figsize=(10, 6))
scatter_options = dict(
    x='annualized_growth_in_mean_consumption_or_income_per_capita-total_population',
    y='ext',
    hue='region',
    style='type',
    s=100,
)
sns.scatterplot(data=full_merged_data, ax=ax, **scatter_options)

ax.set_title('Economic Growth vs. Extraversion (EXT)', fontsize=16)
ax.set_xlabel('Economic Growth Rate (%)', fontsize=14)
ax.set_ylabel('Extraversion Score', fontsize=14)
ax.legend(title='Region/Type')
ax.grid(True)
plt.show()


这个散点图展示了不同地区的经济增长率与外向性（EXT）得分之间的关系。通过这种可视化，我们可以探索不同社会经济背景下人格特质的分布情况。

In [None]:
# Plot styling
sns.set(style="whitegrid")

# Make sure the output directory exists before saving into it
os.makedirs('./picture', exist_ok=True)

# Panel of scatter plots: one personality dimension per panel vs. growth
personality_traits = ['ext', 'agr', 'csn', 'est', 'opn']
fig, axes = plt.subplots(3, 2, figsize=(14, 18), sharex=True)
axes = axes.flatten()

# Draw each personality dimension against the economic growth rate
for i, trait in enumerate(personality_traits):
    sns.scatterplot(
        ax=axes[i],
        data=full_merged_data,
        x='annualized_growth_in_mean_consumption_or_income_per_capita-total_population',
        y=trait,
        hue='region',
        style='type',
        s=100
    )
    axes[i].set_title(f'Economic Growth vs. {trait.upper()}', fontsize=16)
    axes[i].set_xlabel('Economic Growth Rate (%)', fontsize=14)
    axes[i].set_ylabel(f'{trait.upper()} Score', fontsize=14)
    axes[i].legend(title='Region/Type')
    # sharex=True hides x tick labels on every row but the last; re-enable
    # them so the panel above the hidden cell still shows its x scale.
    axes[i].tick_params(labelbottom=True)

# The 3x2 grid has six cells but there are only five traits:
# hide the leftover axes instead of showing an empty frame.
for unused_ax in axes[len(personality_traits):]:
    unused_ax.set_visible(False)

# Tidy the layout, save and show
plt.tight_layout()
plt.savefig('./picture/所有人格维度与经济增长的散点图面板.png')
plt.show()


In [None]:
# Frame for the heatmap: the five personality dimensions plus the growth
# rate, indexed by (country_name, year) so only numeric columns remain.
trait_and_growth_cols = [
    'ext', 'agr', 'csn', 'est', 'opn',
    'annualized_growth_in_mean_consumption_or_income_per_capita-total_population',
]
heatmap_data = (
    full_merged_data[['country_name', 'year'] + trait_and_growth_cols]
    .copy()
    .set_index(['country_name', 'year'])
)

# Pairwise correlations between the remaining columns
correlation_matrix = heatmap_data.corr()

# Draw the annotated correlation heatmap
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, ax=ax)
ax.set_title('Heatmap of Personality Traits and Economic Growth Correlation', fontsize=16)
plt.show()


In [None]:
# Bubble chart: population growth vs. extraversion (EXT).
# The bubble size is meant to stand for economic size; no GDP column is used
# here, so 'population_total' is used as a stand-in for demonstration.
bubble_options = dict(
    x='population_growth',
    y='ext',
    size='population_total',
    hue='region',
    sizes=(50, 1000),  # range of bubble sizes
    alpha=0.6,
    legend='brief',
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=full_merged_data, ax=ax, **bubble_options)

ax.set_title('Population Growth vs. Extraversion (Bubble Size: Economic Size)', fontsize=16)
ax.set_xlabel('Population Growth (%)', fontsize=14)
ax.set_ylabel('Extraversion (EXT) Score', fontsize=14)
ax.grid(True)
ax.legend(title='Region/Type')
plt.show()


箱线图：展示不同经济水平下的人格特质分布，可以帮助我们看出经济水平对人格特质的影响。理想情况下应使用GDP或人均收入分组；下面的代码实际以人均消费/收入年化增长率的均值为界，分为 Low / High 两组。

In [None]:
# The growth rates may be too concentrated for quantile bins, so the data is
# split into two groups around the mean growth rate instead.
growth_col = 'annualized_growth_in_mean_consumption_or_income_per_capita-total_population'
growth = full_merged_data[growth_col]
mean_growth = growth.mean()
full_merged_data['gdp_group'] = pd.cut(
    growth,
    bins=[growth.min(), mean_growth, growth.max()],
    labels=['Low', 'High'],
    # pd.cut intervals are open on the left by default, which would leave the
    # row(s) holding the minimum growth value without a group (NaN).
    include_lowest=True,
)

# Box plot of extraversion per growth group
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=full_merged_data,
    x='gdp_group',
    y='ext',
    palette='Set3'
)

plt.title('Extraversion (EXT) Across Economic Groups', fontsize=16)
plt.xlabel('Economic Group (by Growth Rate)', fontsize=14)
plt.ylabel('Extraversion (EXT) Score', fontsize=14)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


# Mean of the economic growth rate, used as the boundary between groups
growth_col = 'annualized_growth_in_mean_consumption_or_income_per_capita-total_population'
growth = full_merged_data[growth_col]
mean_growth = growth.mean()

# Assign each row to the 'Low' or 'High' growth group
full_merged_data['gdp_group'] = pd.cut(
    growth,
    bins=[growth.min(), mean_growth, growth.max()],
    labels=['Low', 'High'],
    # pd.cut intervals are open on the left by default, which would leave the
    # row(s) holding the minimum growth value without a group (NaN).
    include_lowest=True,
)

# Plot styling
sns.set(style="whitegrid")

# Make sure the output directory exists before saving into it
os.makedirs('./picture', exist_ok=True)

# Personality trait columns and their display titles (same order)
traits = ['ext', 'est', 'agr', 'csn', 'opn']
titles = ['Extraversion (EXT)', 'Emotional Stability (EST)', 'Agreeableness (AGR)', 'Conscientiousness (CSN)', 'Openness (OPN)']

# One box plot per trait, each saved to its own file
for trait, title in zip(traits, titles):
    plt.figure(figsize=(10, 6))
    sns.boxplot(
        data=full_merged_data,
        x='gdp_group',
        y=trait,
        palette='Set3'
    )

    plt.title(f'{title} Across Economic Groups', fontsize=16)
    plt.xlabel('Economic Group (by Growth Rate)', fontsize=14)
    plt.ylabel(f'{title} Score', fontsize=14)
    plt.savefig(f'./picture/{trait}_economic_groups_boxplot.png', bbox_inches='tight')
    plt.show()


雷达图：展示顶尖经济体和发展中经济体的五大人格特质的平均分，对比不同经济背景下的人格差异。

In [None]:
# Radar chart data: mean of each of the five traits within every growth group
trait_cols = ['ext', 'agr', 'csn', 'est', 'opn']
personality_means = full_merged_data.groupby('gdp_group')[trait_cols].mean().reset_index()

# Radar chart parameters
labels = np.array(['EXT', 'AGR', 'CSN', 'EST', 'OPN'])
num_vars = len(labels)

# One angle per trait; the first angle is repeated to close the polygon
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles.append(0)

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

# One closed outline (plus translucent fill) per growth group
for _, group_row in personality_means.iterrows():
    values = [group_row[col] for col in trait_cols]
    values.append(group_row['ext'])
    ax.plot(angles, values, linewidth=1, linestyle='solid', label=group_row['gdp_group'])
    ax.fill(angles, values, alpha=0.25)

ax.set_thetagrids(np.degrees(angles[:-1]), labels)
ax.set_title('Personality Traits by Economic Group', fontsize=15, y=1.1)
ax.legend(title='Economic Group', loc='upper right', bbox_to_anchor=(0.1, 0.1))
plt.show()


In [None]:
# For simplicity, split into two groups at a fixed boundary of 0 to avoid the
# duplicate-edge problem of data-driven bins: 'Low' is growth <= 0 and 'High'
# is growth > 0.  Infinite outer edges are used instead of min - 1 / max + 1,
# which stop being monotonically increasing (and make pd.cut raise) when
# every growth value lies on the same side of 0 by more than 1.
full_merged_data['gdp_group'] = pd.cut(
    full_merged_data['annualized_growth_in_mean_consumption_or_income_per_capita-total_population'],
    bins=[-np.inf, 0, np.inf],
    labels=['Low', 'High']
)

# Recompute the mean of the five traits within each group
personality_means = full_merged_data.groupby('gdp_group')[['ext', 'agr', 'csn', 'est', 'opn']].mean().reset_index()

# Radar chart parameters
labels = np.array(['EXT', 'AGR', 'CSN', 'EST', 'OPN'])
num_vars = len(labels)

# One angle per trait; the first angle is repeated to close the polygon
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist() + [0]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

# One closed outline (plus translucent fill) per group
for idx, row in personality_means.iterrows():
    values = row[['ext', 'agr', 'csn', 'est', 'opn']].tolist() + [row['ext']]
    ax.plot(angles, values, linewidth=1, linestyle='solid', label=f'{row["gdp_group"]} Economic Growth')
    ax.fill(angles, values, alpha=0.25)

ax.set_thetagrids(np.degrees(angles[:-1]), labels)
ax.set_title('Personality Traits by Economic Group', fontsize=15, y=1.1)
ax.legend(title='Economic Group', loc='upper right', bbox_to_anchor=(0.1, 0.1))
plt.show()


地图：如果数据包括足够的国家级细节，可以在世界地图上展示不同国家的人格特质平均分或特定人格与经济指标的关联。

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt

# Load the world map polygons
world = gpd.read_file('./ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp')
print(world.columns)

# Example data: three-letter country codes and extraversion (EXT) scores.
# The join key is ADM0_A3 rather than SOV_A3: Natural Earth stores sovereignty
# codes in SOV_A3 (e.g. 'US1', 'FR1', 'CH1', 'AU1'), so 'USA', 'FRA', 'CHN'
# and 'AUS' would not match there and those countries would stay grey.
# NOTE(review): confirm against the printed columns of the local shapefile.
example_data = pd.DataFrame({
    'ADM0_A3': ['USA', 'BRA', 'RUS', 'IND', 'CHN', 'AUS', 'FRA', 'DEU'],
    'ext_score': [30, 25, 20, 35, 28, 32, 27, 29]
})

# Left-join the scores onto the map.  The result gets its own name so that
# re-running the cell does not merge the scores into `world` a second time.
world_ext = world.merge(example_data, how="left", on="ADM0_A3")

# Draw country outlines, then fill by score (countries without data in grey)
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
world_ext.boundary.plot(ax=ax)
world_ext.plot(column='ext_score', ax=ax, legend=True,
               legend_kwds={'label': "Extraversion (EXT) Score"},
               cmap='OrRd', missing_kwds={'color': 'lightgrey'})

plt.title('Global Distribution of Extraversion Scores', fontsize=15)
plt.show()


## 人口

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import pycountry

# Load the world map polygons
world = gpd.read_file(
    './ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp'
)

# Load the population data (replaces the population_data frame loaded earlier)
population_data = pd.read_excel('population.xlsx')



In [None]:
# Lookup table from pycountry: country name -> alpha-3 code
ctr_to_country = {}
for country in pycountry.countries:
    ctr_to_country[country.name] = country.alpha_3


def convert_country_name_to_iso(name):
    """Return the alpha-3 code for ``name``, or ``None`` if it is not in the lookup."""
    return ctr_to_country.get(name)

# Add an alpha-3 code column; names unknown to the lookup become None
population_data['iso_a3'] = population_data['country_name'].apply(convert_country_name_to_iso)

# Join the population data onto the map polygons by alpha-3 code.
# The shapefile is accessed with upper-case field names elsewhere in this
# notebook ('SOV_A3'), so accept 'ISO_A3' as well as 'iso_a3'; rename() is a
# no-op when the upper-case column is absent.
# NOTE(review): Natural Earth is known to carry '-99' in ISO_A3 for a few
# countries - confirm that the countries of interest have usable codes.
# NOTE(review): if population_data has several rows per country (e.g. one per
# year), the join yields one polygon row per population row - confirm.
world_by_iso = world.rename(columns={'ISO_A3': 'iso_a3'}).set_index('iso_a3')
merged_data = world_by_iso.join(population_data.set_index('iso_a3'))


In [None]:
# Make sure the output directory exists before saving into it
os.makedirs('./picture', exist_ok=True)

# Create the axes explicitly and pass them to GeoDataFrame.plot: when no `ax`
# is given, geopandas opens its own default-sized figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind.
fig, ax = plt.subplots(figsize=(15, 10))
merged_data.plot(column='population_total', cmap='OrRd', legend=True,
                 legend_kwds={'label': "Total Population by Country",
                              'orientation': "horizontal"},
                 ax=ax)
ax.set_title('World Map of Total Population by Country', fontsize=16)
fig.savefig('./picture/world_map_total_population.png', bbox_inches='tight')
plt.show()


In [None]:
# Make sure the output directory exists before saving into it
os.makedirs('./picture', exist_ok=True)

# Create the axes explicitly and pass them to GeoDataFrame.plot: when no `ax`
# is given, geopandas opens its own default-sized figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind.
fig, ax = plt.subplots(figsize=(15, 10))
merged_data.plot(column='population_growth', cmap='YlGn', legend=True,
                 legend_kwds={'label': "Population Growth Rate by Country",
                              'orientation': "horizontal"},
                 ax=ax)
ax.set_title('World Map of Population Growth Rate by Country', fontsize=16)
fig.savefig('./picture/world_map_population_growth.png', bbox_inches='tight')
plt.show()


In [None]:
# Example data preparation for a population-pyramid style bar chart.
# NOTE(review): the age bands are produced by binning the 'year' column with
# edges 0/14/64/100.  If 'year' holds calendar years, every value falls
# outside the bins and 'age_group' is entirely NaN - confirm which column
# actually carries age information.
# NOTE(review): the plot also expects a 'gender' column - confirm it exists.
age_groups = ['0-14', '15-64', '65+']
age_bin_edges = [0, 14, 64, 100]
population_data['age_group'] = pd.cut(
    population_data['year'],
    bins=age_bin_edges,
    labels=age_groups,
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=population_data, x='age_group', y='population_total', hue='gender', ax=ax)
ax.set_title('Population Pyramid by Age Group', fontsize=16)
ax.set_xlabel('Age Group', fontsize=14)
ax.set_ylabel('Total Population', fontsize=14)
ax.legend(title='Gender')
fig.savefig('./picture/population_pyramid.png', bbox_inches='tight')
plt.show()


In [None]:
# Scatter plot of crude birth rate vs. crude death rate, one color per country.
# The legend is placed outside the axes (to the right) in two columns.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=population_data,
    x='birth_rate_crude',
    y='death_rate_crude',
    hue='country_name',
    ax=ax,
)
ax.set_title('Scatter Plot: Birth Rate vs Death Rate by Country', fontsize=16)
ax.set_xlabel('Crude Birth Rate', fontsize=14)
ax.set_ylabel('Crude Death Rate', fontsize=14)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=2)
fig.savefig('./picture/birth_death_scatter.png', bbox_inches='tight')
plt.show()
