<a href="https://colab.research.google.com/github/takuma-uchida/google_colab_uchida/blob/main/%E3%80%90HRT%E3%80%91%E9%81%A9%E6%80%A7%E3%83%9E%E3%83%83%E3%83%81%E3%83%B3%E3%82%B0_%E5%80%8B%E4%BA%BA%E5%AF%BE%E5%80%8B%E4%BA%BA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 初期設定

## ドライブ認証等

In [1]:
# Colab helpers: `drive` mounts Google Drive, `auth` signs in the current user
from google.colab import auth, drive

# Mount Google Drive under /content/drive
drive.mount('/content/drive')

# Authenticate the current Colab user
auth.authenticate_user()

Mounted at /content/drive


## ライブラリー読込

In [24]:
# matplotlibの日本語化に使用
!pip install japanize_matplotlib

#基本ライブラリー
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import japanize_matplotlib
import plotly.express as px
import time

# 多次元尺度構成法(MDS)
from sklearn.manifold import MDS



## このノートブックが置かれているディレクトリまで移動
- ご自身のパス状況に合わせて編集してください。

In [3]:
# Move to the project directory so that the relative CSV paths used later resolve
# (edit the path to match your own Drive layout)
%cd /content/drive/My Drive/mar-データ戦略/事業施策/HRTech/[HRT]適性マッチのマッチングロジック

/content/drive/.shortcut-targets-by-id/1RLRkq0c3TMfT_n4FaUruqWK2YQsqpq_s/mar-データ戦略/事業施策/HRTech/[HRT]適性マッチのマッチングロジック


## BQからデータ読込
- カラムの論理名
  - employee_id：回答者ID
  - corporation_id：所属企業ID
  - factor_type_id：PIONALYの因子名
  - value：因子スコア（0～10の範囲）
  - scoring_date：検査回答日
- 「BQ⇒Python」への接続のためには「bigquery.readsessions.create」の権限が必要です。


In [4]:
%%bigquery input_rawdata --project hrtech-datainfra-prod
/*
HRT_Nalysys
*/
-- Returns one row per factor score of every answered PIONALY assessment,
-- together with the respondent, the respondent's company and the answer date.
select
  es.employee_id -- ID of the PIONALY respondent
  ,e.corporation_id -- ID of the company the respondent belongs to
  ,esf.factor_type_id -- name (ID) of the PIONALY factor
  ,esf.value -- factor score of that PIONALY factor
  ,es.scoring_date -- date on which the respondent answered
from
  `hrtech-datainfra-prod.view_worx_personal.employee_score_factors` as esf
left join
  `hrtech-datainfra-prod.view_worx_personal.employee_scores` as es
  on esf.score_id = es.id
left join
  `hrtech-datainfra-prod.view_worx_personal.employee_personals` as ep
  on es.employee_id = ep.id
left join
  `hrtech-datainfra-prod.view_worx_employee.employee` as e
  on ep.id = e.id
-- NOTE: both predicates below reference `es`, so factor rows without a matching
-- score row are removed even though `es` is attached with a left join.
where
  es.is_answered = true -- exclude unanswered assessments (keep respondents only)
  and es.type_id = 'pionaly' -- keep PIONALY assessments only
order by
  employee_id

Query is running:   0%|          |

Downloading:   0%|          |

In [5]:
# Preview the first rows loaded from BigQuery
input_rawdata.head(5)

Unnamed: 0,employee_id,corporation_id,factor_type_id,value,scoring_date
0,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,tendency,2.460446872148746,2024-10-17 02:15:08
1,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,formalization,4.882873910451993,2024-10-17 02:15:08
2,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,fulfillment,2.182942257698869,2024-10-17 02:15:08
3,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,influence,0.0537013161581523,2024-10-17 02:15:08
4,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,gathering,5.002348752194493,2024-10-17 02:15:08


## 「input_rawdata」をユークリッド距離算出に使う因子のみに絞る
- ユークリッド距離の算出元である因子を変更する場合は、「factor_type_japanese」の値を変更してください。

In [6]:
# Japanese names of the factors the Euclidean distance is computed from.
# Edit this list to change the factor set.
factor_type_japanese = ["社交性","影響性","好奇心","創造性","謙虚心","援助心","安定感","回復力","計画性","正義感","慎重性","持続性","活動性","達成欲","目的的","競争心"]

# Lookup table between factor_type_id (English ID) and factor_type_name (Japanese name)
file_path = "factorTypeIdと因子名.csv"
factor_type_id_conversion = pd.read_csv(file_path, encoding='utf-8')

# A name that is not in the lookup table would silently drop out of the factor set
# and change every distance, so report it instead of ignoring it.
unknown_factor_names = [name for name in factor_type_japanese
                        if name not in set(factor_type_id_conversion["factor_type_name"])]
if unknown_factor_names:
    print(f"WARNING: factor names not found in {file_path}: {unknown_factor_names}")

# English IDs of the selected factors, in the row order of the lookup table
is_selected_factor = factor_type_id_conversion["factor_type_name"].isin(factor_type_japanese)
factor_type_english = factor_type_id_conversion.loc[is_selected_factor, "factor_type_id"].tolist()

# Keep only the rows of the selected factors.
# The merged frame gets its own name instead of overwriting input_rawdata:
# merging into input_rawdata itself made a second run of this cell fail, because
# the frame already carried a factor_type_name column.
input_with_factor_name = pd.merge(input_rawdata, factor_type_id_conversion, how='left', on="factor_type_id")
input_data = (
    input_with_factor_name[input_with_factor_name['factor_type_name'].isin(factor_type_japanese)]
    .drop('factor_type_name', axis=1)
)
input_data.head(5)

Unnamed: 0,employee_id,corporation_id,factor_type_id,value,scoring_date
3,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,influence,0.0537013161581523,2024-10-17 02:15:08
6,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,planning,3.577304604492314,2024-10-17 02:15:08
8,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,competitiveness,5.411541162812172,2024-10-17 02:15:08
18,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,sociability,5.55056736212645,2024-10-17 02:15:08
25,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,persistence,6.680957258657366,2024-10-17 02:15:08


## 「input_data」をデータ成形
- 複数回回答されている方は、回答最新日を正とする。（古い回答日レコードは削除）
- 「factor_type_id」をダミー変数化（値は「value」）

In [7]:
# Convert "value" to float and parse the answer timestamp.
# The time of day is kept (the timestamp is not cut down to the date), so two
# answers given on the same day are still ordered and the later one wins.
scores_long = input_data.assign(
    value=input_data["value"].astype(float),
    scoring_date=pd.to_datetime(input_data["scoring_date"]),
)

# Latest answer first; a stable sort keeps the incoming row order among exact ties
scores_long = scores_long.sort_values('scoring_date', ascending=False, kind='stable')

# Keep only the most recent record per respondent and factor
latest_scores = scores_long.drop_duplicates(subset=['employee_id', 'factor_type_id'], keep='first')

# Rows without a respondent ID cannot be assigned to anyone
latest_scores = latest_scores.dropna(subset=['employee_id'])

# One row per respondent, one column per factor (cell value = factor score).
# A factor a respondent has no score for stays NaN instead of becoming 0, so an
# incomplete respondent is not treated as having scored 0 on that factor.
# reindex() fixes the column order and adds an all-NaN column for a selected
# factor that does not occur in the data at all.
factor_wide = (
    latest_scores
    .pivot(index='employee_id', columns='factor_type_id', values='value')
    .reindex(columns=factor_type_english)
)

# First non-null company ID recorded for each respondent
corporation_by_employee = latest_scores.groupby('employee_id')['corporation_id'].first()

input_data = corporation_by_employee.to_frame().join(factor_wide).reset_index()

# Attach the full name stored for each employee_id in the employee list
file_path = "HRテック従業員_レバ全社員.csv"
employee_name = pd.read_csv(file_path, encoding='utf-8')
input_data = pd.merge(input_data, employee_name, on='employee_id', how='left')
input_data.head(5)

Unnamed: 0,employee_id,corporation_id,sociability,influence,curiosity,creativity,humility,willingness,stability,resilience,planning,justice,prudence,persistence,activity,achievement,purposeful,competitiveness,fullname
0,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,5.550567,0.053701,3.913089,2.36375,8.604818,2.93914,5.148542,5.532717,3.577305,5.94955,7.840909,6.680957,3.612698,3.956943,3.410659,5.411541,
1,0033e100-e265-45cb-8c98-2f20d4d54184,56f195ed-73e8-4efe-bdd5-4b26072e6080,3.669543,6.165298,4.111794,4.636109,4.161891,6.659658,4.026398,3.479052,3.394327,5.663563,3.121137,4.489037,3.774801,3.477878,5.552994,3.485494,中野上 龍太郎
2,004d1690-175d-484f-b35a-46326d8c2bcc,56f195ed-73e8-4efe-bdd5-4b26072e6080,1.557103,6.810973,0.729797,2.025396,8.488006,8.636189,8.907818,8.300193,7.104841,8.04678,7.925479,8.56704,9.05,8.322711,5.319574,7.871664,相月 俊紀
3,00857a9d-9622-4c65-983e-ae062d6e2b19,56f195ed-73e8-4efe-bdd5-4b26072e6080,7.551115,5.112665,5.699968,1.114114,3.344323,7.578699,4.989998,6.113833,1.950939,4.474529,3.043569,3.321091,5.44599,3.882562,4.798803,6.477777,ルパワッタゲ アナン
4,008d1131-c9fc-4177-859c-dedc52575558,,3.839282,5.529811,6.504259,7.081895,5.944887,6.873977,7.953816,6.122444,5.429369,6.715763,4.666261,5.935446,7.315717,6.912303,7.458597,6.72607,黒澤 愛夏


## 回答者IDと所属企業IDのマスターを作成

In [8]:
# Master table: one row per distinct (respondent ID, company ID) combination
employee_company_data = (
    input_data
    .loc[:, ["employee_id", "corporation_id"]]
    .drop_duplicates(subset=["employee_id", "corporation_id"])
)
employee_company_data

Unnamed: 0,employee_id,corporation_id
0,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac
1,0033e100-e265-45cb-8c98-2f20d4d54184,56f195ed-73e8-4efe-bdd5-4b26072e6080
2,004d1690-175d-484f-b35a-46326d8c2bcc,56f195ed-73e8-4efe-bdd5-4b26072e6080
3,00857a9d-9622-4c65-983e-ae062d6e2b19,56f195ed-73e8-4efe-bdd5-4b26072e6080
4,008d1131-c9fc-4177-859c-dedc52575558,
...,...,...
4340,ffd67bfb-0309-4f64-89a5-38ae6930fb59,56f195ed-73e8-4efe-bdd5-4b26072e6080
4341,fff0847a-2668-4fa9-9980-bd7172c1a7e1,56f195ed-73e8-4efe-bdd5-4b26072e6080
4342,fff22958-1634-4517-b656-40b8095bc8bd,56f195ed-73e8-4efe-bdd5-4b26072e6080
4343,fff7ab5e-fe53-4704-9b9a-34b3e371ad47,56f195ed-73e8-4efe-bdd5-4b26072e6080


## 回答者IDと因子スコア（値が0～100の範囲）のマスターを作成
- 「value」の値を10倍

In [9]:
# Every column except the identifiers and the name is a factor score column
factor_score = input_data.columns.difference(['employee_id', 'fullname', 'corporation_id'])

# Respondent master without the company ID, with the factor scores multiplied by 10
factor_score_scaling = input_data.drop(columns='corporation_id')
factor_score_scaling[factor_score] = factor_score_scaling[factor_score] * 10
factor_score_scaling.head(5)

Unnamed: 0,employee_id,sociability,influence,curiosity,creativity,humility,willingness,stability,resilience,planning,justice,prudence,persistence,activity,achievement,purposeful,competitiveness,fullname
0,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,55.505674,0.537013,39.130887,23.637502,86.048178,29.391402,51.485422,55.327174,35.773046,59.4955,78.409089,66.809573,36.12698,39.56943,34.106594,54.115412,
1,0033e100-e265-45cb-8c98-2f20d4d54184,36.695429,61.652977,41.117945,46.361088,41.618914,66.596582,40.263979,34.790523,33.94327,56.635629,31.211371,44.890372,37.748006,34.778784,55.529941,34.85494,中野上 龍太郎
2,004d1690-175d-484f-b35a-46326d8c2bcc,15.571032,68.109725,7.297973,20.253965,84.880057,86.361889,89.078184,83.00193,71.048415,80.467796,79.254787,85.6704,90.500002,83.227109,53.195745,78.716636,相月 俊紀
3,00857a9d-9622-4c65-983e-ae062d6e2b19,75.511147,51.126646,56.999684,11.141141,33.443229,75.786992,49.899981,61.138334,19.509387,44.745291,30.435686,33.21091,54.459899,38.825621,47.988028,64.777768,ルパワッタゲ アナン
4,008d1131-c9fc-4177-859c-dedc52575558,38.392816,55.298106,65.042588,70.81895,59.448871,68.739774,79.538162,61.224437,54.293685,67.157629,46.662613,59.354465,73.157171,69.123035,74.585969,67.260704,黒澤 愛夏


# ユークリッド距離
- 「ユークリッド距離」は、数値データ間の直線距離を測る指標。
- 個人ごとの各因子スコアのユークリッド距離を算出することで、性格の類似度を測定。
- ユークリッド距離が小さいほど性格が似ており、大きいほど異なる傾向があります。

### 分割した「input_data」をnumpy配列に変換
- データ量が多いため、処理速度が速いnumpy配列に変換

In [None]:
# Factor columns only, forced to numeric (unparseable values become NaN),
# with every row that contains a NaN removed
filtered_data = (
    input_data[factor_type_english]
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# Factor scores as an explicit float numpy array, one row per remaining respondent
data_array = filtered_data.to_numpy(dtype=float)

# employee_id of each remaining row, in the same row order as data_array
ids_array = input_data.loc[filtered_data.index, "employee_id"].to_numpy()

# Sorted distinct employee_ids
names = np.unique(ids_array)

### ユークリッド距離を測定
- データ量が多いため、「input_data」を分割してユークリッド距離を算出

In [None]:
# Euclidean distances, slice 1 of 6: respondents names[0 : len(names) // 8],
# each paired with every respondent that comes later in `names`.
# Recorded on the full data (4738 rows) with the former pair-by-pair loop:
# about 11 minutes, 2211639 records.

# Row of data_array that belongs to each respondent (first occurrence in ids_array).
# Built once here; the former loop searched ids_array twice with np.where for every pair.
first_row_of = {}
for row_number, employee in enumerate(ids_array):
    first_row_of.setdefault(employee, row_number)
person_vectors = data_array[[first_row_of[employee] for employee in names]]

euclid_distance_1 = []
for i in range(len(names) // 8):
    # Distances from respondent i to all later respondents in one vectorised call
    distances = np.linalg.norm(person_vectors[i + 1:] - person_vectors[i], axis=1)
    euclid_distance_1.extend(
        [names[i], later_name, distance]
        for later_name, distance in zip(names[i + 1:], distances)
    )

In [None]:
# Euclidean distances, slice 2 of 6: respondents names[len(names) // 8 : len(names) // 4],
# each paired with every respondent that comes later in `names`.
# Recorded on the full data (4738 rows) with the former pair-by-pair loop:
# about 9 minutes, 1916790 records.

# Row of data_array that belongs to each respondent (first occurrence in ids_array).
# Built once here; the former loop searched ids_array twice with np.where for every pair.
first_row_of = {}
for row_number, employee in enumerate(ids_array):
    first_row_of.setdefault(employee, row_number)
person_vectors = data_array[[first_row_of[employee] for employee in names]]

euclid_distance_2 = []
for i in range(len(names) // 8, len(names) // 4):
    # Distances from respondent i to all later respondents in one vectorised call
    distances = np.linalg.norm(person_vectors[i + 1:] - person_vectors[i], axis=1)
    euclid_distance_2.extend(
        [names[i], later_name, distance]
        for later_name, distance in zip(names[i + 1:], distances)
    )

In [None]:
# Euclidean distances, slice 3 of 6: respondents names[len(names) // 4 : int(len(names) // 2.5)],
# each paired with every respondent that comes later in `names`.
# Recorded on the full data (4738 rows) with the former pair-by-pair loop:
# about 14 minutes, 2273778 records.

# Row of data_array that belongs to each respondent (first occurrence in ids_array).
# Built once here; the former loop searched ids_array twice with np.where for every pair.
first_row_of = {}
for row_number, employee in enumerate(ids_array):
    first_row_of.setdefault(employee, row_number)
person_vectors = data_array[[first_row_of[employee] for employee in names]]

euclid_distance_3 = []
for i in range(len(names) // 4, int(len(names) // 2.5)):
    # Distances from respondent i to all later respondents in one vectorised call
    distances = np.linalg.norm(person_vectors[i + 1:] - person_vectors[i], axis=1)
    euclid_distance_3.extend(
        [names[i], later_name, distance]
        for later_name, distance in zip(names[i + 1:], distances)
    )

In [None]:
# Euclidean distances, slice 4 of 6: respondents names[int(len(names) // 2.5) : int(len(names) // 1.75)],
# each paired with every respondent that comes later in `names`.
# Recorded on the full data (4738 rows) with the former pair-by-pair loop:
# about 10 minutes, 1662468 records.

# Row of data_array that belongs to each respondent (first occurrence in ids_array).
# Built once here; the former loop searched ids_array twice with np.where for every pair.
first_row_of = {}
for row_number, employee in enumerate(ids_array):
    first_row_of.setdefault(employee, row_number)
person_vectors = data_array[[first_row_of[employee] for employee in names]]

euclid_distance_4 = []
for i in range(int(len(names) // 2.5), int(len(names) // 1.75)):
    # Distances from respondent i to all later respondents in one vectorised call
    distances = np.linalg.norm(person_vectors[i + 1:] - person_vectors[i], axis=1)
    euclid_distance_4.extend(
        [names[i], later_name, distance]
        for later_name, distance in zip(names[i + 1:], distances)
    )

In [None]:
# Euclidean distances, slice 5 of 6: respondents names[int(len(names) // 1.75) : int(len(names) // 1.28)],
# each paired with every respondent that comes later in `names`.
# Recorded on the full data (4738 rows) with the former pair-by-pair loop:
# about 5 minutes, 1524299 records.

# Row of data_array that belongs to each respondent (first occurrence in ids_array).
# Built once here; the former loop searched ids_array twice with np.where for every pair.
first_row_of = {}
for row_number, employee in enumerate(ids_array):
    first_row_of.setdefault(employee, row_number)
person_vectors = data_array[[first_row_of[employee] for employee in names]]

euclid_distance_5 = []
for i in range(int(len(names) // 1.75), int(len(names) // 1.28)):
    # Distances from respondent i to all later respondents in one vectorised call
    distances = np.linalg.norm(person_vectors[i + 1:] - person_vectors[i], axis=1)
    euclid_distance_5.extend(
        [names[i], later_name, distance]
        for later_name, distance in zip(names[i + 1:], distances)
    )

In [None]:
# Euclidean distances, slice 6 of 6: respondents names[int(len(names) // 1.28) : len(names)],
# each paired with every respondent that comes later in `names`.
# Recorded on the full data (4738 rows) with the former pair-by-pair loop:
# about 3 minutes, 451725 records.

# Row of data_array that belongs to each respondent (first occurrence in ids_array).
# Built once here; the former loop searched ids_array twice with np.where for every pair.
first_row_of = {}
for row_number, employee in enumerate(ids_array):
    first_row_of.setdefault(employee, row_number)
person_vectors = data_array[[first_row_of[employee] for employee in names]]

euclid_distance_6 = []
for i in range(int(len(names) // 1.28), len(names)):
    # Distances from respondent i to all later respondents in one vectorised call
    distances = np.linalg.norm(person_vectors[i + 1:] - person_vectors[i], axis=1)
    euclid_distance_6.extend(
        [names[i], later_name, distance]
        for later_name, distance in zip(names[i + 1:], distances)
    )

### 分割したユークリッド距離を結合し、データ成形
- 分割した「euclid_distance_1」～「euclid_distance_6」を結合

In [None]:
# Join the six partial results, in slice order, into one list of
# [Name1, Name2, Distance] records
euclid_distance = [
    *euclid_distance_1,
    *euclid_distance_2,
    *euclid_distance_3,
    *euclid_distance_4,
    *euclid_distance_5,
    *euclid_distance_6,
]

# Turn the records into a DataFrame
euclid_distance_df = pd.DataFrame(euclid_distance, columns=['Name1', 'Name2', 'Distance'])
euclid_distance_df.head(3)

Unnamed: 0,Name1,Name2,Distance
0,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,0033e100-e265-45cb-8c98-2f20d4d54184,10.995573
1,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,004d1690-175d-484f-b35a-46326d8c2bcc,14.312375
2,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,00857a9d-9622-4c65-983e-ae062d6e2b19,11.378277


### ユークリッド距離をスケーリング（0～100の尺度に変換）
- ユークリッド距離（Distance）をロジスティック関数を用いてスケーリング（Distance_logit）

In [None]:
# Map the Euclidean distance onto a 0-100 scale with a logistic function.
# The slope is `a` divided by the standard deviation of the distances, and the
# offset places the median distance at the midpoint (50). Because `a` is
# negative, a larger distance gives a larger Distance_logit.
a = -1.5
distance_x = euclid_distance_df["Distance"].values
distance_a = a / np.std(distance_x)
distance_b = -distance_a * np.median(distance_x)

exponent = distance_a * distance_x + distance_b
distance_y = 1 / (1 + np.exp(exponent))

euclid_distance_df["Distance_logit"] = distance_y * 100
euclid_distance_df.head(3)

Unnamed: 0,Name1,Name2,Distance,Distance_logit
0,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,0033e100-e265-45cb-8c98-2f20d4d54184,10.995573,76.565419
1,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,004d1690-175d-484f-b35a-46326d8c2bcc,14.312375,96.185691
2,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,00857a9d-9622-4c65-983e-ae062d6e2b19,11.378277,80.529598


### 値が大きいほど、類似度が高い「マッチング値（0～100の範囲）」指標を作成
- 「Distance_logit（スケーリングされたユークリッド距離）」は値が小さいほど類似しており、直感的にはわかりづらい。
- そのため、値が大きいほど、類似する指標「マッチング値（100 - 「Distance_logit」）」を作成

In [None]:
# Matching value = 100 - Distance_logit, so that a larger value means more similar
euclid_distance_df["matching_value"] = euclid_distance_df["Distance_logit"].rsub(100)
euclid_distance_df.head(3)

Unnamed: 0,Name1,Name2,Distance,Distance_logit,matching_value
0,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,0033e100-e265-45cb-8c98-2f20d4d54184,10.995573,76.565419,23.434581
1,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,004d1690-175d-484f-b35a-46326d8c2bcc,14.312375,96.185691,3.814309
2,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,00857a9d-9622-4c65-983e-ae062d6e2b19,11.378277,80.529598,19.470402


### 「euclid_distance_df」に所属企業IDを追加

In [None]:
# Look up the company ID of Name1 and then of Name2 in the respondent/company
# master, storing it as <side>_corporation_id and discarding the join key
for side in ('Name1', 'Name2'):
    euclid_distance_df = (
        euclid_distance_df
        .merge(employee_company_data, left_on=side, right_on='employee_id', how='left')
        .rename(columns={'corporation_id': f'{side}_corporation_id'})
        .drop(columns='employee_id')
    )

# Put each company ID next to the respondent it belongs to
new_column_order = ['Name1', 'Name1_corporation_id','Name2','Name2_corporation_id', 'Distance', 'Distance_logit', 'matching_value']
euclid_distance_df = euclid_distance_df.loc[:, new_column_order]
euclid_distance_df.head(5)

Unnamed: 0,Name1,Name1_corporation_id,Name2,Name2_corporation_id,Distance,Distance_logit,matching_value
0,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,0033e100-e265-45cb-8c98-2f20d4d54184,56f195ed-73e8-4efe-bdd5-4b26072e6080,10.995573,76.565419,23.434581
1,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,004d1690-175d-484f-b35a-46326d8c2bcc,56f195ed-73e8-4efe-bdd5-4b26072e6080,14.312375,96.185691,3.814309
2,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,00857a9d-9622-4c65-983e-ae062d6e2b19,56f195ed-73e8-4efe-bdd5-4b26072e6080,11.378277,80.529598,19.470402
3,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,008d1131-c9fc-4177-859c-dedc52575558,,12.160532,87.008395,12.991605
4,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,00c87eaa-b4ff-4862-8172-d788d26b7b5d,56f195ed-73e8-4efe-bdd5-4b26072e6080,13.817104,94.89412,5.10588


### 所属企業IDが異なる組合せは削除
- 所属企業が異なる同士のユークリッド距離を使用することはないためレコードを削除

In [None]:
# Keep only the pairs whose two respondents have the same company ID.
# A missing company ID never compares equal, so such pairs are removed as well.
same_company = euclid_distance_df['Name1_corporation_id'].eq(euclid_distance_df['Name2_corporation_id'])
euclid_distance_df = euclid_distance_df[same_company]
euclid_distance_df.head(5)

Unnamed: 0,Name1,Name1_corporation_id,Name2,Name2_corporation_id,Distance,Distance_logit,matching_value
20,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,015ddec2-7e2a-4d53-b320-6329d14f92d9,89eac118-a1ce-45a2-847c-c78c435193ac,8.944561,48.007021,51.992979
34,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,0283de03-ebf4-48c4-bf21-61b1f9dcaf40,89eac118-a1ce-45a2-847c-c78c435193ac,14.727711,97.021241,2.978759
47,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,03089f12-4e88-4f62-8909-57e00217a451,89eac118-a1ce-45a2-847c-c78c435193ac,11.580789,82.411843,17.588157
92,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,05b6aeec-a692-41b7-b202-b7ca3e14fa8d,89eac118-a1ce-45a2-847c-c78c435193ac,13.804081,94.855102,5.144898
121,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,071f6706-8095-4af1-bf75-1e726f5ddc9a,89eac118-a1ce-45a2-847c-c78c435193ac,10.546381,71.242101,28.757899


### 「euclid_distance_df」にName1とName2の因子スコア（値が0～100の範囲）を追加

In [None]:
# Rename map for the Name1 side: factor ID -> "Name1_<factor ID>"
Name1_factor_dict = {factor: "Name1_" + factor for factor in factor_type_english}

# Rename map for the Name2 side: factor ID -> "Name2_<factor ID>"
Name2_factor_dict = {factor: "Name2_" + factor for factor in factor_type_english}

In [None]:
# Attach the scaled factor scores and the full name of Name1, prefixing the
# joined columns with "Name1_"
cluster_df = (
    euclid_distance_df
    .merge(factor_score_scaling, left_on='Name1', right_on='employee_id', how='left')
    .rename(columns={
        **Name1_factor_dict,
        "fullname": "Name1_fullname",
        "employee_id": "Name1_emoployee_id",
    })
)

In [None]:
# Attach the scaled factor scores and the full name of Name2, prefixing the
# joined columns with "Name2_"
cluster_df = (
    cluster_df
    .merge(factor_score_scaling, left_on='Name2', right_on='employee_id', how='left')
    .rename(columns={
        **Name2_factor_dict,
        "fullname": "Name2_fullname",
        "employee_id": "Name2_emoployee_id",
    })
)

In [None]:
# The two join-key columns repeat Name1 / Name2, so remove them
join_key_columns = ['Name1_emoployee_id', 'Name2_emoployee_id']
euclid_distance_df = cluster_df.drop(columns=join_key_columns)
euclid_distance_df.head(5)

Unnamed: 0,Name1,Name1_corporation_id,Name2,Name2_corporation_id,Distance,Distance_logit,matching_value,Name1_sociability,Name1_influence,Name1_curiosity,...,Name2_resilience,Name2_planning,Name2_justice,Name2_prudence,Name2_persistence,Name2_activity,Name2_achievement,Name2_purposeful,Name2_competitiveness,Name2_fullname
0,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,015ddec2-7e2a-4d53-b320-6329d14f92d9,89eac118-a1ce-45a2-847c-c78c435193ac,8.944561,48.007021,51.992979,55.505674,0.537013,39.130887,...,50.017023,39.821418,44.810977,45.633627,37.218814,34.799802,27.517836,46.541028,35.914936,
1,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,0283de03-ebf4-48c4-bf21-61b1f9dcaf40,89eac118-a1ce-45a2-847c-c78c435193ac,14.727711,97.021241,2.978759,55.505674,0.537013,39.130887,...,79.5985,31.042545,51.229904,29.896136,53.243054,42.232825,67.337965,62.374652,45.459573,
2,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,03089f12-4e88-4f62-8909-57e00217a451,89eac118-a1ce-45a2-847c-c78c435193ac,11.580789,82.411843,17.588157,55.505674,0.537013,39.130887,...,38.952125,42.750251,55.473251,22.257195,38.573404,44.859649,33.905943,53.906952,55.290015,
3,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,05b6aeec-a692-41b7-b202-b7ca3e14fa8d,89eac118-a1ce-45a2-847c-c78c435193ac,13.804081,94.855102,5.144898,55.505674,0.537013,39.130887,...,47.695895,65.689053,59.256884,39.96982,56.951639,45.464813,76.434077,88.68374,74.86901,
4,0013d97e-ea35-492e-b2e3-ba9f0badc5cf,89eac118-a1ce-45a2-847c-c78c435193ac,071f6706-8095-4af1-bf75-1e726f5ddc9a,89eac118-a1ce-45a2-847c-c78c435193ac,10.546381,71.242101,28.757899,55.505674,0.537013,39.130887,...,68.245424,35.662706,77.221387,55.438624,54.218934,52.145225,49.371985,35.539304,59.667654,


# 全PIONALY回答者から特定の企業にデータを絞る
- デフォルトは「レバレジーズ社員（56f195ed-73e8-4efe-bdd5-4b26072e6080）」に絞っています。

In [16]:
def filtered_companies(euclid_distance_df, corporation_id):
  """Return the rows whose "Name1_corporation_id" equals corporation_id.

  Args:
    euclid_distance_df: DataFrame with a "Name1_corporation_id" column.
    corporation_id: Company ID to keep.

  Returns:
    The matching rows of euclid_distance_df, with their original index.
  """
  belongs_to_company = euclid_distance_df["Name1_corporation_id"] == corporation_id
  return euclid_distance_df[belongs_to_company]

In [20]:
# Put the ID of the company to analyse in the second argument
euclid_distance_filtered_companies = filtered_companies(
    euclid_distance_df,
    "56f195ed-73e8-4efe-bdd5-4b26072e6080",
)
euclid_distance_filtered_companies

Unnamed: 0,Name1_fullname,Name1_corporation_id,Name1_社交性,Name1_影響性,Name1_好奇心,Name1_創造性,Name1_謙虚心,Name1_援助心,Name1_安心感,Name1_回復力,...,Name2_計画性,Name2_正義感,Name2_慎重性,Name2_持続性,Name2_活動性,Name2_達成欲,Name2_目的的,Name2_競争心,距離,マッチング度合い
0,桐生 直輝,56f195ed-73e8-4efe-bdd5-4b26072e6080,51.5,46.3,54.3,42.8,75.8,83.5,52.7,37.9,...,49.9,51.6,52.5,55.5,18.9,34.4,15.2,35.7,23.7,76.3
1,香川 淳,56f195ed-73e8-4efe-bdd5-4b26072e6080,66.9,32.5,72.1,75.4,32.5,69.0,54.0,27.8,...,49.9,51.6,52.5,55.5,18.9,34.4,15.2,35.7,37.3,62.8
2,三口 廉,56f195ed-73e8-4efe-bdd5-4b26072e6080,64.9,39.9,42.8,66.3,39.4,51.1,34.0,47.4,...,49.9,51.6,52.5,55.5,18.9,34.4,15.2,35.7,16.2,83.9
3,森脇 斗也,56f195ed-73e8-4efe-bdd5-4b26072e6080,61.5,47.7,49.6,72.9,28.4,55.2,52.1,39.5,...,49.9,51.6,52.5,55.5,18.9,34.4,15.2,35.7,18.3,81.7
4,先﨑 啓介,56f195ed-73e8-4efe-bdd5-4b26072e6080,43.3,24.1,23.5,62.7,49.9,51.2,47.7,56.1,...,49.9,51.6,52.5,55.5,18.9,34.4,15.2,35.7,17.1,82.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,吉野 颯海,56f195ed-73e8-4efe-bdd5-4b26072e6080,70.0,49.7,53.7,69.7,47.6,19.3,50.2,58.8,...,77.3,50.1,48.9,69.3,40.6,64.0,63.9,54.9,34.2,65.8
486,三口 廉,56f195ed-73e8-4efe-bdd5-4b26072e6080,64.9,39.9,42.8,66.3,39.4,51.1,34.0,47.4,...,77.3,50.1,48.9,69.3,40.6,64.0,63.9,54.9,21.1,78.9
487,森脇 斗也,56f195ed-73e8-4efe-bdd5-4b26072e6080,61.5,47.7,49.6,72.9,28.4,55.2,52.1,39.5,...,77.3,50.1,48.9,69.3,40.6,64.0,63.9,54.9,14.1,85.9
488,大滝 圭修,56f195ed-73e8-4efe-bdd5-4b26072e6080,65.5,69.3,56.7,67.2,40.9,68.5,70.1,49.5,...,77.3,50.1,48.9,69.3,40.6,64.0,63.9,54.9,24.0,76.0


# ユークリッド距離を可視化
- MDS（多次元尺度構成法）を用いて、因子数分の次元を、2次元に圧縮して、ユークリッド距離を可視化。
- MDS実装時に最適化がかかる背景から、同じデータを基にMDSをしても、若干、可視化図が異なる場合がございます。（初期値を固定にするなど対策は実施済み）

In [25]:
def mds_stable(distance_df, n_iter=10, random_state=42):
    """Plot people on a 2-D map whose layout approximates their pairwise distances.

    Runs MDS on a precomputed distance matrix n_iter times and plots the
    embedding with the lowest stress.

    Args:
        distance_df: DataFrame with the columns "Name1_fullname",
            "Name2_fullname" and "Distance", one row per pair. Rows in which
            either name is missing are ignored.
        n_iter: Number of MDS runs to compare.
        random_state: Seed of the first run. When it is an integer, run k uses
            random_state + k so that each run starts from a different
            initialisation; any other value is passed to MDS unchanged.

    Returns:
        None. The scatter plot is shown with fig.show().

    Raises:
        ValueError: If no row of distance_df has both names present.
    """
    # 1. Collect the distinct names; a missing name cannot be placed on the map
    #    (and would make sorted() fail when mixed with strings)
    pairs = distance_df.dropna(subset=["Name1_fullname", "Name2_fullname"])
    names = sorted(set(pairs["Name1_fullname"]).union(pairs["Name2_fullname"]))
    if not names:
        raise ValueError("distance_df has no pair with both names present")
    position = {name: k for k, name in enumerate(names)}

    # 2. Fill the symmetric distance matrix. Working on a plain numpy array
    #    avoids the per-row overhead of iterrows()/.loc on large inputs.
    distance_matrix = np.full((len(names), len(names)), np.nan)
    first = pairs["Name1_fullname"].map(position).to_numpy()
    second = pairs["Name2_fullname"].map(position).to_numpy()
    for r, c, d in zip(first, second, pairs["Distance"].to_numpy(dtype=float)):
        distance_matrix[r, c] = d
        distance_matrix[c, r] = d

    # 3. Diagonal is 0; pairs without a known distance get the largest known distance
    np.fill_diagonal(distance_matrix, 0)
    distance_matrix = np.where(np.isnan(distance_matrix), np.nanmax(distance_matrix), distance_matrix)

    # 4. Keep the MDS result with the lowest stress.
    #    Each run needs its own seed: with one fixed seed every run returns the
    #    same embedding and the comparison would select nothing.
    best_embedding = None
    best_stress = float("inf")

    for attempt in range(n_iter):
        if isinstance(random_state, (int, np.integer)):
            seed = random_state + attempt
        else:
            seed = random_state
        mds = MDS(n_components=2, dissimilarity="precomputed", random_state=seed)
        embedding = mds.fit_transform(distance_matrix)

        if mds.stress_ < best_stress:
            best_stress = mds.stress_
            best_embedding = embedding

    # 5. Coordinates as a DataFrame with one row per name
    mds_df = (
        pd.DataFrame(best_embedding, columns=["Dim1", "Dim2"], index=names)
        .reset_index()
        .rename(columns={"index": "Name"})
    )

    # 6. Scatter plot with each point labelled by name
    fig = px.scatter(
        mds_df,
        x="Dim1",
        y="Dim2",
        text="Name",
        title="各個人のユークリッド距離を可視化（MDSを活用）",
        labels={"Dim1": "次元 1", "Dim2": "次元 2"},
    )
    fig.update_traces(textposition='top center')
    fig.show()

In [26]:
# Visualise the Euclidean distances of the selected company's pairs
mds_stable(euclid_distance_filtered_companies)

## CSVダウンロード
- ご自身のパス状況に合わせて編集してください。

In [None]:
# Save the distance data of the selected company as a CSV file
# (edit the path to match your own Drive layout)
file_path = "/content/drive/MyDrive/mar-データ戦略/事業施策/HRTech/[HRT]適性マッチのマッチングロジック/euclid_distance_person_to_person_filtered_companies.csv"
euclid_distance_filtered_companies.to_csv(file_path, index=False, encoding='utf-8')

In [None]:
# Save the distance data of all PIONALY respondents as a CSV file
# (the file is very large, so saving may not succeed)
file_path = "/content/drive/MyDrive/mar-データ戦略/事業施策/HRTech/[HRT]適性マッチのマッチングロジック/euclid_distance_person_to_person.csv"
euclid_distance_df.to_csv(file_path, index=False, encoding='utf-8')