# 合成人口データのフィルタリング処理

In [2]:
# ライブラリ
from pathlib import Path
import pandas as pd

### original の合成人口データを読み込む

In [4]:
# Load the original synthetic-population CSV
# (data/raw/population/2015_001_8_13208.csv, addressed relative to this notebook).
pop_path = Path("../data/raw/population/2015_001_8_13208.csv")
df_pop = pd.read_csv(pop_path)

# Quick look at the table: dimensions, column names, first rows.
print(f"shape: {df_pop.shape}")
print(f"columns: {list(df_pop.columns)}")
display(df_pop.head())


shape: (214281, 25)
columns: ['prefecture_code', 'prefecture_name', 'city_code', 'city_name', 'town_code', 'town_name', 'latitude', 'longitude', 'household_id', 'family_type_id', 'family_type', 'num_member', 'abnormal_household', 'person_id', 'age', 'gender_id', 'gender', 'role_household_type_id', 'role_household_type', 'industry_type_id', 'industry_type', 'employment_type_id', 'employment_type', 'company_size_id', 'company_size']


Unnamed: 0,prefecture_code,prefecture_name,city_code,city_name,town_code,town_name,latitude,longitude,household_id,family_type_id,...,gender_id,gender,role_household_type_id,role_household_type,industry_type_id,industry_type,employment_type_id,employment_type,company_size_id,company_size
0,13,東京都,13208,調布市,13208010005,多摩川５丁目,35.642698,139.536368,18324646,0,...,0,男性,0,単独世帯（男性）,,,,,,
1,13,東京都,13208,調布市,13208008006,国領町６丁目,35.644873,139.554937,18324647,0,...,0,男性,0,単独世帯（男性）,,,,,,
2,13,東京都,13208,調布市,13208001002,飛田給２丁目,35.658862,139.525556,18324648,0,...,0,男性,0,単独世帯（男性）,,,,,,
3,13,東京都,13208,調布市,13208022004,調布ケ丘４丁目,35.662079,139.54499,18324649,0,...,0,男性,0,単独世帯（男性）,,,,,,
4,13,東京都,13208,調布市,13208017001,仙川町１丁目,35.66079,139.589332,18324650,0,...,0,男性,0,単独世帯（男性）,,,,,,


### 阪上加工のデータを読み込む

In [5]:
# Load the processed population CSV. It is read without index_col, so the
# index column that was saved into the file shows up as 'Unnamed: 0'.
pop_proc_path = Path("../data/processed/yamada_processed/population/2015_001_8_13208_01.csv")
df_pop_proc = pd.read_csv(pop_proc_path)

# Quick look at the table: dimensions, column names, first rows.
print(f"shape: {df_pop_proc.shape}")
print(f"columns: {list(df_pop_proc.columns)}")
display(df_pop_proc.head())


shape: (88849, 26)
columns: ['Unnamed: 0', 'prefecture_code', 'prefecture_name', 'city_code', 'city_name', 'town_code', 'town_name', 'latitude', 'longitude', 'household_id', 'family_type_id', 'family_type', 'num_member', 'abnormal_household', 'person_id', 'age', 'gender_id', 'gender', 'role_household_type_id', 'role_household_type', 'industry_type_id', 'industry_type', 'employment_type_id', 'employment_type', 'company_size_id', 'company_size']


Unnamed: 0.1,Unnamed: 0,prefecture_code,prefecture_name,city_code,city_name,town_code,town_name,latitude,longitude,household_id,...,gender_id,gender,role_household_type_id,role_household_type,industry_type_id,industry_type,employment_type_id,employment_type,company_size_id,company_size
0,177,13,東京都,13208,調布市,13208003004,富士見町４丁目,35.662585,139.534513,18324823,...,0,男性,0,単独世帯（男性）,60.0,Ｆ 電気・ガス・熱供給・水道業,10.0,一般労働者,1000.0,1000人以上
1,181,13,東京都,13208,調布市,13208009002,染地２丁目,35.639386,139.549963,18324827,...,0,男性,0,単独世帯（男性）,130.0,Ｍ 宿泊業，飲食サービス業,20.0,短時間労働者,10.0,10～99人
2,187,13,東京都,13208,調布市,13208004003,下石原３丁目,35.649432,139.534088,18324833,...,0,男性,0,単独世帯（男性）,130.0,Ｍ 宿泊業，飲食サービス業,20.0,短時間労働者,1000.0,1000人以上
3,188,13,東京都,13208,調布市,13208004003,下石原３丁目,35.652872,139.530387,18324834,...,0,男性,0,単独世帯（男性）,90.0,Ｉ 卸売業，小売業,20.0,短時間労働者,100.0,100～999人
4,189,13,東京都,13208,調布市,13208001001,飛田給１丁目,35.66324,139.522585,18324835,...,0,男性,0,単独世帯（男性）,150.0,Ｏ 教育，学習支援業,20.0,短時間労働者,1000.0,1000人以上


### 加工方法を調査

In [8]:
# Compare the raw and processed tables side by side, using copies of the
# loaded frames.
raw = df_pop.copy()
proc = df_pop_proc.copy()
_frames = (("raw", raw), ("proc", proc))

print(f"raw rows : {len(raw)}")
print(f"proc rows: {len(proc)}")

# 1) Distribution of abnormal_household (NaN counted as its own value)
print("\n[abnormal_household]")
for _label, _frame in _frames:
    print(f"{_label}:\n", _frame["abnormal_household"].value_counts(dropna=False))

# 2) Number of distinct households / persons
print("\n[unique counts]")
for _label, _frame in _frames:
    print(
        f"{_label} household:",
        _frame["household_id"].nunique(),
        "person:",
        _frame["person_id"].nunique(),
    )

# 3) Age distribution (summary statistics)
print("\n[age describe]")
for _label, _frame in _frames:
    print(f"{_label}:\n", _frame["age"].describe())

# 4) Employment type: ten most frequent values, NaN included
print("\n[employment_type top]")
for _label, _frame in _frames:
    print(f"{_label}:\n", _frame["employment_type"].value_counts(dropna=False).head(10))


raw rows : 214281
proc rows: 88849

[abnormal_household]
raw:
 abnormal_household
0    214281
Name: count, dtype: int64
proc:
 abnormal_household
0    88849
Name: count, dtype: int64

[unique counts]
raw household: 105217 person: 214281
proc household: 63955 person: 88849

[age describe]
raw:
 count    214281.000000
mean         42.842571
std          22.549450
min           0.000000
25%          26.000000
50%          43.000000
75%          60.000000
max         100.000000
Name: age, dtype: float64
proc:
 count    88849.000000
mean        45.932200
std         14.665341
min         18.000000
25%         35.000000
50%         45.000000
75%         56.000000
max        100.000000
Name: age, dtype: float64

[employment_type top]
raw:
 employment_type
NaN       125243
一般労働者      66609
短時間労働者     21535
臨時労働者        894
Name: count, dtype: int64
proc:
 employment_type
一般労働者     66584
短時間労働者    21378
臨時労働者       887
Name: count, dtype: int64


In [13]:
# Sakagami's processing: check whether the processed table is the raw table
# restricted to persons aged 18 or older.

# Age range of the processed table.
# The name bound here must be `df`, the name the two prints below read;
# binding any other name makes this cell depend on a `df` left over from an
# earlier kernel session and fail with NameError on a fresh run.
df = df_pop_proc
print("proc min age:", df["age"].min())
print("proc max age:", df["age"].max())

# Raw table and its age >= 18 subset.
raw = df_pop.copy()
raw18 = raw[raw["age"] >= 18].copy()

print("raw rows :", len(raw))
print("raw18 rows:", len(raw18))

print("raw household:", raw["household_id"].nunique())
print("raw18 household:", raw18["household_id"].nunique())

print("raw person:", raw["person_id"].nunique())
print("raw18 person:", raw18["person_id"].nunique())

# Compare by person_id: is every processed person also in the 18+ raw subset,
# and how many ids do the two sets share?
proc_ids = set(df_pop_proc["person_id"])
raw18_ids = set(raw18["person_id"])

print("proc ⊂ raw18 ?", proc_ids.issubset(raw18_ids))
print("一致数:", len(proc_ids & raw18_ids), "/", len(proc_ids))




proc min age: 18
proc max age: 100
raw rows : 214281
raw18 rows: 181929
raw household: 105217
raw18 household: 105051
raw person: 214281
raw18 person: 181929
proc ⊂ raw18 ? True
一致数: 88849 / 88849


In [23]:
from pathlib import Path

import pandas as pd

# =========================
# Load both tables
# =========================
raw_path = Path("../data/raw/population/2015_001_8_13208.csv")
raw = pd.read_csv(raw_path)

yamada_path = Path("../data/processed/yamada_processed/population/2015_001_8_13208_01.csv")
yamada = pd.read_csv(yamada_path)   # pass index_col=0 here if that is needed

print(f"raw   : {raw.shape}")
print(f"yamada: {yamada.shape}")

# =========================
# 0) Confirm the employment columns really exist in raw
# =========================
_raw_emp_cols = ("employment_type", "employment_type_id")

print("\n[raw columns check]")
for _col in _raw_emp_cols:
    print(f"{_col} in raw ?", _col in raw.columns)

# =========================
# 1) Distribution of the employment columns in raw (NaN included)
# =========================
for _col in _raw_emp_cols:
    print(f"\n[raw {_col} 分布]")
    print(raw[_col].value_counts(dropna=False))

# =========================
# 2) Same look at raw restricted to age >= 18
# =========================
_adult_mask = raw["age"] >= 18
raw18 = raw.loc[_adult_mask].copy()

print("\n[raw18 基本情報]")
print(f"raw18 rows: {len(raw18)}")
print(f"raw18 min/max age: {raw18['age'].min()} {raw18['age'].max()}")

print("\n[raw18 employment_type 分布]")
print(raw18["employment_type"].value_counts(dropna=False))

# =========================
# 3) Is yamada a subset of raw18? (checked on person_id)
# =========================
raw18_ids = set(raw18["person_id"])
yamada_ids = set(yamada["person_id"])

print(f"\n[yamada ⊂ raw18 ?] {yamada_ids.issubset(raw18_ids)}")
print(f"一致数: {len(yamada_ids & raw18_ids)} / {len(yamada_ids)}")

# =========================
# 4) Add the employment_type condition:
#    keep only the raw18 rows whose employment_type occurs in yamada
#    and see whether the result still contains all of yamada
# =========================

# The employment column in yamada may carry the English or the Japanese name;
# report which of the two is present.
_emp_candidates = ("employment_type", "就業形態")

print("\n[yamada columns check]")
for _col in _emp_candidates:
    print(f"{_col} in yamada ?", _col in yamada.columns)

# Take the distinct non-null employment types from whichever column exists
# (the English name wins if both are present); None when neither exists.
_emp_col = next((c for c in _emp_candidates if c in yamada.columns), None)
yamada_emp = None if _emp_col is None else set(yamada[_emp_col].dropna().unique())

print("\n[yamada employment_type unique]")
print(yamada_emp)

# Restrict raw18 to yamada's employment types when they could be determined.
if yamada_emp is None:
    print("\n⚠️ yamada に就業形態カラムが無いので、employment_type条件の一致検証はスキップします")
else:
    raw18_emp = raw18[raw18["employment_type"].isin(yamada_emp)].copy()
    raw18_emp_ids = set(raw18_emp["person_id"])

    print("\n[raw18 + employment_type 条件]")
    print(f"raw18_emp rows: {len(raw18_emp)}")
    print(f"yamada ⊂ raw18_emp ? {yamada_ids.issubset(raw18_emp_ids)}")
    print(f"一致数: {len(yamada_ids & raw18_emp_ids)} / {len(yamada_ids)}")


raw   : (214281, 25)
yamada: (88849, 26)

[raw columns check]
employment_type in raw ? True
employment_type_id in raw ? True

[raw employment_type 分布]
employment_type
NaN       125243
一般労働者      66609
短時間労働者     21535
臨時労働者        894
Name: count, dtype: int64

[raw employment_type_id 分布]
employment_type_id
NaN     125243
10.0     66609
20.0     21535
30.0       894
Name: count, dtype: int64

[raw18 基本情報]
raw18 rows: 181929
raw18 min/max age: 18 100

[raw18 employment_type 分布]
employment_type
NaN       93080
一般労働者     66584
短時間労働者    21378
臨時労働者       887
Name: count, dtype: int64

[yamada ⊂ raw18 ?] True
一致数: 88849 / 88849

[yamada columns check]
employment_type in yamada ? True
就業形態 in yamada ? False

[yamada employment_type unique]
{'臨時労働者', '一般労働者', '短時間労働者'}

[raw18 + employment_type 条件]
raw18_emp rows: 88849
yamada ⊂ raw18_emp ? True
一致数: 88849 / 88849
