# processed データの中身を確認する

- home, output, なし, pre_output の違い
- 遷移確率計算に使用可能か
- データ数の担保

In [None]:
from pathlib import Path
import pandas as pd

## 各データを読み込む

In [None]:
def _load_and_preview(csv_path):
    """Read a CSV using its first column as the index, display its head, and return it."""
    frame = pd.read_csv(csv_path, index_col=0)
    display(frame.head())
    return frame


# Load the "home" dataset
data_home = Path("/Users/y-osamu/study/poi_sim/data/processed/201902_04/201902_week1_home.csv")
trace_home_df = _load_and_preview(data_home)

# Load the "output" dataset
data_output = Path("/Users/y-osamu/study/poi_sim/data/processed/201902_04/201902_week1_output.csv")
trace_output_df = _load_and_preview(data_output)

# Load the un-suffixed file, which this notebook refers to as "raw"
data_raw = Path("/Users/y-osamu/study/poi_sim/data/processed/201902_04/201902_week1.csv")
trace_raw_df = _load_and_preview(data_raw)

## 各データの基本情報を収集


In [None]:
# Print the shape and per-column dtypes of each dataset
labelled_frames = (
    ("raw", trace_raw_df),
    ("home", trace_home_df),
    ("output", trace_output_df),
)
for name, df in labelled_frames:
    # One print call with a newline separator emits the same three lines
    print(f"\n=== {name} ===", df.shape, df.dtypes, sep="\n")

In [None]:
# Column-set differences between raw and each of the other two datasets
raw_cols = set(trace_raw_df.columns)
home_cols = set(trace_home_df.columns)
output_cols = set(trace_output_df.columns)

print("raw - home:", raw_cols.difference(home_cols))
print("home - raw:", home_cols.difference(raw_cols))

print("raw - output:", raw_cols.difference(output_cols))
print("output - raw:", output_cols.difference(raw_cols))

## raw vs home

In [None]:
# Key combinations that exist in raw but not in home
key = ["dailyid", "hour", "min"]

# Distinct key combinations of each dataset
raw_keys = trace_raw_df.loc[:, key].drop_duplicates()
home_keys = trace_home_df.loc[:, key].drop_duplicates()

# Left join with an origin indicator, then keep the keys found only on the raw side
raw_keys_tagged = pd.merge(raw_keys, home_keys, on=key, how="left", indicator=True)
lost_raw_to_home = raw_keys_tagged[raw_keys_tagged["_merge"] == "left_only"]

lost_raw_to_home.shape

In [None]:
# Preview the raw rows whose key combination is absent from home
lost_keys_only = lost_raw_to_home[key]
pd.merge(trace_raw_df, lost_keys_only, on=key, how="inner").head()


In [None]:
# raw -> home
# Per-value counts of `judge` (NaN counted too) in raw and in home, side by side
judge_counts_raw = trace_raw_df["judge"].value_counts(dropna=False).rename("raw")
judge_counts_home = trace_home_df["judge"].value_counts(dropna=False).rename("home")

# The outer join keeps values seen in only one dataset; their missing count becomes 0
judge_diff = judge_counts_raw.to_frame().join(judge_counts_home, how="outer")
judge_diff = judge_diff.fillna(0).astype(int)

judge_diff["diff_raw_minus_home"] = judge_diff["raw"].sub(judge_diff["home"])

judge_diff.sort_values("diff_raw_minus_home", ascending=False)



In [None]:
# Per-value counts of 大分類 (NaN counted too) in raw and in home, side by side
bigcat_counts_raw = trace_raw_df["大分類"].value_counts(dropna=False).rename("raw")
bigcat_counts_home = trace_home_df["大分類"].value_counts(dropna=False).rename("home")

# The outer join keeps categories seen in only one dataset; their missing count becomes 0
bigcat_diff = bigcat_counts_raw.to_frame().join(bigcat_counts_home, how="outer")
bigcat_diff = bigcat_diff.fillna(0).astype(int)

bigcat_diff["diff_raw_minus_home"] = bigcat_diff["raw"].sub(bigcat_diff["home"])

bigcat_diff.sort_values("diff_raw_minus_home", ascending=False)


In [None]:
# Distribution of home_flag in the home dataset, with NaN counted as its own value
home_flag_counts = trace_home_df["home_flag"].value_counts(dropna=False)
home_flag_counts


In [None]:
# Relationship between home_flag and 大分類 (the most important check)
# Count rows per (大分類, home_flag) pair, then pivot home_flag into columns
pair_counts = trace_home_df.groupby(["大分類", "home_flag"]).size()
homeflag_by_cat = pair_counts.unstack(fill_value=0)

# Order by the True column when it exists, otherwise by the first column
if True in homeflag_by_cat.columns:
    sort_column = True
else:
    sort_column = homeflag_by_cat.columns[0]

homeflag_by_cat.sort_values(sort_column, ascending=False)


## home vs output

In [None]:
# home_flag distribution (NaN included) for home first, then output
for frame in (trace_home_df, trace_output_df):
    print(frame["home_flag"].value_counts(dropna=False))

In [None]:
# Rows of home whose key combination is missing from output
key = ["dailyid", "hour", "min"]

# De-duplicate the output keys before joining. A key that occurs several times
# in output would otherwise repeat every matching home row and inflate the
# `both` count; this matches how the raw -> home comparison builds its keys.
output_keys = trace_output_df[key].drop_duplicates()

# Each home row appears exactly once, tagged `both` (key found in output)
# or `left_only` (key missing from output)
lost = trace_home_df.merge(
    output_keys,
    on=key,
    how="left",
    indicator=True,
)

lost["_merge"].value_counts()



In [None]:
# home_flag distribution (NaN included) among the home rows with no match in output
only_in_home = lost["_merge"] == "left_only"
lost.loc[only_in_home, "home_flag"].value_counts(dropna=False)


In [None]:
# Reverse check (output -> home): does every output row have its key in home?
# The home keys are de-duplicated first so that a key repeated in home cannot
# multiply output rows and inflate the `both` count.
home_key_set = trace_home_df[key].drop_duplicates()

trace_output_df.merge(
    home_key_set,
    on=key,
    how="left",
    indicator=True,
)["_merge"].value_counts()

In [None]:
# Per-value counts of 大分類 (NaN counted too) in home and in output, side by side
bigcat_counts_in_home = trace_home_df["大分類"].value_counts(dropna=False).rename("home")
bigcat_counts_in_output = trace_output_df["大分類"].value_counts(dropna=False).rename("output")

# The outer join keeps categories seen in only one dataset; their missing count becomes 0
bigcat_diff_home_output = bigcat_counts_in_home.to_frame().join(
    bigcat_counts_in_output, how="outer"
)
bigcat_diff_home_output = bigcat_diff_home_output.fillna(0).astype(int)

bigcat_diff_home_output["diff_home_minus_output"] = bigcat_diff_home_output["home"].sub(
    bigcat_diff_home_output["output"]
)

bigcat_diff_home_output.sort_values("diff_home_minus_output", ascending=False)


In [None]:
# Print the full 大分類 frequency table (NaN included, most frequent first)
# for home and then for output; to_string() avoids truncated output
for banner, frame in (
    ("=== home : 大分類 value_counts ===", trace_home_df),
    ("\n=== output : 大分類 value_counts ===", trace_output_df),
):
    print(banner)
    category_counts = frame["大分類"].value_counts(dropna=False)
    print(category_counts.sort_values(ascending=False).to_string())



In [None]:
# Count missing values in home_flag and home_citycode for home and for output
checked_cols = ["home_flag", "home_citycode"]
for banner, frame in (
    ("=== home_df null counts ===", trace_home_df),
    ("\n=== output_df null counts ===", trace_output_df),
):
    print(banner)
    print(frame[checked_cols].isna().sum())




In [None]:
# home_flag distribution (NaN included) for home and for output, each under a header
for banner, frame in (
    ("\n=== home_df : home_flag value counts ===", trace_home_df),
    ("\n=== output_df : home_flag value counts ===", trace_output_df),
):
    print(banner)
    print(frame["home_flag"].value_counts(dropna=False))

In [None]:
# Consistency check between home_flag and home_citycode in the home dataset
home_flag = trace_home_df["home_flag"]
citycode_missing = trace_home_df["home_citycode"].isna()

# Rows flagged True that nevertheless lack a home_citycode
cond = home_flag.eq(True) & citycode_missing
print("\n=== home_flag=True & home_citycode is NaN (home_df) ===")
print(cond.sum())

# Rows flagged False that nevertheless carry a home_citycode
cond = home_flag.eq(False) & ~citycode_missing
print("\n=== home_flag=False & home_citycode not NaN (home_df) ===")
print(cond.sum())


## raw / home / output の違い

In [None]:
# Row counts per value of `hour`, ordered by hour, for raw, home and output
for banner, frame in (
    ("=== raw : count per hour ===", trace_raw_df),
    ("\n=== home : count per hour ===", trace_home_df),
    ("\n=== output : count per hour ===", trace_output_df),
):
    print(banner)
    print(frame["hour"].value_counts().sort_index())


## 生データを確認

In [None]:
# Load one day of raw trace data
trace_path = Path("/Users/y-osamu/study/poi_sim/data/raw/trace/sktrace(old)/20190201.csv")

# Read with the first CSV column as the index, turn that index back into an
# ordinary column, then drop the column named 'Unnamed: 0'
trace_df = (
    pd.read_csv(trace_path, index_col=0)
    .reset_index()
    .drop(columns=["Unnamed: 0"])
)

trace_df.head()

In [None]:
# Report the number of rows in trace_df
print(f"trace_dfのデータ数: {trace_df.shape[0]}")

In [None]:
# List every column name of trace_df, one per line
for col_name in trace_df.columns:
    print(col_name)

In [None]:
# Number of distinct non-null values in each column, as a two-column table
# with headers "column" and "unique_count"
distinct_per_column = trace_df.nunique(dropna=True)
unique_count_df = distinct_per_column.rename_axis("column").reset_index(name="unique_count")

display(unique_count_df)



In [None]:
# Distinct values of the poi column
unique_pois = pd.unique(trace_df["poi"])

print(f"poiカラムのユニークな値の数: {len(unique_pois)}")
print("ユニークなpoiの一覧:")
# The array repr may be abbreviated for long arrays
print(unique_pois)

# Print every value on its own line as well
for poi in unique_pois:
    print(poi)

In [None]:
# Look at the trajectory rows of a few individual dailyid values

# Distinct, non-null dailyid values
distinct_ids = trace_df["dailyid"].dropna().drop_duplicates()

# Draw at most 10 ids. Capping n avoids the ValueError that sample() raises
# when fewer than 10 distinct ids exist; with 10 or more, the fixed seed
# yields the same draw as an uncapped n=10.
sample_ids = distinct_ids.sample(n=min(10, len(distinct_ids)), random_state=42)

# Map each sampled dailyid to the sub-frame holding its rows
sampled_rows = trace_df[trace_df["dailyid"].isin(sample_ids)]
by_person_sample = {pid: g for pid, g in sampled_rows.groupby("dailyid")}

# Uncomment to display each sampled trajectory:
# for pid, df_person in by_person_sample.items():
#     print(f"--- {pid} ---")
#     display(df_person)


In [None]:
# Distinct values of trace_df's home_citycode column (unique() keeps NaN as a value)
unique_homecitycodes = pd.unique(trace_df["home_citycode"])

print(f"homecitycodeカラムのユニークな値の数: {len(unique_homecitycodes)}")
print("ユニークなhomecitycodeの一覧:")
print(unique_homecitycodes)