# データ確認


- 調布市以外のデータを確認
    - データ数を確認
    - PoIの数、種類、分布を確認

In [None]:
import pandas as pd
import os
import json
import csv
import glob
import collections
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Print the current working directory (e.g. '/Users/username/projects').
print(os.getcwd())

### 0. データの読み込み

In [None]:
# Load

# Root of the project data; every path below is derived from it.
data_root = "/Users/osamu/study/poi_sim/data"

# List the prepared trajectory files. The directory has its own name so it is
# not confused with ``trace_path`` below, which points at a single CSV file.
exe_data_dir = os.path.join(data_root, "exe_data")
files = sorted(os.listdir(exe_data_dir))
# display(files)

# Inspect the contents of each data set (all, default, home, output).
trace_path = os.path.join(data_root, "origin", "trace", "20190201.csv")
default_path = os.path.join(exe_data_dir, "201902_week1.csv")
home_path = os.path.join(exe_data_dir, "201902_week1_home.csv")
output_path = os.path.join(exe_data_dir, "201902_week1_output.csv")

# The first CSV column is used as the row index of each frame.
trace_df = pd.read_csv(trace_path, index_col=0)
default_df = pd.read_csv(default_path, index_col=0)
home_df = pd.read_csv(home_path, index_col=0)
output_df = pd.read_csv(output_path, index_col=0)

def drop_unnamed_columns(df):
    """Return ``df`` without the columns whose label starts with 'Unnamed'."""
    is_unnamed = df.columns.str.startswith('Unnamed')
    return df.loc[:, ~is_unnamed]

# Run drop_unnamed_columns over all four frames in one pass.
trace_df, default_df, home_df, output_df = (
    drop_unnamed_columns(frame)
    for frame in (trace_df, default_df, home_df, output_df)
)

# Preview the first rows of the raw trace data.
display(trace_df.head(20))
# display(default_df.head(5))
# display(home_df.head(5))
# display(output_df.head(5))


# print("trace_df",len(trace_df))
# print("default_df",len(default_df))
# print("home_df",len(home_df))
# print("output_df",len(output_df))


In [None]:
# Print the per-citycode row counts of every loaded frame, in load order.
for label, frame in (
    ("trace_df", trace_df),
    ("default_df", default_df),
    ("home_df", home_df),
    ("output_df", output_df),
):
    print(label, frame["citycode"].value_counts())

### 1. 隣接市町村　
＊6桁目は検査数字（チェックディジット）。以下のコードの `city_map` では、検査数字を除いた先頭5桁（例: 世田谷区 = 13112）を `citycode` として扱う。
- 131121	世田谷区	せたがやく
- 132047	三鷹市	みたかし
- 132101	小金井市	こがねいし
- 132063	府中市	ふちゅうし
- 132195	狛江市	こまえし
- 132250	稲城市	いなぎし

#### 1.1 各市区のデータ数確認

In [None]:
# Display names of the cities of interest, keyed by the citycode value.
city_map = {
    13112.0: "世田谷区",
    13204.0: "三鷹市",
    13210.0: "小金井市",
    13206.0: "府中市",
    13219.0: "狛江市",
    13225.0: "稲城市"
}

# Row count per citycode, restricted to the codes listed in city_map.
counts = trace_df['citycode'].value_counts()
is_listed = counts.index.isin(list(city_map))
result = counts[is_listed]

# Flatten to a (citycode, city, count) table.
result_df = (
    result.rename_axis('citycode')
    .reset_index(name='count')
    .assign(city=lambda frame: frame['citycode'].map(city_map))
    .loc[:, ['citycode', 'city', 'count']]
)

print(result_df)

#### 1.2 PoI大カテゴリのユニーク値を確認

In [None]:
import re

# 文字列から先頭の要素（大分類）を抜き出す関数
# Extract the leading element (major category) from a list-like string.
def fast_get_major(x):
    """Return the first quoted item of a list-like string.

    Args:
        x: Any value. Only strings that start with "[" are parsed.

    Returns:
        The text of the first item when ``x`` starts with "[" and that item
        is a non-empty single- or double-quoted string; ``None`` when ``x``
        starts with "[" but no such item is found (e.g. "[]"); otherwise
        ``x`` unchanged (non-strings and plain strings pass through).
    """
    if not (isinstance(x, str) and x.startswith("[")):
        return x
    # A Python list repr quotes an item with double quotes when the item
    # itself contains an apostrophe, so both quote styles must be accepted.
    m = re.match(r"""\[(?:'([^']+)'|"([^"]+)")""", x)
    if m is None:
        return None
    return m.group(1) if m.group(1) is not None else m.group(2)

# Extract the major category of every PoI entry.
trace_df['poi_major'] = trace_df['poi'].map(fast_get_major)

# Distinct major categories.
unique_major = trace_df['poi_major'].unique()
# Missing values (NaN / None) cannot be ordered against strings, so they are
# left out of the sorted listing instead of letting sorted() raise TypeError.
print(sorted(value for value in unique_major if pd.notna(value)))


#### 1.3 隣接する各市区のPoI頻出上位１０件取得

In [None]:
import ast

def get_major_category(x):
    """Return the first element of a list literal stored as a string.

    Args:
        x: Any value. Only strings that start with "[" and end with "]" are
            parsed.

    Returns:
        The first element of the parsed literal, or ``None`` when the parsed
        literal is empty. ``x`` is returned unchanged when it is not such a
        string or when it cannot be parsed as a Python literal.
    """
    if not (isinstance(x, str) and x.startswith("[") and x.endswith("]")):
        return x
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Only the failures literal_eval raises on malformed input are treated
        # as "not a list literal"; KeyboardInterrupt and the like propagate.
        return x
    return parsed[0] if parsed else None

# Derive the major PoI category for every row.
trace_df['poi_major'] = trace_df['poi'].apply(get_major_category)

# Display names of the cities of interest, keyed by the citycode value.
city_map = {
    13112.0: "世田谷区",
    13204.0: "三鷹市",
    13210.0: "小金井市",
    13206.0: "府中市",
    13219.0: "狛江市",
    13225.0: "稲城市"
}
trace_df['city'] = trace_df['citycode'].map(city_map)

# Number of rows for each (city, major category) pair.
city_major_counts = (
    trace_df.groupby(['city', 'poi_major'])
    .size()
    .rename('count')
    .reset_index()
)

# Order by count (descending) inside each city, then keep the first ten rows
# of every city.
ranked_counts = city_major_counts.sort_values(
    by=['city', 'count'], ascending=[True, False]
)
top10_each_city = ranked_counts.groupby('city').head(10)

print(top10_each_city)


#### 1.4 小分類までみる

In [None]:
# Attach the display name of each listed city (codes not in city_map get NaN).
trace_df['city'] = trace_df['citycode'].map(city_map)

# Ten most frequent raw `poi` values per city.
# On pandas >= 2.0 the index of value_counts() is named after the Series
# ('poi'), which collides with the Series name in a bare reset_index() and
# raises "cannot insert poi, already exists". Naming both index levels and the
# value column explicitly gives the same (city, poi, count) table on 1.x and 2.x.
top_pois = (
    trace_df.groupby('city')['poi']
    .apply(lambda x: x.value_counts().head(10))
    .rename_axis(['city', 'poi'])
    .reset_index(name='count')
)

print(top_pois)

#### 1.5 homeが調布市である隣接各市区のデータ数

In [None]:
# Rows whose citycode equals 13208.
trace_df.loc[trace_df['citycode'].eq(13208)]

In [None]:
# Distinct home_citycode values present in the trace data.
trace_df['home_citycode'].unique()

In [None]:
chofu_code = 13208.0

# Keep only the rows whose home_citycode equals chofu_code.
chofu_home = trace_df.loc[trace_df['home_citycode'].eq(chofu_code)]

# Count rows per citycode, largest first, and attach the city name where
# city_map has an entry for the code.
chofu_counts = (
    chofu_home.groupby('citycode')
    .size()
    .rename('count')
    .reset_index()
    .sort_values(by='count', ascending=False)
    .assign(city=lambda frame: frame['citycode'].map(city_map))
)

print(chofu_counts)
