In [9]:
#!/usr/bin/env python
# coding: utf-8

# In[30]:


import json
import re
import gc
import pickle
import itertools
import pandas as pd
import numpy as np
from glob import glob
import os
from datetime import datetime as dt
from pathlib import Path
from tqdm import tqdm
import datetime
ts_conv = np.vectorize(datetime.datetime.fromtimestamp) # ut(10 digit) -> date

# pandas settings -----------------------------------------
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.options.display.float_format = '{:,.5f}'.format

# Graph drawing -------------------------------------------
import matplotlib
from matplotlib import font_manager
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rc
from matplotlib_venn import venn2, venn2_circles
from matplotlib import animation as ani
from IPython.display import Image
from pylab import imread

plt.rcParams["patch.force_edgecolor"] = True
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns
sns.set(style="whitegrid", palette="muted", color_codes=True)
sns.set_style("whitegrid", {'grid.linestyle': '--'})
red = sns.xkcd_rgb["light red"]
green = sns.xkcd_rgb["medium green"]
blue = sns.xkcd_rgb["denim blue"]

get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format='retina'")

# ML -------------------------------------------
from sklearn.preprocessing import LabelEncoder


# In[31]:


def unpickle(filename):
    with open(filename, 'rb') as fo:
        p = pickle.load(fo)
    return p

def to_pickle(filename, obj):
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, -1)



class FeatureStore():
    
    # necessayr to re-check
    floor_convert = {'1F' :  0, '2F' : 1, '3F' : 2, '4F' : 3, '5F' : 4, 
                     '6F' : 5, '7F' : 6, '8F' : 7, '9F' : 8,
                     'B'  : -1, 'B1' : -1, 'B2' : -2, 'B3' : -3, 
                     'BF' : -1, 'BM' : -1, 
                     'F1' : 0, 'F2' : 1, 'F3' : 2, 'F4' : 3, 'F5' : 4, 
                     'F6' : 5, 'F7' : 6, 'F8' : 7, 'F9' : 8, 'F10': 9,
                     'L1' : 0, 'L2' : 1, 'L3' : 2, 'L4' : 3, 'L5' : 4, 
                     'L6' : 5, 'L7' : 6, 'L8' : 7, 'L9' : 8, 'L10': 9, 
                     'L11': 10,
                     'G'  : 0, 'LG1': 0, 'LG2': 1, 'LM' : 0, 'M'  : 0, 
                     'P1' : 0, 'P2' : 1,}
    
    df_types = ['accelerometer',
                'accelerometer_uncalibrated',
                'beacon',
                'gyroscope',
                'gyroscope_uncalibrated',
                'magnetic_field',
                'magnetic_field_uncalibrated',
                'rotation_vector',
                'waypoint',
                'wifi']
    
    # https://github.com/location-competition/indoor-location-competition-20
    df_type_cols = {'accelerometer': ["timestamp", "x", "y", "z", "accuracy"],
                'accelerometer_uncalibrated': ["timestamp", "x", "y", "z", 
                                               "x2", "y2", "z2", "accuracy" ],
                'beacon': ["timestamp", "uuid", "major_id", "minor_id", "tx_power", 
                           "rssi", "distance", "mac_addr", "timestamp2"],
                'gyroscope': ["timestamp", "x", "y", "z", "accuracy"],
                'gyroscope_uncalibrated': ["timestamp", "x", "y", "z", 
                                           "x2", "y2", "z2", "accuracy" ],
                'magnetic_field': ["timestamp", "x", "y", "z", "accuracy"],
                'magnetic_field_uncalibrated': ["timestamp", "x", "y", "z", 
                                                "x2", "y2", "z2", "accuracy" ],
                'rotation_vector': ["timestamp", "x", "y", "z", "accuracy"],
                'waypoint': ["timestamp", "x", "y"],
                'wifi': ["timestamp", "ssid", "bssid","rssi","frequency",
                         "last_seen_timestamp",]}

    dtype_dict = {}
    dtype_dict["accelerometer"] = {"timestamp":int, "x":float, "y":float, "z":float, 
                                   "accuracy":int}
    dtype_dict["accelerometer_uncalibrated"] = {"timestamp":int, "x":float, "y":float, 
                                                "z":float, "x2":float, "y2":float, 
                                                "z2":float, "accuracy":int}
    dtype_dict["beacon"] = {"timestamp":int, "uuid":str, "major_id":str, 
                            "minor_id":str, "tx_power":int,  "rssi":int, 
                            "distance":float, "mac_addr":str, "timestamp2":int}
    dtype_dict["gyroscope"] = {"timestamp":int, "x":float, "y":float, "z":float, 
                               "accuracy":int}
    dtype_dict["gyroscope_uncalibrated"] = {"timestamp":int, "x":float, "y":float, 
                                            "z":float, "x2":float, "y2":float, 
                                            "z2":float, "accuracy":int}
    dtype_dict["magnetic_field"] = {"timestamp":int, "x":float, "y":float, 
                                    "z":float, "accuracy":int}
    dtype_dict["magnetic_field_uncalibrated"] = {"timestamp":int, "x":float, 
                                                 "y":float, "z":float, "x2":float, 
                                                 "y2":float, "z2":float, "accuracy":int}
    dtype_dict["rotation_vector"] = {"timestamp":int, "x":float, "y":float, 
                                     "z":float, "accuracy":int}
    dtype_dict["waypoint"] = {"timestamp":int, "x":float, "y":float, "z":float}
    dtype_dict["wifi"] = {"timestamp":int, "ssid":str, "bssid":str,
                          "rssi":int,"frequency":int, "last_seen_timestamp":int}

    def __init__(self, site_id, floor, path_id, 
                 input_path="../input/indoor-location-navigation/",
                 save_path="../mid"):
        self.site_id = site_id.strip()
        self.floor = floor.strip()
        self.n_floor = 0 #self.floor_convert[self.floor]
        self.path_id = path_id.strip()
        
        self.input_path = input_path
        assert Path(input_path).exists(), f"input_path do not exist: {input_path}"
        
        self.save_path = save_path
        Path(save_path).mkdir(parents=True, exist_ok=True)
        
        self.site_info = SiteInfo(site_id=self.site_id, floor=self.floor, input_path=self.input_path)
        
    def _flatten(self, l):
        return list(itertools.chain.from_iterable(l))
    
    def multi_line_spliter(self, s):
        matches = re.finditer("TYPE_", s)
        matches_positions = [match.start() for match in matches]
        split_idx = [0] + [matches_positions[i]-14 for i in range(1, len(matches_positions))] + [len(s)]
        return [s[split_idx[i]:split_idx[i+1]] for i in range(len(split_idx)-1)]
    
    def load_df(self, ):
        path = str(Path(self.input_path)/f"test/{self.path_id}.txt")
        with open(path) as f:
            data = f.readlines()
        
        modified_data = []
        for s in data:
            if s.count("TYPE_")>1:
                lines = self.multi_line_spliter(s)
                modified_data.extend(lines)
            else:
                modified_data.append(s)
        del data
        self.meta_info_len = len([d for d in modified_data if d[0]=="#"])
        self.meta_info_df = pd.DataFrame([m.replace("\n", "").split(":") 
                                          for m in self._flatten([d.split("\t") 
                                                                  for d in modified_data if d[0]=="#"]) if m!="#"])

        data_df = pd.DataFrame([d.replace("\n", "").split("\t") for d in modified_data if d[0]!="#"])
        for dt in self.df_types:
            # select data type
            ##ここ
            df_s = data_df[data_df[1]==f"TYPE_{dt.upper()}"]
            if len(df_s)==0:
                setattr(self, dt, pd.DataFrame(columns=self.df_type_cols[dt]))
            else:
                # remove empty cols
                na_info = df_s.isna().sum(axis=0) == len(df_s)
                df_s = df_s[[i for i in na_info[na_info==False].index if i!=1]].reset_index(drop=True)
                
                if len(df_s.columns)!=len(self.df_type_cols[dt]):
                    df_s.columns = self.df_type_cols[dt][:len(df_s.columns)]
                else:
                    df_s.columns = self.df_type_cols[dt]
            
                # set dtype          
                for c in df_s.columns:
                    
                    #ここ
                    df_s[c] = df_s[c].astype(self.dtype_dict[dt][c])
                                     
                # set DataFrame to attr
                setattr(self, dt, df_s)
    
    def get_site_info(self, keep_raw=False):
        self.site_info.get_site_info(keep_raw=keep_raw)
            
    def load_all_data(self, keep_raw=False):     
        self.load_df()
        self.get_site_info(keep_raw=keep_raw)
        
    def __getitem__(self, item):
        if item in self.df_types:
            return getattr(self, item)
        else:
            return None
    
    def save(self, ):
        # to be implemented
        pass
    
    
class SiteInfo():
    def __init__(self, site_id, floor, input_path="../input/indoor-location-navigation/"):
        self.site_id = site_id
        self.floor = floor
        self.input_path = input_path
        assert Path(input_path).exists(), f"input_path do not exist: {input_path}"
        
    def get_site_info(self, keep_raw=False):
        floor_info_path = "../input/indoor-location-navigation/metadata/5a0546857ecc773753327266/B1/floor_info.json"
        with open(floor_info_path, "r") as f:
            self.floor_info = json.loads(f.read())
            self.site_height = self.floor_info["map_info"]["height"]
            self.site_width = self.floor_info["map_info"]["width"]
            if not keep_raw:
                del self.floor_info
            
        geojson_map_path = "../input/indoor-location-navigation/metadata/5a0546857ecc773753327266/B1/geojson_map.json"
        with open(geojson_map_path, "r") as f:
            self.geojson_map = json.loads(f.read())
            self.map_type = self.geojson_map["type"]
            self.features = self.geojson_map["features"]
            
            self.floor_coordinates = self.features[0]["geometry"]["coordinates"]
            self.store_coordinates = [self.features[i]["geometry"]["coordinates"] 
                                          for i in range(1, len(self.features))]
                
            if not keep_raw:
                del self.geojson_map
    
    def show_site_image(self):
        path = "../input/indoor-location-navigation/metadata/5a0546857ecc773753327266/B1/floor_image.png"
        plt.imshow(imread(path), extent=[0, self.site_width, 0, self.site_height])

    def draw_polygon(self, size=8, only_floor=False):

        fig = plt.figure()
        ax = plt.subplot(111)
            
        xmax, xmin, ymax, ymin = self._draw(self.floor_coordinates, ax, calc_minmax=True)
        if not only_floor:
            self._draw(self.store_coordinates, ax, fill=True)
        plt.legend([])
        
        xrange = xmax - xmin
        yrange = ymax - ymin
        ratio = yrange / xrange
        
        self.x_size = size
        self.y_size = size*ratio

        fig.set_figwidth(size)
        fig.set_figheight(size*ratio)
        # plt.show()
        return ax
        
    def _draw(self, coordinates, ax, fill=False, calc_minmax=False):
        xmax, ymax = -np.inf, -np.inf
        xmin, ymin = np.inf, np.inf
        for i in range(len(coordinates)):
            ndim = np.ndim(coordinates[i])
            if ndim==2:
                corrd_df = pd.DataFrame(coordinates[i])
                if fill:
                    ax.fill(corrd_df[0], corrd_df[1], alpha=0.7)
                else:
                    corrd_df.plot.line(x=0, y=1, style="-", ax=ax)
                        
                if calc_minmax:
                    xmax = max(xmax, corrd_df[0].max())
                    xmin = min(xmin, corrd_df[0].min())

                    ymax = max(ymax, corrd_df[1].max())
                    ymin = min(ymin, corrd_df[1].min())
            elif ndim==3:
                for j in range(len(coordinates[i])):
                    corrd_df = pd.DataFrame(coordinates[i][j])
                    if fill:
                        ax.fill(corrd_df[0], corrd_df[1], alpha=0.6)
                    else:
                        corrd_df.plot.line(x=0, y=1, style="-", ax=ax)
                        
                    if calc_minmax:
                        xmax = max(xmax, corrd_df[0].max())
                        xmin = min(xmin, corrd_df[0].min())

                        ymax = max(ymax, corrd_df[1].max())
                        ymin = min(ymin, corrd_df[1].min())
            else:
                assert False, f"ndim of coordinates should be 2 or 3: {ndim}"
        if calc_minmax:
            return xmax, xmin, ymax, ymin
        else:
            return None
         


# In[32]:


# site_meta_data
site_meta_data = pd.DataFrame([[p.split("/")[-2], p.split("/")[-1]] for p in glob("../input/indoor-location-navigation/metadata/**/*")])
site_meta_data.columns = ["site_id", "floor"]
site_meta_data.head()



Unnamed: 0,site_id,floor
0,5cd56c0ce2acfd2d33b6ab27,B1
1,5cd56c0ce2acfd2d33b6ab27,F3
2,5cd56c0ce2acfd2d33b6ab27,F2
3,5cdbc652853bc856e89a8694,B1
4,5cdbc652853bc856e89a8694,F1


In [10]:
'''
# train_meta_data
#globはパスをlist形式で取得できる。train_metaにtrain以下のパスの文字列を全部入れている。
train_meta = glob("../input/indoor-location-navigation/train/*/*/*")

#train_metaをpdの形式にする。（4桁行×1列の表形式）
train_meta_org = pd.DataFrame(train_meta)

#train_meta_orgはパス名が入ってるので、パス名から建物名と、フロア名、ファイル名（＝＝.txt）を取り出す。
#以下の形
#0      5cd56c0ce2acfd2d33b6ab27   B1  5d09a625bd54340008acddb9.txt
#1      5cd56c0ce2acfd2d33b6ab27   B1  5d09a625bd54340008acddb7.txt
train_meta = train_meta_org[0].str.split("/", expand=True)[[4, 5, 6]]

#列名を付けてあげる
train_meta.columns = ["site_id", "floor", "path_id"]

#.txtをとる
train_meta["path_id"] = train_meta["path_id"].str.replace(".txt", "")

#path列を追加して、そこにパスを代入する（train_meta_orgの0列目にパスが入ってる）
train_meta["path"] = train_meta_org[0]
#train_meta.head()
'''

'\n# train_meta_data\n#globはパスをlist形式で取得できる。train_metaにtrain以下のパスの文字列を全部入れている。\ntrain_meta = glob("../input/indoor-location-navigation/train/*/*/*")\n\n#train_metaをpdの形式にする。（4桁行×1列の表形式）\ntrain_meta_org = pd.DataFrame(train_meta)\n\n#train_meta_orgはパス名が入ってるので、パス名から建物名と、フロア名、ファイル名（＝＝.txt）を取り出す。\n#以下の形\n#0      5cd56c0ce2acfd2d33b6ab27   B1  5d09a625bd54340008acddb9.txt\n#1      5cd56c0ce2acfd2d33b6ab27   B1  5d09a625bd54340008acddb7.txt\ntrain_meta = train_meta_org[0].str.split("/", expand=True)[[4, 5, 6]]\n\n#列名を付けてあげる\ntrain_meta.columns = ["site_id", "floor", "path_id"]\n\n#.txtをとる\ntrain_meta["path_id"] = train_meta["path_id"].str.replace(".txt", "")\n\n#path列を追加して、そこにパスを代入する（train_meta_orgの0列目にパスが入ってる）\ntrain_meta["path"] = train_meta_org[0]\n#train_meta.head()\n'

In [11]:

#test_metaの作成

sample_sub = pd.read_csv('../input/indoor-location-navigation/sample_submission.csv')
test_sites = sample_sub.site_path_timestamp.apply(lambda x: pd.Series(x.split("_")))[0].unique().tolist()

test_meta = sample_sub["site_path_timestamp"].apply(
    lambda x: pd.Series(x.split("_")))
test_meta.columns = ["site_id", "path_id", "timestamp"]
#test_meta=test_meta.drop('timestamp', axis=1)
test_meta = test_meta.drop_duplicates(subset=["site_id", "path_id"]).reset_index(drop=True)

test_meta 


#floorだけない


# In[38]:


def pickle_dump_dill(obj, path):
    with open(path, mode='wb') as f:
        dill.dump(obj, f)


def pickle_load_dill(path):
    with open(path, mode='rb') as f:
        data = dill.load(f)
        return data

In [12]:
#featureで繰り返す数を抽出
sample_sub = pd.read_csv('../input/indoor-location-navigation/sample_submission.csv')
test_sites = sample_sub.site_path_timestamp.apply(lambda x: pd.Series(x.split("_")))[0].unique().tolist()

test_meta = sample_sub["site_path_timestamp"].apply(lambda x: pd.Series(x.split("_")))
test_meta.columns = ["site_id", "path_id", "timestamp"]
#test_meta=test_meta.drop('timestamp', axis=1)
test_meta['site+path'] = test_meta['site_id'].str.cat(test_meta['path_id'])
#test_meta = test_meta.drop_duplicates(subset=["site_id", "path_id"]).reset_index(drop=True)

#site.pathに入ってるwaypointの数をsite＿path:個数でdictに格納
#あとでsite_pathで検索して回数を取り出す
count_dict = test_meta['site+path'].value_counts(sort=False).to_dict()

In [13]:
#train_meta_sub.head(50)

In [14]:
#a=train_meta_sub[train_meta_sub['site+path']=='5a0546857ecc7737533272660c06cc9f21d172618d74c6c8']

In [15]:
#a.index[0]

In [None]:

create_train_meta_sub=True
row=0

if create_train_meta_sub:
    #train_get_row_n = train_meta[train_meta['site_id'].isin(test_sites)].reset_index(drop=True)
    #train_meta_sub['site_id']=0
    train_meta_sub = test_meta[test_meta['site_id'].isin(test_sites)].reset_index(drop=True)

#                        site_id floor                   path_id  \
#0      5da138764db8ce0c98bcaa46    B1  5dabdb5e18410e00067e6fe2   
#                            ↓
#site_id                   path_id
#0    5a0546857ecc773753327266  046cfa46be49fc10834815c6

#列を作成
    my_train = pd.DataFrame()
    my_train['site_id']=0
    my_train['floor']=0
    my_train['path_id']=0
    my_train['path']=0
    
    #for i in tqdm(range(len(train_meta_sub))):
    for i in tqdm(range(0,10132)):
        
        
        
        feature = FeatureStore(site_id=train_meta_sub.iloc[i]['site_id'], floor="BF", path_id=train_meta_sub.iloc[i]['path_id'])
        feature.load_all_data() 
        #start_time=int(feature.meta_info_df[feature.meta_info_df[0]=='startTime'][1])
        #end_time=int(feature.meta_info_df[feature.meta_info_df[0]=='endTime'][1])

        k=0
        m=0
        wifi_unique_timestamp = feature.wifi["timestamp"].unique()
        site_path = train_meta_sub.iloc[i]['site_id']+train_meta_sub.iloc[i]['path_id']
        my_train.loc[row,'site_id']=train_meta_sub.iloc[i]['site_id']
        my_train.loc[row,'floor']= 999
        my_train.loc[row,'path_id']=train_meta_sub.iloc[i]['path_id']
        waypoint_time = int(train_meta_sub.iloc[i]['timestamp'])
        my_train.loc[row,'wp_time']=waypoint_time
        
        #for j in range(count_dict[site_path]):
    
            #waypoint_time = int(train_meta_sub.iloc[i]['timestamp'])
            #my_train.loc[row,'site_id']=feature.site_id
            #my_train.loc[row,'floor']=feature.n_floor
            #my_train.loc[row,'path_id']=feature.path_id
            #my_train.loc[row,'path']=feature.input_path
            #my_train.loc[row,'wp_time']=waypoint_time
            #my_train.loc[row,'x']=feature.waypoint.iloc[j]['x']
            #my_train.loc[row,'y']=feature.waypoint.iloc[j]['y']
            #my_train.loc[row,'start_time']=start_time
            #my_train.loc[row,'end_time']=end_time
            #my_train.loc[row,'next_to_acce_X']=0
            #my_train.loc[row,'next_to_acce_Y']=0
            #my_train.loc[row,'next_to_acce_Z']=0
            
            #wifiが近い5件を追加する。（wifi[0]には一番近いデータが入っている）
        wifi_time_dict = {}
        for m in range(len(wifi_unique_timestamp)):
            wifi_abs=abs(wifi_unique_timestamp[m]-waypoint_time)
            wifi_time_dict[wifi_unique_timestamp[m]] = wifi_abs
                    
        if not len(wifi_time_dict) == 0:
            #一番差が小さいtimestampを抽出
            wifi_nearest = min(wifi_time_dict, key=wifi_time_dict.__getitem__)
            wifi_nearest_time = feature.wifi.index[feature.wifi['timestamp'] == wifi_nearest]
            #print(wifi_nearest_time)
            if len(wifi_nearest_time)<=5:
                fornum = len(wifi_nearest_time)
            else:
                fornum = 5
            for n in range(fornum):
                my_train.loc[row,'wifi_timestamp']=feature.wifi.iloc[wifi_nearest_time[n]]['timestamp']
                        #my_train.loc[row,'wifi_ssid' + str(n)]=feature.wifi.iloc[m+n]['ssid']]
                my_train.loc[row,'wifi_bssid' + str(n)]=feature.wifi.iloc[wifi_nearest_time[n]]['bssid']
                my_train.loc[row,'wifi_rssi' + str(n)]=feature.wifi.iloc[wifi_nearest_time[n]]['rssi']
                    
                my_train.loc[row,'wifi_frequency' + str(n)]=feature.wifi.iloc[wifi_nearest_time[n]]['frequency']
            
            
            #beacon(直近の時間)を追加する。（複数のbeaconを追加するとか考えられるかも）
        for m in range(len(feature.beacon)):
            beacon_time = feature.beacon.iloc[m]["timestamp"]
                #ここのif文は改良の余地あり。timestampが超えても、よりtimeが近いデータがあればそちらを採用した方が良い。
            if beacon_time < waypoint_time:
                my_train.loc[row,'beacon_timestamp']=feature.beacon.iloc[m]['timestamp']
                    #my_train.loc[row,'wifi_ssid' + str(n)]=feature.wifi.iloc[m+n]['ssid']
                my_train.loc[row,'beacon_minorid']=feature.beacon.iloc[m]['minor_id']
                my_train.loc[row,'beacon_rssi']=feature.beacon.iloc[m]['rssi']
                my_train.loc[row,'beacon_distance']=feature.beacon.iloc[m]['distance']
            else:
                break
            #なんか有効な特徴量を追加する。timestampがtimestamp<wp_timeとなっているデータを特徴量とする。
            
            
        x=0
        y=0
        z=0
        for k in range(len(feature.accelerometer)-1):
            acce_time = feature.accelerometer.iloc[k]["timestamp"]
                
                #test_dataの時は、waypoint_time=submission_dataのtimeなので処理が必要
  
                #xyzの時間は1個後のwaypointに対応したもの。
            #if not i+k+1==len(feature.waypoint):

            if my_train.loc[row,'wifi_timestamp']<acce_time<waypoint_time:
                x += (feature.accelerometer.iloc[k+1]["timestamp"]-acce_time) * feature.accelerometer.iloc[k]["x"]
                y += (feature.accelerometer.iloc[k+1]["timestamp"]-acce_time) * feature.accelerometer.iloc[k]["y"]
                z += (feature.accelerometer.iloc[k+1]["timestamp"]-acce_time) * feature.accelerometer.iloc[k]["z"]
                my_train.loc[row,'next_to_acce_X']= x
                my_train.loc[row,'next_to_acce_Y']= y
                my_train.loc[row,'next_to_acce_Z']= z
                    
                    #こっちがメイン
            elif my_train.loc[row,'wifi_timestamp']>acce_time>waypoint_time:
                x +=  (feature.accelerometer.iloc[k+1]["timestamp"]-acce_time) * feature.accelerometer.iloc[k]["x"]
                y += (feature.accelerometer.iloc[k+1]["timestamp"]-acce_time) * feature.accelerometer.iloc[k]["y"]
                z += (feature.accelerometer.iloc[k+1]["timestamp"]-acce_time) * feature.accelerometer.iloc[k]["z"]
                my_train.loc[row,'next_to_acce_X']= x
                my_train.loc[row,'next_to_acce_Y']= y
                my_train.loc[row,'next_to_acce_Z']= z
            
        
        row += 1
            
    my_train.to_csv('test0-10132.csv', index=False)
else:
    my_train = pd.read_csv('../input/indoor-public/my_train.csv')

  9%|▉         | 954/10132 [41:26<32:32:29, 12.76s/it]

In [None]:
my_train.head(50)