In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
import time as T
import pyproj

In [2]:
ndf = pd.read_csv('data.csv')
ndf

Unnamed: 0,ID,user_id,latitude,longitude,flag,time
0,37,0,30.216216,-81.506529,True,2022-09-09 01:01:29
1,38,0,30.216200,-81.506517,True,2022-09-09 01:01:53
2,39,0,30.216216,-81.506529,True,2022-09-09 01:02:35
3,40,0,30.216216,-81.506529,True,2022-09-09 01:03:13
4,41,0,30.216216,-81.506529,True,2022-09-09 01:03:54
...,...,...,...,...,...,...
3917132,5844351,12419,30.382490,-81.611984,True,2022-10-27 01:08:48
3917133,5844352,12419,30.382490,-81.611984,True,2022-10-27 01:09:53
3917134,5844412,12420,30.336060,-81.757240,True,2022-10-31 02:06:17
3917135,5844413,12420,30.336050,-81.757240,True,2022-10-31 02:06:46


<h3>Add columns Lat-3856 and Lon-3857 with EPSG:3857</h3>

In [3]:
# Create a transformer object to convert from EPSG 4326 (WGS84) to EPSG 3857 (Web Mercator)
transformer = pyproj.Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)

# Iterate over rows and apply the transformer to convert the coordinates
for idx, row in tqdm(ndf.iterrows(), total=ndf.shape[0]):
    lat, lon = row['latitude'], row['longitude']
    x, y = transformer.transform(lon, lat)
    ndf.at[idx, 'Lat-3857'] = y
    ndf.at[idx, 'Lon-3857'] = x

# Print the updated DataFrame
print(ndf)

100%|██████████| 3917137/3917137 [06:51<00:00, 9519.97it/s] 

              ID  user_id   latitude  longitude  flag                 time  \
0             37        0  30.216216 -81.506529  True  2022-09-09 01:01:29   
1             38        0  30.216200 -81.506517  True  2022-09-09 01:01:53   
2             39        0  30.216216 -81.506529  True  2022-09-09 01:02:35   
3             40        0  30.216216 -81.506529  True  2022-09-09 01:03:13   
4             41        0  30.216216 -81.506529  True  2022-09-09 01:03:54   
...          ...      ...        ...        ...   ...                  ...   
3917132  5844351    12419  30.382490 -81.611984  True  2022-10-27 01:08:48   
3917133  5844352    12419  30.382490 -81.611984  True  2022-10-27 01:09:53   
3917134  5844412    12420  30.336060 -81.757240  True  2022-10-31 02:06:17   
3917135  5844413    12420  30.336050 -81.757240  True  2022-10-31 02:06:46   
3917136  5844414    12420  30.336050 -81.757240  True  2022-10-31 02:07:50   

             Lat-3857      Lon-3857  
0        3.531373e+06 -9.




In [4]:
ID_list=ndf['user_id'].unique()
len(ID_list)

12421

In [5]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def home_location(input_df,s_hour=22,t_hour=6):
    '''
    ndf: gps data
    s_hour: define start time of night, defult = 22
    t_hour: define end time of night, defult = 6
    '''
    input_df['time']=pd.to_datetime(input_df['time'], infer_datetime_format=True)
    input_df['date']=input_df['time'].dt.date
    input_df['hour']=pd.Series(input_df['time']).dt.hour
    
    input_df['LAT_Grid']=np.round(input_df['Lat-3857']/20)*20  # Grid size = 20m
    input_df['LON_Grid']=np.round(input_df['Lon-3857']/20)*20
    id=input_df.iloc[0,0]
    input_df=input_df[(input_df['hour']>=s_hour)|(input_df['hour']<t_hour)]

    if len(input_df)==0:  # if no signal during the night, return NaN
        return np.nan,np.nan
    home = input_df.groupby(['LAT_Grid','LON_Grid'])['date'].nunique().idxmax()
    return home[0],home[1]

In [6]:
home=pd.DataFrame(columns=['ID','LAT','LON']) # initial a DataFrame to save home location

for i in tqdm(ID_list[0:]): 
    dffh=ndf[ndf['user_id']==i].sort_values('time',axis=0,ascending=True) # extract records
    h_lat,h_lon=home_location(dffh) # calculate home location
    home=home.append(pd.DataFrame([[i,h_lat,h_lon]],columns=['ID','LAT','LON'])) # append the new row 

100%|██████████| 12421/12421 [01:41<00:00, 122.63it/s]


In [7]:
nan_df = home[home.isna().any(axis=1)]
home.dropna(subset=['LAT'], inplace=True)
home

Unnamed: 0,ID,LAT,LON
0,0,3531380.0,-9073260.0
0,1,3516560.0,-9096800.0
0,2,3565080.0,-9087560.0
0,4,3546060.0,-9072020.0
0,5,3519320.0,-9107220.0
...,...,...,...
0,12415,3544960.0,-9089520.0
0,12417,3546600.0,-9091320.0
0,12418,3541820.0,-9079540.0
0,12419,3552380.0,-9084280.0


<h3>----------------------------------------------------------------------------------------------------------------</h3>

<h3>
Further adjust to infer home location for more users</h3>

In [8]:
ID_list=nan_df['ID'].unique()
len(ID_list)

4019

In [9]:
#check if a date is in saturday or sunday.
def is_weekend(date):
    return date.weekday() in [5, 6]

def home_location(input_df):
    '''
    ndf: gps data
    '''
    input_df['time']=pd.to_datetime(input_df['time'], infer_datetime_format=True)
    input_df['date']=input_df['time'].dt.date
    input_df['hour']=pd.Series(input_df['time']).dt.hour
    
    input_df['LAT_Grid']=np.round(input_df['Lat-3857']/20)*20  # Grid size = 20m
    input_df['LON_Grid']=np.round(input_df['Lon-3857']/20)*20
    input_df= input_df[input_df['date'].apply(is_weekend)]
    
    if len(input_df)==0: 
        return np.nan,np.nan
    home2 = input_df.groupby(['LAT_Grid','LON_Grid']).count()['ID'].idxmax()
    return home2[0],home2[1]

In [10]:
home2=pd.DataFrame(columns=['ID','LAT','LON']) # initial a DataFrame to save home location

for i in tqdm(ID_list[0:]): 
    dffh=ndf[ndf['user_id']==i].sort_values('time',axis=0,ascending=True) # extract records
    h_lat,h_lon=home_location(dffh) # calculate home location
    home2=home2.append(pd.DataFrame([[i,h_lat,h_lon]],columns=['ID','LAT','LON'])) # append the new row 

100%|██████████| 4019/4019 [00:31<00:00, 128.81it/s]


In [11]:
home2.dropna(subset=['LAT'], inplace=True)
home2

Unnamed: 0,ID,LAT,LON
0,8,3529340.0,-9078080.0
0,9,3524980.0,-9088820.0
0,22,3516960.0,-9106620.0
0,26,3531860.0,-9078100.0
0,34,3559580.0,-9092660.0
...,...,...,...
0,12395,3578260.0,-9088880.0
0,12402,3572660.0,-9109640.0
0,12405,3537840.0,-9077960.0
0,12410,3516540.0,-9109640.0


In [12]:
new_home = pd.concat([home, home2], ignore_index=True)
new_home

Unnamed: 0,ID,LAT,LON
0,0,3531380.0,-9073260.0
1,1,3516560.0,-9096800.0
2,2,3565080.0,-9087560.0
3,4,3546060.0,-9072020.0
4,5,3519320.0,-9107220.0
...,...,...,...
10095,12395,3578260.0,-9088880.0
10096,12402,3572660.0,-9109640.0
10097,12405,3537840.0,-9077960.0
10098,12410,3516540.0,-9109640.0


In [13]:
# Create a transformer object to convert from EPSG 3857 (Web Mercator) to EPSG 4326 (WGS84)
inputGrid = pyproj.Proj(projparams='epsg:3857')
wgs84 = pyproj.Proj(projparams='epsg:4326')

new_home = new_home.reset_index(drop=True)

# Iterate over rows and apply the transformer to convert the coordinates
for idx, row in tqdm(new_home.iterrows(), total=new_home.shape[0]):
    lat, lon = pyproj.transform(inputGrid, wgs84, row['LON'], row['LAT'])
    new_home.at[idx, 'LAT-4326'] = lat
    new_home.at[idx, 'LON-4326'] = lon

# Print the updated DataFrame
new_home

100%|██████████| 10100/10100 [00:19<00:00, 525.40it/s]


Unnamed: 0,ID,LAT,LON,LAT-4326,LON-4326
0,0,3531380.0,-9073260.0,30.216272,-81.506481
1,1,3516560.0,-9096800.0,30.101163,-81.717945
2,2,3565080.0,-9087560.0,30.477524,-81.634940
3,4,3546060.0,-9072020.0,30.330161,-81.495342
4,5,3519320.0,-9107220.0,30.122610,-81.811549
...,...,...,...,...,...
10095,12395,3578260.0,-9088880.0,30.579509,-81.646798
10096,12402,3572660.0,-9109640.0,30.536190,-81.833288
10097,12405,3537840.0,-9077960.0,30.266406,-81.548702
10098,12410,3516540.0,-9109640.0,30.101007,-81.833288


In [14]:
new_home.to_csv('home_localtion.csv')