In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
import time as T
import pyproj

In [2]:
ndf = pd.read_csv('data.csv')
ndf

Unnamed: 0,ID,user_id,latitude,longitude,flag,time
0,44,0,29.199405,-82.114429,True,2022-09-05 22:41:13
1,48,0,29.193010,-82.070770,True,2022-09-06 13:11:03
2,49,0,29.193013,-82.070768,True,2022-09-06 13:11:54
3,34,0,29.199564,-82.116608,True,2022-09-07 02:48:04
4,35,0,29.199564,-82.116608,True,2022-09-07 15:18:45
...,...,...,...,...,...,...
9692290,13980046,26517,29.067680,-82.178090,True,2022-10-30 20:06:01
9692291,13980047,26517,29.381230,-82.233250,True,2022-10-30 20:39:30
9692292,13980048,26517,29.381230,-82.233220,True,2022-10-30 20:39:39
9692293,13980049,26517,29.381239,-82.233202,True,2022-10-30 20:39:44


<h3>Add columns Lat-3856 and Lon-3857 with EPSG:3857</h3>

In [3]:
# Create a transformer object to convert from EPSG 4326 (WGS84) to EPSG 3857 (Web Mercator)
transformer = pyproj.Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)

# Iterate over rows and apply the transformer to convert the coordinates
for idx, row in tqdm(ndf.iterrows(), total=ndf.shape[0]):
    lat, lon = row['latitude'], row['longitude']
    x, y = transformer.transform(lon, lat)
    ndf.at[idx, 'Lat-3857'] = y
    ndf.at[idx, 'Lon-3857'] = x

# Print the updated DataFrame
print(ndf)

100%|██████████| 9692295/9692295 [14:59<00:00, 10776.48it/s]

               ID  user_id   latitude  longitude  flag                 time  \
0              44        0  29.199405 -82.114429  True  2022-09-05 22:41:13   
1              48        0  29.193010 -82.070770  True  2022-09-06 13:11:03   
2              49        0  29.193013 -82.070768  True  2022-09-06 13:11:54   
3              34        0  29.199564 -82.116608  True  2022-09-07 02:48:04   
4              35        0  29.199564 -82.116608  True  2022-09-07 15:18:45   
...           ...      ...        ...        ...   ...                  ...   
9692290  13980046    26517  29.067680 -82.178090  True  2022-10-30 20:06:01   
9692291  13980047    26517  29.381230 -82.233250  True  2022-10-30 20:39:30   
9692292  13980048    26517  29.381230 -82.233220  True  2022-10-30 20:39:39   
9692293  13980049    26517  29.381239 -82.233202  True  2022-10-30 20:39:44   
9692294  13980084    26518  30.618410 -81.615250  True  2022-10-23 08:23:05   

             Lat-3857      Lon-3857  
0        3.40




In [4]:
ID_list=ndf['user_id'].unique()
len(ID_list)

26519

In [5]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def home_location(input_df,s_hour=22,t_hour=6):
    '''
    ndf: gps data
    s_hour: define start time of night, defult = 22
    t_hour: define end time of night, defult = 6
    '''
    input_df['time']=pd.to_datetime(input_df['time'], infer_datetime_format=True)
    input_df['date']=input_df['time'].dt.date
    input_df['hour']=pd.Series(input_df['time']).dt.hour
    
    input_df['LAT_Grid']=np.round(input_df['Lat-3857']/20)*20  # Grid size = 20m
    input_df['LON_Grid']=np.round(input_df['Lon-3857']/20)*20
    id=input_df.iloc[0,0]
    input_df=input_df[(input_df['hour']>=s_hour)|(input_df['hour']<t_hour)]

    if len(input_df)==0:  # if no signal during the night, return NaN
        return np.nan,np.nan
    home = input_df.groupby(['LAT_Grid','LON_Grid'])['date'].nunique().idxmax()
    return home[0],home[1]

In [6]:
home=pd.DataFrame(columns=['ID','LAT','LON']) # initial a DataFrame to save home location

for i in tqdm(ID_list[0:]): 
    dffh=ndf[ndf['user_id']==i].sort_values('time',axis=0,ascending=True) # extract records
    h_lat,h_lon=home_location(dffh) # calculate home location
    home=home.append(pd.DataFrame([[i,h_lat,h_lon]],columns=['ID','LAT','LON'])) # append the new row 

100%|██████████| 26519/26519 [05:50<00:00, 75.57it/s]


In [7]:
nan_df = home[home.isna().any(axis=1)]
home.dropna(subset=['LAT'], inplace=True)
home

Unnamed: 0,ID,LAT,LON
0,0,3401080.0,-9141180.0
0,1,3458860.0,-9093020.0
0,2,3456360.0,-9163720.0
0,3,3531380.0,-9073260.0
0,4,3510020.0,-9105740.0
...,...,...,...
0,26513,3455080.0,-9048720.0
0,26514,3502100.0,-9070380.0
0,26515,3552380.0,-9084280.0
0,26516,3546820.0,-9101180.0


<h3>----------------------------------------------------------------------------------------------------------------</h3>

<h3>
Further adjust to infer home location for more users</h3>

In [8]:
ID_list=nan_df['ID'].unique()
len(ID_list)

8225

In [9]:
#check if a date is in saturday or sunday.
def is_weekend(date):
    return date.weekday() in [5, 6]

def home_location(input_df):
    '''
    ndf: gps data
    '''
    input_df['time']=pd.to_datetime(input_df['time'], infer_datetime_format=True)
    input_df['date']=input_df['time'].dt.date
    input_df['hour']=pd.Series(input_df['time']).dt.hour
    
    input_df['LAT_Grid']=np.round(input_df['Lat-3857']/20)*20  # Grid size = 20m
    input_df['LON_Grid']=np.round(input_df['Lon-3857']/20)*20
    input_df= input_df[input_df['date'].apply(is_weekend)]
    
    if len(input_df)==0: 
        return np.nan,np.nan
    home2 = input_df.groupby(['LAT_Grid','LON_Grid']).count()['ID'].idxmax()
    return home2[0],home2[1]

In [10]:
home2=pd.DataFrame(columns=['ID','LAT','LON']) # initial a DataFrame to save home location

for i in tqdm(ID_list[0:]): 
    dffh=ndf[ndf['user_id']==i].sort_values('time',axis=0,ascending=True) # extract records
    h_lat,h_lon=home_location(dffh) # calculate home location
    home2=home2.append(pd.DataFrame([[i,h_lat,h_lon]],columns=['ID','LAT','LON'])) # append the new row 

100%|██████████| 8225/8225 [01:40<00:00, 82.08it/s]


In [11]:
home2.dropna(subset=['LAT'], inplace=True)
home2

Unnamed: 0,ID,LAT,LON
0,11,3494500.0,-9220780.0
0,31,3524980.0,-9088820.0
0,35,3509280.0,-9140160.0
0,41,3370180.0,-9124240.0
0,45,3480920.0,-9146080.0
...,...,...,...
0,26492,3372900.0,-9048460.0
0,26501,3588800.0,-9065680.0
0,26503,3490020.0,-9051880.0
0,26506,3545180.0,-9097480.0


In [12]:
new_home = pd.concat([home, home2], ignore_index=True)
new_home

Unnamed: 0,ID,LAT,LON
0,0,3401080.0,-9141180.0
1,1,3458860.0,-9093020.0
2,2,3456360.0,-9163720.0
3,3,3531380.0,-9073260.0
4,4,3510020.0,-9105740.0
...,...,...,...
22020,26492,3372900.0,-9048460.0
22021,26501,3588800.0,-9065680.0
22022,26503,3490020.0,-9051880.0
22023,26506,3545180.0,-9097480.0


In [13]:
# Create a transformer object to convert from EPSG 3857 (Web Mercator) to EPSG 4326 (WGS84)
inputGrid = pyproj.Proj(projparams='epsg:3857')
wgs84 = pyproj.Proj(projparams='epsg:4326')

new_home = new_home.reset_index(drop=True)

# Iterate over rows and apply the transformer to convert the coordinates
for idx, row in tqdm(new_home.iterrows(), total=new_home.shape[0]):
    lat, lon = pyproj.transform(inputGrid, wgs84, row['LON'], row['LAT'])
    new_home.at[idx, 'LAT-4326'] = lat
    new_home.at[idx, 'LON-4326'] = lon

# Print the updated DataFrame
new_home

100%|██████████| 22025/22025 [00:37<00:00, 589.65it/s]


Unnamed: 0,ID,LAT,LON,LAT-4326,LON-4326
0,0,3401080.0,-9141180.0,29.199637,-82.116617
1,1,3458860.0,-9093020.0,29.651722,-81.683988
2,2,3456360.0,-9163720.0,29.632203,-82.319097
3,3,3531380.0,-9073260.0,30.216272,-81.506481
4,4,3510020.0,-9105740.0,30.050323,-81.798254
...,...,...,...,...,...
22020,26492,3372900.0,-9048460.0,28.978423,-81.283699
22021,26501,3588800.0,-9065680.0,30.660990,-81.438389
22022,26503,3490020.0,-9051880.0,29.894687,-81.314422
22023,26506,3545180.0,-9097480.0,30.323338,-81.724053


In [14]:
new_home.to_csv('home_location.csv')