<a href="https://colab.research.google.com/github/tchtan/Taxi-Analytics/blob/main/taxi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Lib**

In [None]:
import pandas as pd
import numpy as np
import shapely.geometry as geometry
from tqdm import tqdm
from datetime import datetime
from math import radians, cos, sin, asin, sqrt

# **Haversine Formula**

In [None]:
def haversine(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    km = 6371* c
    m = km * 1000
    return m

# **BKK Polygon**

In [None]:
polygoncoords = [
    (100.3260000, 13.8080000),
    (100.3240000, 13.8040000),
    (100.3290000, 13.7860000),
    (100.3260000, 13.7710000),
    (100.3260000, 13.7610000),
    (100.3290000, 13.7580000),
    (100.3290000, 13.7530000),
    (100.3320000, 13.7510000),
    (100.3310000, 13.7450000),
    (100.3330000, 13.7360000),
    (100.3310000, 13.7340000),
    (100.3320000, 13.7250000),
    (100.3290000, 13.7230000),
    (100.3310000, 13.7160000),
    (100.3280000, 13.7140000),
    (100.3290000, 13.7070000),
    (100.3270000, 13.6950000),
    (100.3300000, 13.6790000),
    (100.3340000, 13.6740000),
    (100.3370000, 13.6630000),
    (100.3350000, 13.6540000),
    (100.3380000, 13.6510000),
    (100.3420000, 13.6510000),
    (100.3420000, 13.6420000),
    (100.3470000, 13.6370000),
    (100.3510000, 13.6360000),
    (100.3530000, 13.6330000),
    (100.3560000, 13.6330000),
    (100.3580000, 13.6290000),
    (100.3630000, 13.6290000),
    (100.3640000, 13.6270000),
    (100.3660000, 13.6270000),
    (100.3680000, 13.6190000),
    (100.3700000, 13.6170000),
    (100.3730000, 13.6170000),
    (100.3750000, 13.5920000),
    (100.3810000, 13.5860000),
    (100.3810000, 13.5780000),
    (100.3870000, 13.5670000),
    (100.3870000, 13.5630000),
    (100.3920000, 13.5510000),
    (100.3960000, 13.5490000),
    (100.4040000, 13.5490000),
    (100.4110000, 13.5510000),
    (100.4110000, 13.5490000),
    (100.4160000, 13.5450000),
    (100.4090000, 13.5380000),
    (100.4070000, 13.5260000),
    (100.4030000, 13.5210000),
    (100.4030000, 13.5110000),
    (100.4010000, 13.5100000),
    (100.4010000, 13.5070000),
    (100.4540000, 13.2160000),
    (100.4560000, 13.2130000),
    (100.4610000, 13.2130000),
    (100.4620000, 13.2170000),
    (100.4580000, 13.4980000),
    (100.4480000, 13.5540000),
    (100.4510000, 13.5820000),
    (100.4500000, 13.5850000),
    (100.4550000, 13.5930000),
    (100.4560000, 13.6010000),
    (100.4580000, 13.6000000),
    (100.4620000, 13.6010000),
    (100.4630000, 13.5950000),
    (100.4660000, 13.5920000),
    (100.4660000, 13.5900000),
    (100.4710000, 13.5890000),
    (100.4770000, 13.5910000),
    (100.4780000, 13.5950000),
    (100.4770000, 13.5970000),
    (100.4830000, 13.5970000),
    (100.4870000, 13.5990000),
    (100.4890000, 13.6020000),
    (100.4980000, 13.6020000),
    (100.5010000, 13.6040000),
    (100.5040000, 13.6020000),
    (100.5050000, 13.5960000),
    (100.5080000, 13.5930000),
    (100.5140000, 13.5930000),
    (100.5170000, 13.6000000),
    (100.5240000, 13.6020000),
    (100.5250000, 13.6070000),
    (100.5240000, 13.6110000),
    (100.5210000, 13.6140000),
    (100.5190000, 13.6230000),
    (100.5190000, 13.6290000),
    (100.5220000, 13.6330000),
    (100.5200000, 13.6440000),
    (100.5220000, 13.6540000),
    (100.5210000, 13.6610000),
    (100.5220000, 13.6650000),
    (100.5250000, 13.6680000),
    (100.5250000, 13.6710000),
    (100.5300000, 13.6670000),
    (100.5420000, 13.6650000),
    (100.5490000, 13.6680000),
    (100.5520000, 13.6710000),
    (100.5570000, 13.6860000),
    (100.5550000, 13.6990000),
    (100.5560000, 13.7020000),
    (100.5620000, 13.7020000),
    (100.5780000, 13.6960000),
    (100.5850000, 13.6890000),
    (100.5850000, 13.6840000),
    (100.5820000, 13.6780000),
    (100.5750000, 13.6700000),
    (100.5760000, 13.6660000),
    (100.5890000, 13.6610000),
    (100.5900000, 13.6590000),
    (100.6000000, 13.6540000),
    (100.6050000, 13.6540000),
    (100.6060000, 13.6520000),
    (100.6120000, 13.6500000),
    (100.6200000, 13.6510000),
    (100.6260000, 13.6480000),
    (100.6270000, 13.6460000),
    (100.6350000, 13.6430000),
    (100.6380000, 13.6440000),
    (100.6440000, 13.6500000),
    (100.6520000, 13.6480000),
    (100.6540000, 13.6500000),
    (100.6540000, 13.6580000),
    (100.6570000, 13.6670000),
    (100.6620000, 13.6670000),
    (100.6650000, 13.6630000),
    (100.6870000, 13.6540000),
    (100.6970000, 13.6520000),
    (100.7010000, 13.6530000),
    (100.7030000, 13.6620000),
    (100.7080000, 13.6700000),
    (100.7110000, 13.6840000),
    (100.7140000, 13.6900000),
    (100.7140000, 13.6960000),
    (100.7160000, 13.6990000),
    (100.7150000, 13.7130000),
    (100.7190000, 13.7140000),
    (100.7500000, 13.7120000),
    (100.7640000, 13.7130000),
    (100.7660000, 13.7110000),
    (100.7880000, 13.7130000),
    (100.8600000, 13.6850000),
    (100.8630000, 13.6870000),
    (100.8650000, 13.6960000),
    (100.8640000, 13.7000000),
    (100.8610000, 13.7010000),
    (100.8940000, 13.7540000),
    (100.9130000, 13.7880000),
    (100.9310000, 13.7980000),
    (100.9430000, 13.8150000),
    (100.9130000, 13.8390000),
    (100.9170000, 13.8480000),
    (100.9140000, 13.8510000),
    (100.9100000, 13.8510000),
    (100.9120000, 13.8960000),
    (100.9180000, 13.9460000),
    (100.9160000, 13.9500000),
    (100.7920000, 13.9360000),
    (100.7520000, 13.9220000),
    (100.6910000, 13.9200000),
    (100.6940000, 13.9320000),
    (100.6910000, 13.9350000),
    (100.6440000, 13.9350000),
    (100.6440000, 13.9390000),
    (100.6420000, 13.9410000),
    (100.6290000, 13.9420000),
    (100.6290000, 13.9500000),
    (100.6250000, 13.9540000),
    (100.6150000, 13.9590000),
    (100.6060000, 13.9590000),
    (100.6020000, 13.9570000),
    (100.6000000, 13.9540000),
    (100.6010000, 13.9510000),
    (100.5880000, 13.9560000),
    (100.5740000, 13.9590000),
    (100.5640000, 13.9540000),
    (100.5400000, 13.8520000),
    (100.5340000, 13.8500000),
    (100.5160000, 13.8330000),
    (100.5050000, 13.8270000),
    (100.5020000, 13.8180000),
    (100.5040000, 13.8130000),
    (100.5070000, 13.8120000),
    (100.4990000, 13.8040000),
    (100.4740000, 13.7950000),
    (100.4730000, 13.7980000),
    (100.4630000, 13.8020000),
    (100.4390000, 13.8040000),
    (100.3260000, 13.8080000)
]
line = geometry.LineString(polygoncoords)
polygon = geometry.Polygon(line)
xmin, ymin, xmax, ymax = polygon.bounds
n = 1000
x = np.arange(np.floor(xmin * n) / n, np.ceil(xmax * n) / n, 1 / n)
y = np.arange(np.floor(ymin * n) / n, np.ceil(ymax * n) / n, 1 / n)
points = geometry.MultiPoint(np.transpose(
    [np.tile(x, len(y)), np.repeat(y, len(x))]))
result = points.intersection(polygon)
individual_points = [(str(round(point.y, 3)) + '-' +
                      str(round(point.x, 3))) for point in result]
individual_points




['13.801-100.325',
 '13.802-100.325',
 '13.803-100.325',
 '13.804-100.325',
 '13.805-100.325',
 '13.806-100.325',
 '13.762-100.326',
 '13.763-100.326',
 '13.764-100.326',
 '13.765-100.326',
 '13.766-100.326',
 '13.767-100.326',
 '13.768-100.326',
 '13.769-100.326',
 '13.77-100.326',
 '13.771-100.326',
 '13.797-100.326',
 '13.798-100.326',
 '13.799-100.326',
 '13.8-100.326',
 '13.801-100.326',
 '13.802-100.326',
 '13.803-100.326',
 '13.804-100.326',
 '13.805-100.326',
 '13.806-100.326',
 '13.807-100.326',
 '13.808-100.326',
 '13.761-100.327',
 '13.762-100.327',
 '13.763-100.327',
 '13.764-100.327',
 '13.765-100.327',
 '13.766-100.327',
 '13.767-100.327',
 '13.768-100.327',
 '13.769-100.327',
 '13.77-100.327',
 '13.771-100.327',
 '13.772-100.327',
 '13.773-100.327',
 '13.774-100.327',
 '13.775-100.327',
 '13.776-100.327',
 '13.794-100.327',
 '13.795-100.327',
 '13.796-100.327',
 '13.797-100.327',
 '13.798-100.327',
 '13.799-100.327',
 '13.8-100.327',
 '13.801-100.327',
 '13.802-100.327',

# **Check BKK Function**

In [None]:
def isBKK(month,day,hour):
    df = pd.read_csv("./2020"+month+"/2020"+month+day+"/2020"+month+day+"-"+hour+".csv")
    lat_lon = df["lat"].round(3).astype(str) + '_' + df["lon"].round(3).astype(str)
    df['lat-lon'] = df["lat"].round(4).astype(str) + '_' + df["lon"].round(4).astype(str)
    df['BKK']= lat_lon.isin(individual_points)
    # df.to_csv("./2020"+month+"/2020"+month+day+"/2020"+month+day+"-"+hour+".csv")

# **Import Drive**

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
cluster = pd.read_pickle('/content/gdrive/MyDrive/taxi-data/cluster.pickle')

# Data Manipulation
- Check BKK
- Add [ lat-lon ]
- Check ClusterID from cluster.pickle & Add [ ClusterID ] 
- Add [ pickup ] & [ dropoff ]

In [None]:
def allInOne(month,day,hour):
  df = pd.read_csv('gdrive/My Drive/taxi-data/2020/2020'+month+'/2020'+month+day+'/2020'+month+day+'-'+hour+'.csv')
  df = df.drop(columns=['date','timeR'])  
  lat_lon = df["lat"].round(3).astype(str) + '-' + df["lon"].round(3).astype(str)
  df['lat-lon'] = df["lat"].round(4).astype(str) + '-' + df["lon"].round(4).astype(str)
  df['BKK']= lat_lon.isin(individual_points)
  df['clusterID'] = df['lat-lon'].apply(lambda x: cluster[x] if x in cluster.keys() else np.nan)
  df = df.sort_values(["VehicleID","timestamp"])
  pu = df[(df['VehicleID'] == df['VehicleID'].shift(-1)) & (df['for_hire_light'].shift(-1) - df['for_hire_light'] == -1)]
  do = df[(df['VehicleID'] == df['VehicleID'].shift(-1)) & (df['for_hire_light'].shift(-1) - df['for_hire_light'] == 1)]
  df['pickup'] = df.index.isin(pu.index)
  df['dropoff'] = df.index.isin(do.index)
  df.to_csv('gdrive/MyDrive/taxi-data/2020/202001V2/2020'+month+day+'/2020'+month+day+'-'+hour+'.csv')

In [None]:
# from datetime import date, timedelta

# def daterange(start_date, end_date):
#     for n in range(int((end_date - start_date).days)):
#         yield start_date + timedelta(n)

# start_date = date(2020, 1, 1)
# end_date = date(2021, 1, 1)
# yearmonthdays = []
# months = []
# hours = []
# for single_date in daterange(start_date, end_date):
#     yearmonthdays.append(single_date.strftime("%Y%m%d"))
# for i in range(12):
#     months.append(str(i+1).zfill(2))
# for i in range(24):
#     hours.append(str(i).zfill(2))
# yearmonthdays.remove('20200331')
  

In [None]:
import os

def mkdir(month,day,hour):
    path = '/content/gdrive/MyDrive/taxi-data/2021/2021'+month+'/2021'+month+day
    try:
        os.mkdir(path)
    except OSError as error:
        print(error)   

months = ['01','02','03','04','05','06','07','08','09','10','11','12']
hours = ['00','01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23']
days_28 = ['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28']
days_30 = ['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30']
days_31 = ['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31']
days_DEC = ['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','25','26','27','28','29','30','31']

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
def splitHour(month,day,hour):
  df = pd.read_csv('gdrive/My Drive/taxi-data/2021/2021'+month+'/2021'+month+day+'.csv', sep=',', 
                 names=['VehicleID', 'gpsvalid', 'lat', 'lon', 'timestamp', 'speed', 'heading', 'for_hire_light', 'engine_acc'])
  df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%dT%H:%M:%S') # convert to datetime
  df = df[(df['timestamp'].dt.month == int(month)) & (df['timestamp'].dt.day == int(day))] # check month & day
  df2 = df[(df['timestamp'].dt.hour == int(hour))] # check hour
  df2.to_csv('/content/gdrive/MyDrive/taxi-data/2021/2021'+month+'/2021'+month+day+'/2021'+month+day+'-'+hour+'.csv',index=False)

In [None]:
for month in months:
    if month == '02':
        for day in days_28:
            for hour in hours:    
                splitHour(month,day,hour)
            print(day+'-'+month)
    elif month in ('04','06','09','11'):
        for day in days_30:
            for hour in hours:    
                splitHour(month,day,hour)
            print(day+'-'+month)
    elif month in ('01','03','05','07','08','10'):
        for day in days_31:
            for hour in hours:    
                splitHour(month,day,hour)
            print(day+'-'+month)
    elif month == '12':
        for day in days_DEC:
            for hour in hours:    
                splitHour(month,day,hour)
            print(day+'-'+month)            

In [None]:
# for month in ['12']:
#     if month == '02':
#         for day in days_29:
#             for hour in hours:    
#                 allInOne(month,day,hour)
#             print(day+'-'+month)
#     elif month in ('04','06','09','11'):
#         for day in days_30:
#             for hour in hours:    
#                 allInOne(month,day,hour)
#             print(day+'-'+month)
#     elif month in ('01','03','05','07','08','10','12'):
#         for day in days_31:
#             for hour in hours:    
#                 allInOne(month,day,hour)
#             print(day+'-'+month)

# **Distance**

In [None]:
from google.colab import drive

drive.mount('/content/gdrive' , force_remount=True)

Mounted at /content/gdrive


In [None]:
import glob
path = r'/content/gdrive/MyDrive/2020/202003/202003**/*.csv'
#for fname in glob.glob(path):
#  df = pd.read_csv(fname)
#  df['VehicleID1'] = df['VehicleID'].shift(-1)
#  df.to_csv(fname)
#  print(fname)

for fname in glob.glob(path):
  df = pd.read_csv(fname)
  df['lat1'] = df['lat'].shift(-1)
  df['lon1'] = df['lon'].shift(-1)
  df['distance'] = df.apply(lambda x: haversine(x.lon, x.lat, x.lon1, x.lat1), axis=1)
  df.to_csv(fname)
  print(fname)

In [None]:
import glob 
#import io
path = r'/content/gdrive/MyDrive/2020/202012/202012**/*.csv'
#path = r'/content/gdrive/MyDrive/2020/202009/20200920/20200920-04.csv'
#path = r'/content/test da.csv'
# from google.colab import files 
# uploaded = files.upload()
dis = 0
check = False
for fname in glob.glob(path):
  df = pd.read_csv(fname)
  for i , row in df.iterrows():
    if(row['VehicleID'] == row['VehicleID1']):
      if(row['pickup'] == True):
        check = True
      if(row['dropoff'] == True):
        check = False
        df.at[i,'distance'] = dis 
        dis = 0
      if(check == True):
        dis = dis + row['distance']
    else:
      check = False
      dis = 0
  df = df.loc[:, ~df.columns.str.contains('Unnamed')]
  df.to_csv(fname)
  print(fname)
#df.to_csv('test2.csv')
#from google.colab import files
#files.download("test2.csv")

* 01 : 5-11
* 02 : 16-22
* 03 : 15-21
* 04 : 19-25
* 05 : 17-23
* 06 : 7-13
* 07 : 12-18
* 08 : 16-22
* 09 : 6-12
* 10 : 4-10
* 11 : 8-14
* 12 : 13-19

In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
from pathlib import Path
import tqdm
import glob
import os
from datetime import datetime
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Mount Google Drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
df = pd.read_csv('/content/gdrive/MyDrive/2020/clusters/ALL.csv')

In [4]:
#select if pickup or dropoff
df = df[(df['dropoff'] == True)]
df

Unnamed: 0,VehicleID,gpsvalid,lat,lon,timestamp,speed,heading,for_hire_light,engine_acc,lat-lon,BKK,clusterID,pickup,dropoff,lat1,lon1,distance,VehicleID1
0,+2L7LBRmCGckWqgtLFypdGuQ5i8,1,13.82797,100.55295,2020-01-05 00:35:53,0,73,0,1,13.828-100.553,True,8.0,False,True,13.82953,100.55319,0.000000,+2L7LBRmCGckWqgtLFypdGuQ5i8
2,+3rQ3PUgUtPmONvyj8DeEzIjdgw,1,13.72644,100.52874,2020-01-05 00:20:28,9,248,0,1,13.7264-100.5287,True,10.0,False,True,13.72593,100.52851,879.525396,+3rQ3PUgUtPmONvyj8DeEzIjdgw
4,+3rQ3PUgUtPmONvyj8DeEzIjdgw,1,13.69664,100.51378,2020-01-05 00:37:58,0,169,0,1,13.6966-100.5138,True,,False,True,13.69190,100.51106,3748.049877,+3rQ3PUgUtPmONvyj8DeEzIjdgw
7,+BAgYWCbvz0z377ef3Yp687Pp+0,1,13.72337,100.51911,2020-01-05 00:42:35,0,327,0,1,13.7234-100.5191,True,519.0,False,True,13.72352,100.51927,7512.533000,+BAgYWCbvz0z377ef3Yp687Pp+0
8,+BHIGcG913RfL0g1vwotKmyi/RY,1,13.69658,100.64764,2020-01-05 00:15:00,19,244,0,1,13.6966-100.6476,True,,False,True,13.68936,100.64707,0.000000,+BHIGcG913RfL0g1vwotKmyi/RY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105848,zniGk3q31dWIPmhhp23+D2TrPDw,1,13.67019,100.68982,2020-02-20 20:01:41,21,286,0,1,13.6702-100.6898,True,,False,True,13.66722,100.68331,0.000000,zniGk3q31dWIPmhhp23+D2TrPDw
5105850,zniGk3q31dWIPmhhp23+D2TrPDw,1,13.68521,100.66958,2020-02-20 20:30:41,35,285,0,1,13.6852-100.6696,True,,False,True,13.68533,100.66727,2996.992985,zniGk3q31dWIPmhhp23+D2TrPDw
5105853,zntRKlvXILigTYkZUAupZHGUi64,1,13.85464,100.54894,2020-02-20 20:57:14,0,120,0,0,13.8546-100.5489,True,,False,True,13.85464,100.54894,12.681182,zntRKlvXILigTYkZUAupZHGUi64
5105855,zyd/BtiSWwgOzssLuCRUl6a5NSk,1,13.71537,100.47772,2020-02-20 20:46:19,68,59,0,1,13.7154-100.4777,True,,False,True,13.72302,100.49560,8044.104195,zyd/BtiSWwgOzssLuCRUl6a5NSk


In [5]:
df = df.sort_values(["VehicleID","timestamp"])

In [6]:
df['hour'] = df['timestamp'].str[11:13]
df['day'] = df['timestamp'].str[8:10] 
df

Unnamed: 0,VehicleID,gpsvalid,lat,lon,timestamp,speed,heading,for_hire_light,engine_acc,lat-lon,BKK,clusterID,pickup,dropoff,lat1,lon1,distance,VehicleID1,hour,day
484243,+/8n5w1tt5+Bhu9gPPyN44wy7wQ,1,13.84244,100.84088,2020-01-10 18:07:52,0,170,0,1,13.8424-100.8409,True,,False,True,13.84244,100.84088,0.000000,+/8n5w1tt5+Bhu9gPPyN44wy7wQ,18,10
5006143,+/8n5w1tt5+Bhu9gPPyN44wy7wQ,1,13.81696,100.75165,2020-02-16 10:07:31,49,181,0,1,13.817-100.7516,True,,False,True,13.81534,100.75162,0.000000,+/8n5w1tt5+Bhu9gPPyN44wy7wQ,10,16
5006145,+/8n5w1tt5+Bhu9gPPyN44wy7wQ,1,13.76851,100.80795,2020-02-16 10:37:10,74,48,0,1,13.7685-100.808,True,,False,True,13.77229,100.81223,15012.937643,+/8n5w1tt5+Bhu9gPPyN44wy7wQ,10,16
4975044,+/8n5w1tt5+Bhu9gPPyN44wy7wQ,1,13.85576,100.85476,2020-02-16 11:35:10,0,264,0,1,13.8558-100.8548,True,,False,True,13.85578,100.85346,0.000000,+/8n5w1tt5+Bhu9gPPyN44wy7wQ,11,16
5014186,+/8n5w1tt5+Bhu9gPPyN44wy7wQ,1,13.84442,100.84064,2020-02-16 12:49:10,30,172,0,1,13.8444-100.8406,True,,False,True,13.84250,100.84088,11672.596688,+/8n5w1tt5+Bhu9gPPyN44wy7wQ,12,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2826065,zzJcCfa6nuUF9A02Sud5fASxowM,1,13.72212,100.72271,2020-09-10 05:04:50,50,91,0,1,13.7221-100.7227,True,,False,True,13.72206,100.71849,0.000000,zzJcCfa6nuUF9A02Sud5fASxowM,05,10
2828566,zzJcCfa6nuUF9A02Sud5fASxowM,1,13.72098,100.71859,2020-09-10 06:24:50,8,181,0,1,13.721-100.7186,True,,False,True,13.72211,100.71779,0.000000,zzJcCfa6nuUF9A02Sud5fASxowM,06,10
2828567,zzJcCfa6nuUF9A02Sud5fASxowM,1,13.72264,100.74395,2020-09-10 06:45:50,0,88,0,1,13.7226-100.744,True,,False,True,13.72547,100.74645,5331.010280,zzJcCfa6nuUF9A02Sud5fASxowM,06,10
2821093,zzJcCfa6nuUF9A02Sud5fASxowM,1,13.72635,100.74634,2020-09-10 07:13:50,0,358,0,1,13.7264-100.7463,True,,False,True,13.72706,100.74635,0.000000,zzJcCfa6nuUF9A02Sud5fASxowM,07,10


In [7]:
df2 = df[["lat","lon"]]
df2

Unnamed: 0,lat,lon
484243,13.84244,100.84088
5006143,13.81696,100.75165
5006145,13.76851,100.80795
4975044,13.85576,100.85476
5014186,13.84442,100.84064
...,...,...
2826065,13.72212,100.72271
2828566,13.72098,100.71859
2828567,13.72264,100.74395
2821093,13.72635,100.74634


DBSCAN

In [8]:
from sklearn.cluster import DBSCAN

In [9]:
dbscan = DBSCAN(eps=0.0002, min_samples=16)

In [10]:
cluster = dbscan.fit_predict(df2)
cluster

array([ -1,  -1,  -1, ...,  87, 576,  -1])

In [11]:
df3 =  pd.DataFrame(cluster,columns=["clusterID"])

In [None]:
df3

In [12]:
df5 = pd.DataFrame.join(df2,df3)
df5 = df5.assign(counts=1)
df5 = df5.join(df["day"])
df5 = df5.join(df["hour"])
df5
df5 = df5[df5.clusterID != -1]

In [13]:
df5

Unnamed: 0,lat,lon,clusterID,counts,day,hour
484243,13.84244,100.84088,4.0,1,10,18
5006143,13.81696,100.75165,,1,16,10
5006145,13.76851,100.80795,,1,16,10
4975044,13.85576,100.85476,,1,16,11
5014186,13.84442,100.84064,,1,16,12
...,...,...,...,...,...,...
2826065,13.72212,100.72271,,1,10,05
2828566,13.72098,100.71859,,1,10,06
2828567,13.72264,100.74395,,1,10,06
2821093,13.72635,100.74634,,1,10,07


In [14]:
df5.to_csv('/content/gdrive/MyDrive/2020/clusters/cluster_dropoff_details.csv',index=False)

In [15]:
df6 = df5.groupby(['clusterID']).agg({"lat":"mean","lon":"mean","counts":"sum"})
df6

Unnamed: 0_level_0,lat,lon,counts
clusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,13.769948,100.582352,71
1.0,13.762828,100.555109,269
2.0,13.761622,100.555669,1803
3.0,13.762743,100.557143,8384
4.0,13.763332,100.557349,845263
...,...,...,...
5283.0,13.748622,100.499895,6
5284.0,13.776149,100.569337,10
5285.0,13.776287,100.562827,9
5286.0,13.783423,100.553613,9


In [16]:
df6.to_csv("/content/gdrive/MyDrive/2020/clusters/cluster_dropoff.csv",index=False)

# **Cleaning**

In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from pathlib import Path
import tqdm
import glob
import os
from datetime import datetime
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Mount Google Drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
path = r'/content/gdrive/MyDrive/2020/202006/202006**/*.csv'

for fname in glob.glob(path):
  df = pd.read_csv(fname)
  df = df.loc[:, ~df.columns.str.contains('Unnamed')]
  df.to_csv(fname)
  print(fname)

In [None]:
# # Directory path
# path = '/content/gdrive/MyDrive/2020/clusters'

# directories = []
# for d in os.listdir(dir_path):
#   directories.append(d)

# # List to store the dataframes
# df_list = []
# file_path = dir_path+ '/'

# # Loop through the directories in the main directory
# for dir in directories[7:14]:
#   for file in os.listdir(file_path+dir):
#     if file.endswith('.csv'):
#       df = pd.read_csv(os.path.join(file_path+dir, file))
#       df = df[(df['BKK'] == True)]
#       df = df[(df['pickup'] == True) | (df['dropoff'] == True)]
#       df = df.loc[:, ~df.columns.str.contains('Unnamed')]
#       df_list.append(df)

# df2 = pd.concat(df_list)

# df2.to_csv('/content/gdrive/MyDrive/2020/FEB.csv')