In [None]:
from google.colab import drive
# Mount Google Drive into the Colab runtime at /content/drive; the data files
# read by later cells live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import glob

# Column layout of the raw probe files. The names are supplied explicitly
# because the files are read with `names=`, i.e. without a header row.
column_names = [
    'VehicleID','gpsvalid','lat','lon','timestamp',
    'speed','heading','for_hire_light','engine_acc'
]

# glob.glob returns paths in arbitrary (filesystem) order. Sorting the paths
# makes the row order of the combined frame -- and therefore any seeded random
# sample drawn from it -- identical on every run.
files = sorted(glob.glob("/content/drive/MyDrive/DA_Project/PROBE-202402/PROBE-202402/202402*.csv.out"))

print(f"Found {len(files)} files")

# Fail early with an actionable message: pd.concat on an empty list would
# otherwise raise an opaque "No objects to concatenate" ValueError.
if not files:
    raise FileNotFoundError(
        "No probe files matched the 202402*.csv.out pattern; "
        "check that Google Drive is mounted and the folder path is correct."
    )

# Read one frame per file, then concatenate once (cheaper than growing a
# DataFrame inside the loop).
df_list = []
for path in files:
    print(f"Loading {path}...")
    df_list.append(pd.read_csv(path, names=column_names))

# ignore_index=True gives the combined frame a fresh 0..N-1 index.
traffic_all = pd.concat(df_list, ignore_index=True)

# Convert the timestamp column from its raw representation to datetime64.
traffic_all['timestamp'] = pd.to_datetime(traffic_all['timestamp'])

print(f" Combined dataset: {len(traffic_all):,} rows from {len(files)} days")


Found 29 files
Loading /content/drive/MyDrive/DA_Project/PROBE-202402/PROBE-202402/20240202.csv.out...
Loading /content/drive/MyDrive/DA_Project/PROBE-202402/PROBE-202402/20240201.csv.out...
Loading /content/drive/MyDrive/DA_Project/PROBE-202402/PROBE-202402/20240203.csv.out...
Loading /content/drive/MyDrive/DA_Project/PROBE-202402/PROBE-202402/20240204.csv.out...
Loading /content/drive/MyDrive/DA_Project/PROBE-202402/PROBE-202402/20240210.csv.out...
Loading /content/drive/MyDrive/DA_Project/PROBE-202402/PROBE-202402/20240206.csv.out...
Loading /content/drive/MyDrive/DA_Project/PROBE-202402/PROBE-202402/20240205.csv.out...
Loading /content/drive/MyDrive/DA_Project/PROBE-202402/PROBE-202402/20240208.csv.out...
Loading /content/drive/MyDrive/DA_Project/PROBE-202402/PROBE-202402/20240207.csv.out...
Loading /content/drive/MyDrive/DA_Project/PROBE-202402/PROBE-202402/20240209.csv.out...
Loading /content/drive/MyDrive/DA_Project/PROBE-202402/PROBE-202402/20240211.csv.out...
Loading /content/

In [None]:
# Draw a fixed-size random subset of the combined probe data. The seed keeps
# the selection repeatable for a given row order of `traffic_all`.
traffic_sample = traffic_all.sample(
    n=2_000_000,
    random_state=42,
)

# Persist the sample to Drive; index=False leaves the row index out of the CSV.
traffic_sample.to_csv(
    "/content/drive/MyDrive/DA_Project/traffic_feb2024_sample.csv",
    index=False,
)

In [None]:
# Preview the first rows of the sample to sanity-check the column layout.
traffic_sample.head()

Unnamed: 0,VehicleID,gpsvalid,lat,lon,timestamp,speed,heading,for_hire_light,engine_acc
12267685,jMeH1+Sj7iNsqnwTrvnZfXHHtpQ,1,13.59135,100.36354,2024-02-05 04:21:20,0,174,0,0
21207132,d0kSSld76MrToncLj5D+fEbQv1c,1,8.10484,98.30932,2024-02-11 15:27:11,0,110,0,0
30154524,Cz3ORMyG/uHkfR+erT3f8hV9crs,1,7.87432,98.29988,2024-02-14 04:12:51,0,121,0,0
56417208,BfHt2c4llhPSRk2f2RG4rTmiY8c,1,13.80487,100.77665,2024-02-28 04:58:35,0,265,0,1
8057026,IZyH325Y+JZVTFVXpu8DOH7W+k0,1,13.84252,100.596,2024-02-10 00:51:56,52,238,0,1


In [None]:
# Summary statistics for the sample's numeric and datetime columns.
# NOTE(review): the recorded output shows a minimum timestamp of
# 1970-01-01 07:00:00, lat/lon minima of 0.0 and a maximum speed of 333.
# These look like placeholder or faulty readings -- confirm that the cleaning
# step removes them (timestamps are not filtered anywhere in this notebook).
traffic_sample.describe()

Unnamed: 0,gpsvalid,lat,lon,timestamp,speed,heading,for_hire_light,engine_acc
count,2000000.0,2000000.0,2000000.0,2000000,2000000.0,2000000.0,2000000.0,2000000.0
mean,0.997417,13.41835,100.4644,2024-02-15 12:01:47.952387584,13.10432,177.0556,0.3535735,0.5354965
min,0.0,0.0,0.0,1970-01-01 07:00:00,0.0,0.0,0.0,0.0
25%,1.0,13.67956,100.4551,2024-02-08 07:27:34.750000128,0.0,89.0,0.0,0.0
50%,1.0,13.7609,100.552,2024-02-15 13:58:45.500000,0.0,180.0,0.0,1.0
75%,1.0,13.86054,100.642,2024-02-22 18:41:43.249999872,19.0,269.0,1.0,1.0
max,1.0,20.45388,109.1203,2024-03-02 19:49:25,333.0,360.0,1.0,1.0
std,0.05075756,1.689534,0.7415754,,23.07057,105.4476,0.4780789,0.4987385


In [None]:
# Count missing values per column (the recorded output shows zero for every column).
traffic_sample.isnull().sum()

Unnamed: 0,0
VehicleID,0
gpsvalid,0
lat,0
lon,0
timestamp,0
speed,0
heading,0
for_hire_light,0
engine_acc,0


In [None]:
# Inclusive latitude/longitude limits used as the "Bangkok area" filter.
bangkok_bounds = {'lat_min': 13.5, 'lat_max': 14.0, 'lon_min': 100.3, 'lon_max': 100.9}

# Build one boolean mask per cleaning rule, then apply them all at once.

# Keep only records flagged as having a valid GPS fix.
has_valid_fix = traffic_sample['gpsvalid'].eq(1)

# Speeds outside 0..150 are treated as impossible values (not mere outliers).
speed_is_possible = traffic_sample['speed'].between(0, 150)

# Keep only points inside the bounding box (both ends inclusive).
lat_in_box = traffic_sample['lat'].between(
    bangkok_bounds['lat_min'], bangkok_bounds['lat_max']
)
lon_in_box = traffic_sample['lon'].between(
    bangkok_bounds['lon_min'], bangkok_bounds['lon_max']
)

traffic_sample = traffic_sample[
    has_valid_fix & speed_is_possible & lat_in_box & lon_in_box
]

In [None]:
import os

# Write the filtered traffic sample to the "cleaned" folder on Drive.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the folder exists first (a no-op when it is already there).
cleaned_traffic_path = "/content/drive/MyDrive/DA_Project/cleaned/traffic_feb2024_sample.csv"
os.makedirs(os.path.dirname(cleaned_traffic_path), exist_ok=True)

# index=False leaves the row index out of the CSV.
traffic_sample.to_csv(cleaned_traffic_path, index=False)

In [None]:
import pandas as pd
from google.colab import drive

# Mounting is harmless if Drive is already mounted (Colab just reports it).
drive.mount('/content/drive')

file_path = "/content/drive/MyDrive/DA_Project/bangkok_bus_routes.csv"


# Read CSV
df = pd.read_csv(file_path)

# count rows that contain at least one null value
print("\nRows with nulls:", df.isnull().any(axis=1).sum())

# Drop specific columns by name. The listed columns are not guaranteed to be
# present in this file (a plain drop raised KeyError for all four), so report
# the ones that are absent and drop only those that exist.
columns_to_drop = ['from', 'to', 'operator', 'network']
absent_columns = [col for col in columns_to_drop if col not in df.columns]
if absent_columns:
    print("Columns not found, skipped:", absent_columns)
# errors='ignore' makes drop skip labels that are not in the frame.
df = df.drop(columns=columns_to_drop, errors='ignore')

print(df)
print(df.columns)

# get only the columns that have at least 1 null value
null_cols = df.columns[df.isnull().any()]
print("Columns with null values:", list(null_cols))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Rows with nulls: 0


KeyError: "['from', 'to', 'operator', 'network'] not found in axis"

In [None]:
# NOTE(review): this writes `traffic_sample` (the GPS probe sample) to a file
# named "cleaned_bus_stops_file.csv". No bus-stops DataFrame is loaded anywhere
# in the visible notebook, so this looks like a copy-paste slip -- confirm which
# frame was meant to be saved here before relying on the output file.
traffic_sample.to_csv("/content/drive/MyDrive/DA_Project/cleaned/cleaned_bus_stops_file.csv", index=False)

In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive (harmless if it is already mounted)
drive.mount('/content/drive')

# Path to your original bus routes file in Google Drive
input_path = "/content/drive/MyDrive/DA_Project/bangkok_bus_routes.csv"

# Load CSV
df = pd.read_csv(input_path)

# count rows that contain at least one null value
print("\nRows with nulls:", df.isnull().any(axis=1).sum())

# Drop specific columns by name. A plain drop raises KeyError when any listed
# column is absent from the file, so report the absent ones and drop only the
# columns that actually exist.
columns_to_drop = ['from', 'to', 'operator', 'network']
not_in_file = [col for col in columns_to_drop if col not in df.columns]
if not_in_file:
    print("Columns not found, skipped:", not_in_file)
# errors='ignore' makes drop skip labels that are not in the frame.
df = df.drop(columns=columns_to_drop, errors='ignore')

print(df)
print(df.columns)

# get only the columns that have at least 1 null value
null_cols = df.columns[df.isnull().any()]
print("Columns with null values:", list(null_cols))


In [None]:
# Save the bus-routes frame (`df`, loaded from bangkok_bus_routes.csv) as the
# cleaned bus-routes file. Writing `traffic_sample` here would put GPS probe
# rows into a file that is named for bus routes.
# index=False leaves the row index out of the CSV.
df.to_csv("/content/drive/MyDrive/DA_Project/cleaned/cleaned_bus_routes_file.csv", index=False)