# Notice:
- Turn off virtual memory (the paging file) before running this crash test, so that running out of RAM is not masked by the system swapping to disk.

## Steps to Disable Virtual Memory (Windows):
- Open Control Panel and navigate to System.
- Click Advanced system settings.
- Under the Advanced tab, click Settings in the Performance section.
- Go to the Advanced tab in the new window, then click Change under Virtual memory.
- Uncheck Automatically manage paging file size for all drives.
- Select No paging file, click Set, and restart the computer. 

In [1]:
import gc
import glob
import os
import subprocess
import sys
import time

import pandas as pd

In [None]:
# Download NYC yellow-taxi trip files, month by month starting at 2015-01,
# until `folder_path` holds at least `limit_files` parquet files.
folder_path = "./yellow_tripdata"
limit_files = 2
base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data"

os.makedirs(folder_path, exist_ok=True)

all_files = glob.glob(os.path.join(folder_path, "*.parquet"))

for y in range(2015, 2026):
    if len(all_files) >= limit_files:
        break
    for m in range(1, 13):
        if len(all_files) >= limit_files:
            break

        file_name = f"yellow_tripdata_{y}-{m:02d}.parquet"
        target_path = os.path.join(folder_path, file_name)
        if os.path.exists(target_path):
            # Already on disk (and already counted in all_files): do not fetch it again.
            continue

        url = f"{base_url}/{file_name}"
        # Download under a temporary name so an interrupted or failed transfer
        # never leaves a broken file matching "*.parquet".
        partial_path = target_path + ".part"
        print(f"Downloading {file_name}...")
        # List-form arguments avoid shell quoting problems; --fail makes curl
        # exit non-zero on HTTP errors instead of saving the error page.
        # Raises FileNotFoundError if curl is not installed.
        result = subprocess.run(
            ["curl", "--fail", "--location", "--output", partial_path, url],
            check=False,
        )

        if result.returncode == 0:
            os.replace(partial_path, target_path)
            all_files.append(target_path)
        else:
            print(f"Failed to download {file_name} (curl exit code {result.returncode}).")
            if os.path.exists(partial_path):
                os.remove(partial_path)

Downloading yellow_tripdata_2015-01.parquet...
Downloading yellow_tripdata_2015-02.parquet...
Downloading yellow_tripdata_2015-03.parquet...
Downloading yellow_tripdata_2015-04.parquet...


In [None]:
# Crash test: load parquet files one by one, keep them all in memory, then
# concatenate them, reporting the size of each loaded frame along the way.
folder_path = "./yellow_tripdata"
# Number of files to load (e.g. 1, 3, 5), or None to load every file found.
limit_files = 2
# Pause after each file so memory growth can be watched in a system monitor.
seconds_to_sleep = 5

# Sort so the same files are picked on every run; glob's own order is arbitrary.
all_files = sorted(glob.glob(os.path.join(folder_path, "*.parquet")))

files_to_load = all_files[:limit_files] if limit_files else all_files

# Bind every name that may hold a large frame up front, so the cleanup in
# `finally` never hits an undefined name, however early the load fails.
dfs = []
df_temp = None
df_final = None

try:
    if not files_to_load:
        print(f"Không tìm thấy file nào trong thư mục: {folder_path}")
    else:
        print(f"Tiến hành nạp {len(files_to_load)} file.")

        for i, f in enumerate(files_to_load):
            df_temp = pd.read_parquet(f)
            dfs.append(df_temp)

            # deep=True also counts the payload of object/string columns.
            mem_usage = df_temp.memory_usage(deep=True).sum() / (1024**2)
            print(f"[{i+1}/{len(files_to_load)}] Nạp file: {os.path.basename(f)}")
            print(f"   -> RAM chiếm dụng: {mem_usage:.2f} MB")

            # Pause so memory usage can be observed.
            time.sleep(seconds_to_sleep)

        print("\n--- Đang thực hiện gộp dữ liệu (pd.concat) ---")
        df_final = pd.concat(dfs, ignore_index=True)
        # df_final.to_parquet("all.parquet")

        print("Hoàn tất! Cấu trúc dữ liệu sau khi nạp:")
        # info() prints its report itself and returns None, so it is not wrapped in print().
        df_final.info()
        print("\n5 dòng dữ liệu đầu tiên:")
        print(df_final.head())

except MemoryError:
    print("\nLỖI: Tràn bộ nhớ RAM. Hãy giảm 'limit_files' xuống.")
except Exception as e:
    print(f"\nĐã xảy ra lỗi: {e}")
finally:
    # Release every reference to the loaded data on success and on failure.
    # df_temp must go too: it still points at the last frame that was read.
    del dfs, df_temp, df_final
    gc.collect()

Tiến hành nạp 4 file.
[1/4] Nạp file: yellow_tripdata_2015-01.parquet
   -> RAM chiếm dụng: 2247.90 MB
[2/4] Nạp file: yellow_tripdata_2015-02.parquet
   -> RAM chiếm dụng: 2195.21 MB
[3/4] Nạp file: yellow_tripdata_2015-03.parquet
   -> RAM chiếm dụng: 2354.09 MB
[4/4] Nạp file: yellow_tripdata_2015-04.parquet
   -> RAM chiếm dụng: 2304.84 MB

--- Đang thực hiện gộp dữ liệu (pd.concat) ---
Hoàn tất! Cấu trúc dữ liệu sau khi nạp:
<class 'pandas.DataFrame'>
RangeIndex: 51590138 entries, 0 to 51590137
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        int64         
 4   trip_distance          float64       
 5   RatecodeID             int64         
 6   store_and_fwd_flag     str           
 7   PULocationID           int64         
 8   DOLocationID           int6