In [1]:
import pandas as pd
import duckdb

In [2]:
# Source parquet file, and the on-disk DuckDB database used as a staging area.
parquet_file = "data.parquet"
con = duckdb.connect(database='taxi_analytics.duckdb')

In [3]:
# Stage the parquet file named by `parquet_file` as a DuckDB table.
# IF NOT EXISTS keeps an already-created table untouched when the cell is re-run.
con.execute(
    f"CREATE TABLE IF NOT EXISTS temp_taxi_data AS SELECT * FROM read_parquet('{parquet_file}')"
)

<duckdb.duckdb.DuckDBPyConnection at 0x12f96b930>

In [4]:
# PRAGMA table_info rows are unpacked later as
# (cid, name, type, notnull, dflt_value, pk), one row per column.
schema_query = con.execute("PRAGMA table_info('temp_taxi_data')")
schema_info = schema_query.fetchall()

In [5]:
# Release this handle on 'taxi_analytics.duckdb'; `schema_info` is already
# materialised as a Python list, so it remains usable after the close.
con.close()

In [6]:
import mysql.connector

# Open a fresh connection to the staged DuckDB database file; this handle is
# the one used to read rows out of 'temp_taxi_data' for the MySQL export.
duckdb_con = duckdb.connect('taxi_analytics.duckdb')

In [7]:
def convert_type(duck_type):
    """Translate a DuckDB column type name into a MySQL column type.

    The base type name (the text before any "(...)" parameters) is matched
    exactly instead of by substring. Substring matching declared every type
    whose name merely contains "INT" (INTERVAL, HUGEINT, STRUCT(a INTEGER),
    INTEGER[]) as MySQL INT, mapped unsigned types onto signed ones, and
    turned DECIMAL columns into TEXT.

    Args:
        duck_type: Type string as reported by DuckDB, e.g. "BIGINT",
            "DECIMAL(18,3)" or "TIMESTAMP WITH TIME ZONE". Case-insensitive.

    Returns:
        The MySQL type string to use in a CREATE TABLE statement. Types
        without an explicit mapping fall back to "TEXT".
    """
    normalized = duck_type.strip().upper()

    # List/array types such as "INTEGER[]" have no scalar MySQL counterpart.
    if normalized.endswith("]"):
        return "TEXT"

    base, _, params = normalized.partition("(")
    base = base.strip()

    # Keep the declared precision/scale so fixed-point values stay numeric.
    if base in ("DECIMAL", "NUMERIC"):
        params = params.rstrip(")").strip()
        return f"DECIMAL({params})" if params else "DECIMAL(18,3)"

    # Covers TIMESTAMP, TIMESTAMP_S/_MS/_NS and TIMESTAMP WITH TIME ZONE.
    if base.startswith("TIMESTAMP") or base == "DATETIME":
        return "DATETIME"

    exact_mapping = {
        "BOOLEAN": "TINYINT(1)",
        "BOOL": "TINYINT(1)",
        "TINYINT": "INT",
        "SMALLINT": "INT",
        "INTEGER": "INT",
        "INT": "INT",
        "UTINYINT": "INT",
        "USMALLINT": "INT",
        # Unsigned 32/64-bit ranges do not fit the signed MySQL types.
        "UINTEGER": "INT UNSIGNED",
        "BIGINT": "BIGINT",
        "UBIGINT": "BIGINT UNSIGNED",
        # 128-bit integers need up to 39 decimal digits.
        "HUGEINT": "DECIMAL(39,0)",
        "UHUGEINT": "DECIMAL(39,0)",
        "DOUBLE": "DOUBLE",
        "FLOAT": "DOUBLE",
        "VARCHAR": "VARCHAR(255)",
        "CHAR": "VARCHAR(255)",
        "DATE": "DATE",
    }
    return exact_mapping.get(base, "TEXT")

# Build the MySQL column definitions from DuckDB's PRAGMA table_info rows.
columns_def = []
primary_key_cols = []
for cid, col_name, col_type, notnull, dflt_value, pk in schema_info:
    mysql_type = convert_type(col_type)
    # Double any backtick inside the name so the quoted identifier stays valid.
    quoted_name = "`" + col_name.replace("`", "``") + "`"
    col_def = f"{quoted_name} {mysql_type}"
    if notnull:
        col_def += " NOT NULL"
    if pk:
        primary_key_cols.append(quoted_name)
    columns_def.append(col_def)

# MySQL rejects a column-level PRIMARY KEY on more than one column
# ("Multiple primary key defined"), so the key is declared once as a
# table-level constraint; this also covers composite keys.
table_parts = list(columns_def)
if primary_key_cols:
    table_parts.append(f"PRIMARY KEY ({', '.join(primary_key_cols)})")

create_table_sql = f"CREATE TABLE temp_taxi_data ({', '.join(table_parts)});"
print("MySQL CREATE TABLE statement:")
print(create_table_sql)

# All rows are held in memory here; the insert cell slices this list into batches.
duckdb_data = duckdb_con.execute("SELECT * FROM temp_taxi_data").fetchall()
print(f"Fetched {len(duckdb_data)} rows from DuckDB's 'temp_taxi_data' table.")

# Connection settings (host, user, password) come from the option file,
# keeping credentials out of the notebook.
mysql_conn = mysql.connector.connect(option_files="/etc/my.cnf")
cursor = mysql_conn.cursor()

new_db = "taxi_data"
cursor.execute(f"CREATE DATABASE IF NOT EXISTS {new_db};")
print(f"Database '{new_db}' created or already exists.")
cursor.execute(f"USE {new_db};")
print(f"Using database '{new_db}'.")

# Drop-and-recreate so re-running the cell starts from an empty target table.
cursor.execute("DROP TABLE IF EXISTS temp_taxi_data;")
cursor.execute(create_table_sql)
mysql_conn.commit()
print("Created table 'temp_taxi_data' in MySQL.")

MySQL CREATE TABLE statement:
CREATE TABLE temp_taxi_data (`VendorID` BIGINT, `lpep_pickup_datetime` DATETIME, `lpep_dropoff_datetime` DATETIME, `trip_duration` BIGINT, `day_of_week` VARCHAR(255), `hour_of_day` INT, `store_and_fwd_flag` VARCHAR(255), `RatecodeID` DOUBLE, `PULocationID` BIGINT, `DOLocationID` BIGINT, `passenger_count` DOUBLE, `trip_distance` DOUBLE, `fare_amount` DOUBLE, `extra` DOUBLE, `mta_tax` DOUBLE, `tip_amount` DOUBLE, `tolls_amount` DOUBLE, `improvement_surcharge` DOUBLE, `total_amount` DOUBLE, `payment_type` DOUBLE, `trip_type` DOUBLE, `congestion_surcharge` DOUBLE, `fare_per_mile` DOUBLE, `location_pair` VARCHAR(255));
Fetched 1673170 rows from DuckDB's 'temp_taxi_data' table.
Database 'taxi_data' created or already exists.
Using database 'taxi_data'.
Created table 'temp_taxi_data' in MySQL.


In [8]:
# One "%s" placeholder per column, in the order reported by PRAGMA table_info.
num_cols = len(schema_info)
placeholders = ", ".join("%s" for _ in range(num_cols))
insert_sql = f"INSERT INTO temp_taxi_data VALUES ({placeholders})"

# Copy the fetched rows into MySQL in fixed-size slices, committing after each.
batch_size = 10000
total_rows = len(duckdb_data)
start = 0
while start < total_rows:
    stop = min(start + batch_size, total_rows)
    batch = duckdb_data[start:stop]
    cursor.executemany(insert_sql, batch)
    mysql_conn.commit()
    print(f"Inserted rows {start} to {stop - 1}")
    start = stop

print("Done!")

Inserted rows 0 to 9999
Inserted rows 10000 to 19999
Inserted rows 20000 to 29999
Inserted rows 30000 to 39999
Inserted rows 40000 to 49999
Inserted rows 50000 to 59999
Inserted rows 60000 to 69999
Inserted rows 70000 to 79999
Inserted rows 80000 to 89999
Inserted rows 90000 to 99999
Inserted rows 100000 to 109999
Inserted rows 110000 to 119999
Inserted rows 120000 to 129999
Inserted rows 130000 to 139999
Inserted rows 140000 to 149999
Inserted rows 150000 to 159999
Inserted rows 160000 to 169999
Inserted rows 170000 to 179999
Inserted rows 180000 to 189999
Inserted rows 190000 to 199999
Inserted rows 200000 to 209999
Inserted rows 210000 to 219999
Inserted rows 220000 to 229999
Inserted rows 230000 to 239999
Inserted rows 240000 to 249999
Inserted rows 250000 to 259999
Inserted rows 260000 to 269999
Inserted rows 270000 to 279999
Inserted rows 280000 to 289999
Inserted rows 290000 to 299999
Inserted rows 300000 to 309999
Inserted rows 310000 to 319999
Inserted rows 320000 to 329999
In

In [9]:
# Release the handles in order: MySQL cursor, MySQL connection, DuckDB connection.
for handle in (cursor, mysql_conn, duckdb_con):
    handle.close()
print("Done!")

Done!
