In [1]:
import pandas as pd
import os
import zipfile
import time
from sqlalchemy import create_engine, inspect
from tqdm import tqdm

In [2]:
# SQLAlchemy connection setup
engine = create_engine(
    "mssql+pyodbc://admin4327:Tr3m3r3Pr1nc3!@nmntserver.database.windows.net/NexusModsDB?driver=ODBC+Driver+17+for+SQL+Server&Connect Timeout=60"
)

In [3]:
output_dir = r"C:\Mod_data"
os.makedirs(output_dir, exist_ok=True)
zip_objects = {'Authors_di', 'Mods_di', 'Games'}
ROW_LIMIT = 1_000_000 

In [4]:
inspector = inspect(engine)
tables= inspector.get_table_names()
views = inspector.get_view_names()
print(tables)
print(views)

['Authors', 'Authors_Backup_20250228_235610', 'Authors_Backup_20250228_235950', 'CleanedModAuthors', 'CleanedModData', 'GameCategories', 'GameCategories_Backup_20250228_235923', 'Games', 'Games_Backup_20250228_235558', 'Mods', 'Mods_1', 'Mods_Backup_20250228_234151', 'TranslatedModData']
['Authors_di', 'Mods_di']


In [5]:
def export_and_split(df, name, output_dir):
    chunks = [df[i:i+ROW_LIMIT] for i in range(0, len(df), ROW_LIMIT)]
    filenames = []
    for idx, chunk in enumerate(chunks, start=1):
        suffix = f"_{idx}" if len(chunks) > 1 else ""
        filename = f"{name}{suffix}.parquet"
        path = os.path.join(output_dir, filename)
        chunk.to_parquet(path, index=False)
        filenames.append(filename)
    return filenames

In [6]:
all_objects = tables + views
print(f"Found {len(tables)} tables and {len(views)} views. Starting export...")
# Track export time
export_times = []

# Export loop with progress bar
for name in tqdm(all_objects, desc="Exporting SQL objects", unit="object"):
    try:
        start_time = time.time()
        df = pd.read_sql_query(f"SELECT * FROM [{name}]", engine)
        files = export_and_split(df, name, output_dir)
        duration = time.time() - start_time
        export_times.append((name, len(df), len(files), round(duration, 2)))
    except Exception as e:
        print(f"⚠️ Error exporting '{name}': {e}")

# Show timing results
print("\n⏱️ Export timing summary:")
for name, rows, parts, duration in export_times:
    print(f" - {name:<20} | Rows: {rows:<8} | Parts: {parts} | Time: {duration} sec")

# Create ZIP with selected objects only (split parts included)
zip_filename = "NexusModsDB_selected_views_and_table.zip"
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for obj in zip_objects:
        part_idx = 1
        while True:
            suffix = f"_{part_idx}" if part_idx > 1 else ""
            filename = f"{obj}{suffix}.parquet"
            file_path = os.path.join(output_dir, filename)
            if os.path.exists(file_path):
                zipf.write(file_path, arcname=filename)
                part_idx += 1
            else:
                break

print(f"\n✅ Export complete! All Parquet files saved in '{output_dir}' and selected objects zipped to '{zip_filename}'")

Found 13 tables and 2 views. Starting export...


Exporting SQL objects: 100%|██████████████████████████████████████████████████████| 15/15 [39:48<00:00, 159.25s/object]



⏱️ Export timing summary:
 - Authors              | Rows: 122859   | Parts: 1 | Time: 16.31 sec
 - Authors_Backup_20250228_235610 | Rows: 122859   | Parts: 1 | Time: 16.26 sec
 - Authors_Backup_20250228_235950 | Rows: 122859   | Parts: 1 | Time: 16.52 sec
 - CleanedModAuthors    | Rows: 0        | Parts: 0 | Time: 0.07 sec
 - CleanedModData       | Rows: 483939   | Parts: 1 | Time: 756.14 sec
 - GameCategories       | Rows: 5089     | Parts: 1 | Time: 2.93 sec
 - GameCategories_Backup_20250228_235923 | Rows: 5089     | Parts: 1 | Time: 0.17 sec
 - Games                | Rows: 910      | Parts: 1 | Time: 0.19 sec
 - Games_Backup_20250228_235558 | Rows: 910      | Parts: 1 | Time: 0.19 sec
 - Mods                 | Rows: 605502   | Parts: 1 | Time: 430.45 sec
 - Mods_1               | Rows: 31074    | Parts: 1 | Time: 41.19 sec
 - Mods_Backup_20250228_234151 | Rows: 605502   | Parts: 1 | Time: 487.16 sec
 - TranslatedModData    | Rows: 368429   | Parts: 1 | Time: 264.82 sec
 - Authors_d

In [7]:
df = pd.read_parquet(r"C:\Mod_data/Authors_di.parquet")
df.head()

Unnamed: 0,member_id,about,country,joined,last_active,recognized_author,mod_count,collection_count,contributed_mod_count,owned_mod_count,...,deleted,show_activity_feed,show_last_active,moderation_history_count,is_blocked,is_tracked,donations_enabled,dp_opted_in,blocked_from_opting_in_mods_at,roles
0,1,About me\n\nI have always been an avid gamer. ...,GB,2003-07-15 17:01:46,2025-02-16 23:01:44,True,66,0,453,93,...,False,True,True,,False,False,False,False,,
1,8,,SE,2003-07-15 20:42:39,2024-09-28 19:17:37,True,48,0,9,50,...,False,True,True,,False,False,True,True,,
2,15,,,2003-07-15 20:42:39,2025-02-16 06:15:10,True,6,0,1,6,...,False,True,True,,False,False,False,True,,
3,22,,,2003-07-15 20:42:39,2014-11-06 04:37:49,True,1,0,1,1,...,False,True,True,,False,False,False,True,,
4,26,,,2003-07-15 20:42:39,2025-01-25 10:05:09,True,1,0,1,1,...,False,True,True,,False,False,False,True,,
