# BDGSF — data/bdgsf_classe_ru/

Shapefile set: **bdgsf_classe_ru** (soil classes / RU). Files: `.shp`, `.shx`, `.dbf`, `.prj`, `.cst` (encoding: ISO-8859-1). CRS: NTF (Paris) / France II — EPSG:27582.

In [1]:
import geopandas as gpd
from pathlib import Path

path = Path("data/bdgsf_classe_ru/bdgsf_classe_ru.shp")
gdf = gpd.read_file(path, encoding="iso-8859-1")
print("Shape:", gdf.shape)
print("CRS:", gdf.crs)
print("Geometry type:", gdf.geometry.type.unique())
gdf.head()

Shape: (1942, 6)
CRS: EPSG:27582
Geometry type: <StringArray>
['Polygon']
Length: 1, dtype: str


Unnamed: 0,area,perimeter,reserve_,reserve_id,classe,geometry
0,33602000.0,59377.9,2.0,1.0,2,"POLYGON ((588191.938 2670276.25, 589690.625 26..."
1,2249870000.0,525756.0,3.0,2.0,4,"POLYGON ((615310.375 2675706.25, 615840.625 26..."
2,137778000.0,110800.0,4.0,3.0,2,"POLYGON ((590489.125 2649901, 590859 2651829.2..."
3,2848830000.0,1042470.0,5.0,4.0,5,"POLYGON ((568467.125 2613029.75, 568262.938 26..."
4,552476000.0,288280.0,6.0,5.0,2,"POLYGON ((556926.188 2661010, 557594.438 26599..."


In [2]:
# Columns and dtypes
gdf.dtypes

area           float64
perimeter      float64
reserve_       float64
reserve_id     float64
classe           int32
geometry      geometry
dtype: object

In [None]:
# Attribute columns (non-geometry): value counts
for col in gdf.columns.drop(gdf.geometry.name):
    print(col)
    print(gdf[col].value_counts(dropna=False).head(15))
    print()

In [3]:
# Bounds and quick plot (optional: needs matplotlib)
gdf.total_bounds
# gdf.plot(figsize=(8, 8), column="classe", legend=True)

array([  47837.5546875, 1617939.75     , 1197497.125    , 2677724.       ])

### .dbf — attributes (dBase, 1942 records)

Attribute table: one row per polygon. Columns: `area`, `perimeter`, `reserve_`, `reserve_id`, `classe`.

In [None]:
# Load .dbf only (attributes); geopandas reads it from the shapefile set
dbf = gpd.read_file("data/bdgsf_classe_ru/bdgsf_classe_ru.dbf")
attrs = dbf.drop(columns="geometry")
attrs

Unnamed: 0,area,perimeter,reserve_,reserve_id,classe
0,3.360200e+07,59377.90,2.0,1.0,2
1,2.249870e+09,525756.00,3.0,2.0,4
2,1.377780e+08,110800.00,4.0,3.0,2
3,2.848830e+09,1042470.00,5.0,4.0,5
4,5.524760e+08,288280.00,6.0,5.0,2
...,...,...,...,...,...
1937,2.540460e+05,2082.25,1939.0,1938.0,3
1938,3.525960e+07,26246.40,1940.0,1939.0,1
1939,1.231230e+07,18989.60,1941.0,1940.0,3
1940,7.743870e+05,3693.05,1942.0,1941.0,2


### .shx — index (binary)

Fixed-length index: 100-byte header, then per record 8 bytes (offset + content length in 16-bit words, big-endian). Used to seek into the .shp file.

In [5]:
import struct

with open("data/bdgsf_classe_ru/bdgsf_classe_ru.shx", "rb") as f:
    shx = f.read()
n_rec = (len(shx) - 100) // 8
index = [
    (struct.unpack(">i", shx[100 + i * 8 : 104 + i * 8])[0], struct.unpack(">i", shx[104 + i * 8 : 108 + i * 8])[0])
    for i in range(n_rec)
]
print(f"Records: {n_rec}. First 10 (offset, content_length):")
for i, (off, length) in enumerate(index[:10]):
    print(f"  {i}: offset={off} content_length={length}")
# as DataFrame
import pandas as pd
pd.DataFrame(index, columns=["offset", "content_length"]).head(15)

Records: 1942. First 10 (offset, content_length):
  0: offset=50 content_length=256
  1: offset=310 content_length=2794
  2: offset=3108 content_length=632
  3: offset=3744 content_length=5864
  4: offset=9612 content_length=1618
  5: offset=11234 content_length=200
  6: offset=11438 content_length=72
  7: offset=11514 content_length=216
  8: offset=11734 content_length=136
  9: offset=11874 content_length=208


Unnamed: 0,offset,content_length
0,50,256
1,310,2794
2,3108,632
3,3744,5864
4,9612,1618
5,11234,200
6,11438,72
7,11514,216
8,11734,136
9,11874,208


### .shp — main geometry (binary)

ESRI Shapefile: 100-byte header (file code 9994, file length in 16-bit words, shape type), then for each record: 4 bytes record number + 4 bytes content length (big-endian) + geometry content.

In [6]:
with open("data/bdgsf_classe_ru/bdgsf_classe_ru.shp", "rb") as f:
    header = f.read(100)
    code = struct.unpack(">i", header[0:4])[0]
    file_len = struct.unpack(">i", header[24:28])[0]
    shape_type = struct.unpack("<i", header[28:32])[0]
    print(f"File code: {code}, file length (16-bit words): {file_len}, shape type: {shape_type}")
    records = []
    pos = 100
    for _ in range(5):
        rec_h = f.read(8)
        rec_num = struct.unpack(">i", rec_h[0:4])[0]
        cont_len = struct.unpack(">i", rec_h[4:8])[0]
        records.append((rec_num, cont_len))
        pos += 8 + cont_len
        f.seek(pos)
pd.DataFrame(records, columns=["record_number", "content_length"])

File code: 9994, file length (16-bit words): 1275106, shape type: 1000


error: unpack requires a buffer of 4 bytes

### .prj — projection (text)

WKT string defining CRS: NTF (Paris) / France II — EPSG:27582.

In [7]:
Path("data/bdgsf_classe_ru/bdgsf_classe_ru.prj").read_text(encoding="utf-8", errors="replace")

'PROJCS["NTF (Paris) / France II", GEOGCS["NTF (Paris)", DATUM["Nouvelle Triangulation Francaise (Paris)", SPHEROID["Clarke 1880 (IGN)", 6378249.2, 293.4660212936269, AUTHORITY["EPSG","7011"]], AUTHORITY["EPSG","6807"]], PRIMEM["Paris", 2.5969213, AUTHORITY["EPSG","8903"]], UNIT["grad", 0.015707963267948963], AXIS["Geodetic latitude", NORTH], AXIS["Geodetic longitude", EAST], AUTHORITY["EPSG","4807"]], PROJECTION["Lambert_Conformal_Conic_1SP", AUTHORITY["EPSG","9801"]], PARAMETER["central_meridian", 0.0], PARAMETER["latitude_of_origin", 52.0], PARAMETER["scale_factor", 0.99987742], PARAMETER["false_easting", 600000.0], PARAMETER["false_northing", 2200000.0], UNIT["m", 1.0], AXIS["Easting", EAST], AXIS["Northing", NORTH], AUTHORITY["EPSG","27582"]]'

### .cst — encoding (text)

Single line: character set for .dbf attribute strings.

In [8]:
Path("data/bdgsf_classe_ru/bdgsf_classe_ru.cst").read_text(encoding="utf-8")

'ISO-8859-1'

In [5]:
import pyarrow.parquet as pq

parquet_file = pq.ParquetFile("/Users/thomas/study/visu/hydragri/data/rpg_2023/RPG2023_sol_climat.parquet")
print("Number of row groups:", parquet_file.num_row_groups)
print("Number of rows:", parquet_file.metadata.num_rows)
print("Number of columns:", parquet_file.metadata.num_columns)
print("Schema:")
print(parquet_file.schema)
print("\nMetadata:")
print(parquet_file.metadata)


Number of row groups: 10
Number of rows: 9797405
Number of columns: 27
Schema:
<pyarrow._parquet.ParquetSchema object at 0x11754bfc0>
required group field_id=-1 schema {
  optional binary field_id=-1 id_parcel (String);
  optional binary field_id=-1 com_parc (String);
  optional double field_id=-1 pct_com;
  optional binary field_id=-1 dep_parc (String);
  optional binary field_id=-1 reg_parc (String);
  optional double field_id=-1 alt_mean;
  optional double field_id=-1 alt_min;
  optional double field_id=-1 alt_max;
  optional double field_id=-1 pente_mean;
  optional double field_id=-1 expo_mean;
  optional binary field_id=-1 expo (String);
  optional int32 field_id=-1 mf_lambx (Int(bitWidth=16, isSigned=true));
  optional int32 field_id=-1 mf_lamby (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 mf_lambxy (String);
  optional int32 field_id=-1 mf_maille (Int(bitWidth=16, isSigned=true));
  optional binary field_id=-1 smu_1 (String);
  optional float field_id=-1 par