In [1]:
import pandas as pd

df = pd.read_csv("global.csv")


In [2]:
df["has_REE"] = df["REE_Mins"].notna()

In [3]:
null_counts = df.isnull().sum() 
null_ratios = null_counts / len(df)
print(null_ratios)

OBJECTID      0.000000
ID_No         0.000000
Name          0.000000
Name_Other    0.056840
Components    0.057161
                ...   
RR_Refs       0.826911
RR_RegCode    0.835902
RR_Note       0.961143
Region        0.000000
has_REE       0.000000
Length: 63, dtype: float64


In [4]:
null_ratios.index

Index(['OBJECTID', 'ID_No', 'Name', 'Name_Other', 'Components', 'Part_of',
       'Rec_Type', 'Rec_Note', 'Country', 'State_Prov', 'Latitude',
       'Longitude', 'Loc_Note', 'Dep_Type', 'Dep_Note', 'Dep_Form', 'Commods',
       'HREE_Note', 'LREE_Note', 'REE_Ratio', 'REE', 'Status', 'Stat_Note',
       'REE_Mins', 'Sig_Mins', 'Oth_Mins', 'Age_Mzn', 'Age_Ma', 'Host_Age',
       'HAge_Ma', 'Host_Lith', 'Host_Unit', 'Assoc_Rock', 'Alteration',
       'Company', 'Comments', 'Ref_List', 'Discov_Yr', 'Expl_Note',
       'Mine_Meth', 'P_Status', 'PStat_Note', 'P_Years', 'P_refs', 'P_Note',
       'RR_Ore_Mt', 'RR_TREO_Mt', 'RR_TREOgrd', 'RR_REE_grd', 'RR_Cutoff',
       'RR_HM_Mt', 'RR_HM_pct', 'RR_min_Mt', 'RR_min_pct', 'RR_mon_Mt',
       'RR_mon_pct', 'RR_oth_grd', 'RR_Yr_Est', 'RR_Refs', 'RR_RegCode',
       'RR_Note', 'Region', 'has_REE'],
      dtype='str')

In [5]:
valid_columns = null_ratios[null_ratios < 0.1].index
print(valid_columns)

Index(['OBJECTID', 'ID_No', 'Name', 'Name_Other', 'Components', 'Part_of',
       'Rec_Type', 'Rec_Note', 'Country', 'State_Prov', 'Latitude',
       'Longitude', 'Loc_Note', 'Dep_Type', 'Dep_Note', 'Dep_Form', 'Commods',
       'HREE_Note', 'LREE_Note', 'REE_Ratio', 'REE', 'Status', 'Stat_Note',
       'Ref_List', 'P_Status', 'Region', 'has_REE'],
      dtype='str')


In [None]:
# df['has_ree'] = (
#     df['commod1']
#     .str.contains(r'REE|rare\s*earth|lanthanide|REO', case=False, na=False)
#     .astype(int)
# )


In [6]:
df = df[valid_columns]
print(df.head())

   OBJECTID  ID_No            Name                           Name_Other  \
0         1   4190          Abenab                                        
1         2   4050       Abu Khruq                      Gabal Abu Khruq   
2         3   4052      Abu Tartar  Abu Tartour, Abu Tartor, Abu Tartur   
3         4   4126       Adiounedj                                        
4         5   4191  Agate Mountain                                        

  Components Part_of              Rec_Type       Rec_Note  Country  \
0                     intrusion or complex                 Namibia   
1                     intrusion or complex                   Egypt   
2                                     site  active P mine    Egypt   
3                     intrusion or complex                    Mali   
4                     intrusion or complex                 Namibia   

         State_Prov  ...  HREE_Note  LREE_Note REE_Ratio  \
0      Otjozondjupa  ...                                   
1  Al Ba

In [7]:
df.to_csv("global_clean.csv")

In [8]:
df.columns

Index(['OBJECTID', 'ID_No', 'Name', 'Name_Other', 'Components', 'Part_of',
       'Rec_Type', 'Rec_Note', 'Country', 'State_Prov', 'Latitude',
       'Longitude', 'Loc_Note', 'Dep_Type', 'Dep_Note', 'Dep_Form', 'Commods',
       'HREE_Note', 'LREE_Note', 'REE_Ratio', 'REE', 'Status', 'Stat_Note',
       'Ref_List', 'P_Status', 'Region', 'has_REE'],
      dtype='str')

In [9]:
#Spatial distribution
df[['Latitude','Longitude']].describe()


Unnamed: 0,Latitude,Longitude
count,3114.0,3114.0
mean,20.331028,44.40145
std,33.792098,80.636301
min,-79.137,-166.5
25%,-15.58145,6.696725
50%,29.72765,68.076275
75%,48.839,110.0368
max,72.86,175.867


In [10]:
df['Region'].value_counts(normalize=True)


Region
South and Central Asia    0.170841
Oceania                   0.161850
North America             0.135838
Europe                    0.122993
East Asia                 0.114965
South America             0.081246
Africa                    0.075466
China                     0.068722
Russian Federation        0.047206
Middle East               0.020231
Antarctica                0.000642
Name: proportion, dtype: float64

In [11]:
#Rec_Type distribution
df['Rec_Type'].value_counts()


Rec_Type
district or area           1316
site                        723
intrusion or complex        528
site(?)                     320
district or area(?)         141
intrusion or complex(?)      86
Name: count, dtype: int64

## Column Descriptions – Global REE Dataset

### Identification & Naming
- **OBJECTID**: Internal database identifier (no geological meaning).
- **ID_No**: Unique dataset-specific identifier for each REE record.
- **Name**: Primary name of the deposit, occurrence, or mineralized complex.
- **Name_Other**: Alternative or historical names used in literature.

---

### Mineral System Structure & Hierarchy
- **Components**: Sub-units belonging to this record; indicates a composite mineral system.
- **Part_of**: Indicates the record is part of a larger mineral system or complex.
- **Rec_Type**: Record classification (e.g., site, intrusion, district).
- **Rec_Note**: Free-text notes describing geology, development stage, or mining status.

---

### Location & Geography
- **Country**: Country where the REE site is located (administrative).
- **State_Prov**: State or province of the site.
- **Latitude**: Geographic latitude (decimal degrees).
- **Longitude**: Geographic longitude (decimal degrees).
- **Loc_Note**: Notes on location accuracy or descriptive placement.
- **Region**: Broad continental region (e.g., Africa, Asia).

---

### Deposit Geology & Mineralization
- **Dep_Type**: Deposit type (e.g., carbonatite, alkaline intrusion, placer).
- **Dep_Note**: Geological description of the deposit (host rock, alteration, mineralogy).
- **Dep_Form**: Physical or genetic form of mineralization (e.g., vein, disseminated, placer).
- **Commods**: Commodities present (REEs and associated elements such as Nb, Ta, Th, P).

---

### REE-Specific Information
- **REE**: Indicates presence of rare earth elements as a commodity.
- **HREE_Note**: Notes specific to heavy rare earth element (HREE) enrichment.
- **LREE_Note**: Notes specific to light rare earth element (LREE) enrichment.
- **REE_Ratio**: Qualitative indication of HREE vs LREE dominance.

---

### Development & Status
- **Status**: General development status (occurrence, prospect, mine, producer).
- **Stat_Note**: Additional notes on development or production history.
- **P_Status**: Project or production status (exploration, development, operating).

---

### References & Metadata
- **Ref_List**: References to reports, publications, or data sources.


### Feature Engineering

In [15]:
#Spatial features (standardized)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['lat_z', 'lon_z']] = scaler.fit_transform(df[['Latitude', 'Longitude']])


In [16]:
#Region (one-hot encoding)
region_dummies = pd.get_dummies(df['Region'], prefix='region')
df = pd.concat([df, region_dummies], axis=1)

In [17]:
#Mineral system hierarchy
df['is_composite_system'] = df['Components'].notna().astype(int)
df['is_part_of_complex'] = df['Part_of'].notna().astype(int)

In [18]:
#Record type (scale of system)
rec_type_dummies = pd.get_dummies(df['Rec_Type'], prefix='rec_type')
df = pd.concat([df, rec_type_dummies], axis=1)

In [19]:
def classify_system(row):
    text = ' '.join([
        str(row['Dep_Type']),
        str(row['Dep_Note']),
        str(row['Rec_Note'])
    ]).lower()
    
    if 'carbonatite' in text:
        return 'carbonatite'
    if 'alkaline' in text:
        return 'alkaline_intrusive'
    if 'placer' in text:
        return 'placer'
    if any(k in text for k in ['clay', 'laterite', 'ion adsorption']):
        return 'clay_laterite'
    if row['Dep_Type'] and str(row['Dep_Type']).strip().lower() != 'nan':
        return 'other'
    return 'unknown'

df['system_class'] = df.apply(classify_system, axis=1)

system_class_dummies = pd.get_dummies(df['system_class'], prefix='system')
df = pd.concat([df, system_class_dummies], axis=1)

In [20]:
#Physical form of mineralization
dep_form_dummies = pd.get_dummies(df['Dep_Form'], prefix='dep_form')
df = pd.concat([df, dep_form_dummies], axis=1)

In [21]:
#Commodity association features
def has_element(series, element):
    return series.str.contains(element, case=False, na=False).astype(int)

df['has_Nb'] = has_element(df['Commods'], 'Nb')
df['has_Ta'] = has_element(df['Commods'], 'Ta')
df['has_Th'] = has_element(df['Commods'], 'Th')
df['has_P']  = has_element(df['Commods'], 'P')

df['commodity_count'] = (
    df[['has_Nb', 'has_Ta', 'has_Th', 'has_P']]
    .sum(axis=1)
)

In [22]:
#REE fractionation indicators
df['is_hree_enriched'] = df['HREE_Note'].notna().astype(int)
df['is_lree_enriched'] = df['LREE_Note'].notna().astype(int)

def classify_ree_ratio(x):
    if pd.isna(x):
        return 'unknown'
    x = str(x).lower()
    if 'hree' in x:
        return 'hree'
    if 'lree' in x:
        return 'lree'
    if 'mixed' in x:
        return 'mixed'
    return 'unknown'

df['ree_ratio_class'] = df['REE_Ratio'].apply(classify_ree_ratio)

ree_ratio_dummies = pd.get_dummies(df['ree_ratio_class'], prefix='ree_ratio')
df = pd.concat([df, ree_ratio_dummies], axis=1)


In [23]:
df["has_REE"].value_counts()

has_REE
True     2026
False    1088
Name: count, dtype: int64

In [24]:
def keyword_flag(series, keywords):
    return series.str.contains('|'.join(keywords), case=False, na=False).astype(int)

df['is_producing'] = keyword_flag(
    df['Rec_Note'].fillna('') + df['Stat_Note'].fillna(''),
    ['producer', 'producing', 'mine']
)

df['is_prospect'] = keyword_flag(
    df['Status'].fillna('') + df['P_Status'].fillna(''),
    ['prospect']
)

df['is_occurrence_only'] = keyword_flag(
    df['Status'].fillna('') + df['Rec_Note'].fillna(''),
    ['occurrence']
)


In [25]:
gnn_features = (
    ['lat_z', 'lon_z',
     'is_composite_system', 'is_part_of_complex',
     'has_Nb', 'has_Ta', 'has_Th', 'has_P', 'commodity_count',
     'is_hree_enriched', 'is_lree_enriched',
     'is_producing', 'is_prospect', 'is_occurrence_only']
    + list(region_dummies.columns)
    + list(rec_type_dummies.columns)
    + list(system_class_dummies.columns)
    + list(dep_form_dummies.columns)
    + list(ree_ratio_dummies.columns)
)


In [26]:
X = df[gnn_features].astype(float)


In [27]:
X.head()

Unnamed: 0,lat_z,lon_z,is_composite_system,is_part_of_complex,has_Nb,has_Ta,has_Th,has_P,commodity_count,is_hree_enriched,...,"dep_form_veins, loads","dep_form_veins, nests","dep_form_veins, other(?)","dep_form_veins, pegmatite","dep_form_veins, pegmatites","dep_form_veins, pegmatites, dikes","dep_form_veins, stockwork","dep_form_veins, veinlets, stockworks",ree_ratio_hree,ree_ratio_unknown
0,-1.171922,-0.326226,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.127789,-0.125648,1.0,1.0,1.0,1.0,1.0,0.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.150233,-0.177799,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.004381,-0.549896,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-1.147841,-0.400883,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [28]:
X.shape, X.isnull().sum().sum()

((3114, 155), np.int64(0))

### Spatial k-NN edges (core graph backbone)

- Build k-nearest neighbors using lat/lon

In [29]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

coords = df[['Latitude', 'Longitude']].values

k = 10  # recommended: 5–15
nbrs = NearestNeighbors(n_neighbors=k+1, metric='haversine')
nbrs.fit(np.radians(coords))

distances, indices = nbrs.kneighbors(np.radians(coords))

edge_list = []

for i in range(indices.shape[0]):
    for j_idx, j in enumerate(indices[i][1:]):  # skip self
        edge_list.append((i, j))


In [None]:
#Same mineral system (Part_of)
# part_groups = df.dropna(subset=['Part_of']).groupby('Part_of').groups

# for _, nodes in part_groups.items():
#     nodes = list(nodes)
#     for i in nodes:
#         for j in nodes:
#             if i != j:
#                 edge_list.append((i, j))


In [None]:
#Same system class (carbonatite, alkaline, etc.)
# system_groups = df.groupby('system_class').groups

# for system, nodes in system_groups.items():
#     if system == 'unknown':
#         continue
#     nodes = list(nodes)
#     for i in nodes:
#         for j in nodes:
#             if i != j:
#                 edge_list.append((i, j))


In [None]:
#Same REE fractionation (HREE/LREE)
# ree_groups = df.groupby('ree_ratio_class').groups

# for ratio, nodes in ree_groups.items():
#     if ratio == 'unknown':
#         continue
#     nodes = list(nodes)
#     for i in nodes:
#         for j in nodes:
#             if i != j:
#                 edge_list.append((i, j))

In [33]:
edge_index = np.array(list(set(edge_list))).T
print(len(edge_index[0]) / len(df))
edge_index = np.concatenate([edge_index, edge_index[::-1]], axis=1)


10.0


In [30]:
y = df["has_REE"]

In [32]:
regions = df['Region'].unique()

region = regions[-1]  # choose region to hold out

test_idx = df['Region'] == region
train_idx = ~test_idx

train_idx = np.where(train_idx)[0]
test_idx  = np.where(test_idx)[0]


In [34]:
X_train = X.iloc[train_idx] if hasattr(X, "iloc") else X[train_idx]
X_test  = X.iloc[test_idx]  if hasattr(X, "iloc") else X[test_idx]

y_train = y[train_idx]
y_test  = y[test_idx]


In [35]:
import torch

num_nodes = len(df)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask  = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[train_idx] = True
test_mask[test_idx]   = True


In [36]:
y_test.value_counts()

has_REE
False    332
True     200
Name: count, dtype: int64

In [37]:
print("Train nodes:", len(train_idx))
print("Test nodes:", len(test_idx))
print("Positive rate (train):", y_train.mean())
print("Positive rate (test):", y_test.mean())


Train nodes: 2582
Test nodes: 532
Positive rate (train): 0.7072037180480247
Positive rate (test): 0.37593984962406013
