In [1]:
import pandas as pd

# Load the raw table from global.csv into `df`.
df = pd.read_csv("global.csv")


In [2]:
# Number of rows in the raw table.
df.shape[0]

3114

In [None]:
# Inspect the REE_mins column.
df.loc[:, "REE_mins"]

In [None]:
# Per-column missing-value counts, then the same expressed as a fraction of all rows.
null_counts = df.isna().sum()
null_ratios = null_counts.div(len(df))
print(null_ratios)

In [None]:
# Column names covered by the null-ratio summary.
null_ratios.keys()

In [None]:
# Keep the columns whose missing-value fraction is strictly below 10 %.
is_mostly_filled = null_ratios.lt(0.1)
valid_columns = null_ratios.index[is_mostly_filled]
print(valid_columns)

In [None]:
# Flag rows whose `commod1` text mentions rare earths (case-insensitive; missing text -> 0).
# NOTE(review): `valid_columns` was computed before this column existed, so the
# later `df = df[valid_columns]` drops it; `has_ree` is rebuilt from `Commods`
# further down. Confirm whether this cell is still needed.
ree_pattern = r'REE|rare\s*earth|lanthanide|REO'
mentions_ree = df['commod1'].str.contains(ree_pattern, case=False, na=False)
df['has_ree'] = mentions_ree.astype(int)

In [None]:
# Class balance of the has_ree flag (number of rows per value).
df['has_ree'].value_counts()

In [None]:
# Keep only the columns that passed the null-ratio filter.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
# NOTE(review): later cells read Components, Part_of, HREE_Note, LREE_Note,
# REE_Ratio, Stat_Note, P_Status, ... - confirm those columns survive the
# null-ratio filter, otherwise those cells fail with KeyError.
df = df[valid_columns].copy()
print(df.head())

In [None]:
# Persist the column-filtered table to global_clean.csv.
# NOTE(review): no `index=` argument is passed, so with pandas' default the
# row index is presumably written as an extra leading column - confirm
# that downstream readers expect it.
df.to_csv("global_clean.csv")

In [None]:
# Columns that remain after the null-ratio filter.
df.columns

In [None]:
# Spatial distribution: summary statistics of the coordinate columns.
coordinate_columns = ['Latitude', 'Longitude']
df.loc[:, coordinate_columns].describe()


In [None]:
# Share of records per Region (normalize=True returns fractions rather than counts).
df['Region'].value_counts(normalize=True)


In [None]:
# Rec_Type distribution: number of records per record type.
df['Rec_Type'].value_counts()


## Column Descriptions – Global REE Dataset

### Identification & Naming
- **OBJECTID**: Internal database identifier (no geological meaning).
- **ID_No**: Unique dataset-specific identifier for each REE record.
- **Name**: Primary name of the deposit, occurrence, or mineralized complex.
- **Name_Other**: Alternative or historical names used in literature.

---

### Mineral System Structure & Hierarchy
- **Components**: Sub-units belonging to this record; indicates a composite mineral system.
- **Part_of**: Indicates the record is part of a larger mineral system or complex.
- **Rec_Type**: Record classification (e.g., site, intrusion, district).
- **Rec_Note**: Free-text notes describing geology, development stage, or mining status.

---

### Location & Geography
- **Country**: Country where the REE site is located (administrative).
- **State_Prov**: State or province of the site.
- **Latitude**: Geographic latitude (decimal degrees).
- **Longitude**: Geographic longitude (decimal degrees).
- **Loc_Note**: Notes on location accuracy or descriptive placement.
- **Region**: Broad continental region (e.g., Africa, Asia).

---

### Deposit Geology & Mineralization
- **Dep_Type**: Deposit type (e.g., carbonatite, alkaline intrusion, placer).
- **Dep_Note**: Geological description of the deposit (host rock, alteration, mineralogy).
- **Dep_Form**: Physical or genetic form of mineralization (e.g., vein, disseminated, placer).
- **Commods**: Commodities present (REEs and associated elements such as Nb, Ta, Th, P).

---

### REE-Specific Information
- **REE**: Indicates presence of rare earth elements as a commodity.
- **HREE_Note**: Notes specific to heavy rare earth element (HREE) enrichment.
- **LREE_Note**: Notes specific to light rare earth element (LREE) enrichment.
- **REE_Ratio**: Qualitative indication of HREE vs LREE dominance.

---

### Development & Status
- **Status**: General development status (occurrence, prospect, mine, producer).
- **Stat_Note**: Additional notes on development or production history.
- **P_Status**: Project or production status (exploration, development, operating).

---

### References & Metadata
- **Ref_List**: References to reports, publications, or data sources.


### Feature Engineering

In [None]:
#Spatial features (standardized)
from sklearn.preprocessing import StandardScaler

# Standardize latitude/longitude and store the result as lat_z / lon_z.
scaler = StandardScaler()
scaled_coords = scaler.fit_transform(df[['Latitude', 'Longitude']])
df[['lat_z', 'lon_z']] = scaled_coords


In [None]:
# Region: one indicator column per region value, prefixed with "region_".
region_dummies = pd.get_dummies(df['Region'], prefix='region')
df = pd.concat((df, region_dummies), axis='columns')

In [None]:
# Mineral system hierarchy: 1 when the source column holds any value, else 0.
for flag_name, source_column in (
    ('is_composite_system', 'Components'),
    ('is_part_of_complex', 'Part_of'),
):
    df[flag_name] = df[source_column].notna().astype(int)

In [None]:
# Record type (scale of system): one indicator column per Rec_Type value.
rec_type_dummies = pd.get_dummies(df['Rec_Type'], prefix='rec_type')
df = pd.concat((df, rec_type_dummies), axis='columns')

In [None]:
def classify_system(row):
    """Assign a coarse mineral-system class to one record.

    The Dep_Type, Dep_Note and Rec_Note values are stringified, joined and
    lower-cased; the first rule whose keyword occurs in that text wins, in
    the order carbonatite -> alkaline -> placer -> clay/laterite.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['Dep_Type']``, ``row['Dep_Note']`` and
        ``row['Rec_Note']`` (e.g. a DataFrame row).

    Returns
    -------
    str
        One of 'carbonatite', 'alkaline_intrusive', 'placer',
        'clay_laterite', 'other' or 'unknown'.
    """
    dep_type = row['Dep_Type']
    haystack = ' '.join(
        str(value) for value in (dep_type, row['Dep_Note'], row['Rec_Note'])
    ).lower()

    # Ordered rules: earlier entries take priority over later ones.
    rules = (
        ('carbonatite', ('carbonatite',)),
        ('alkaline_intrusive', ('alkaline',)),
        ('placer', ('placer',)),
        ('clay_laterite', ('clay', 'laterite', 'ion adsorption')),
    )
    for label, needles in rules:
        for needle in needles:
            if needle in haystack:
                return label

    # No keyword matched: 'other' when a deposit type is recorded at all
    # (a truthy value that does not stringify to 'nan'), otherwise 'unknown'.
    if dep_type and str(dep_type).strip().lower() != 'nan':
        return 'other'
    return 'unknown'

# Classify every record, then one-hot encode the class with a "system_" prefix.
df['system_class'] = df.apply(classify_system, axis='columns')

system_class_dummies = pd.get_dummies(df['system_class'], prefix='system')
df = pd.concat((df, system_class_dummies), axis='columns')

In [None]:
# Physical form of mineralization: one indicator column per Dep_Form value.
dep_form_dummies = pd.get_dummies(df['Dep_Form'], prefix='dep_form')
df = pd.concat((df, dep_form_dummies), axis='columns')

In [None]:
#Commodity association features
def has_element(series, element):
    """Flag entries that list `element` as a stand-alone symbol.

    The match is case-insensitive but anchored on non-letter boundaries, so a
    short symbol such as 'P' no longer fires on 'Pb' or 'phosphate', as the
    previous plain substring search did.

    Parameters
    ----------
    series : pandas.Series
        Text values (e.g. a commodity list); missing values count as no match.
    element : str
        Symbol to look for. It is still interpreted as a regular expression.

    Returns
    -------
    pandas.Series
        Integer 0/1 flags aligned with `series`.
    """
    # The non-capturing group keeps alternations inside `element` intact; the
    # look-arounds require that no letter touches the symbol on either side.
    pattern = r'(?<![A-Za-z])(?:' + element + r')(?![A-Za-z])'
    return series.str.contains(pattern, case=False, na=False).astype(int)

# One 0/1 flag per associated element, all derived from the Commods text.
associated_elements = ('Nb', 'Ta', 'Th', 'P')
for symbol in associated_elements:
    df['has_' + symbol] = has_element(df['Commods'], symbol)

# Number of associated elements present in each record.
element_flag_columns = ['has_' + symbol for symbol in associated_elements]
df['commodity_count'] = df[element_flag_columns].sum(axis=1)

In [None]:
# REE fractionation indicators: 1 when the corresponding note column holds any value.
for flag_name, note_column in (
    ('is_hree_enriched', 'HREE_Note'),
    ('is_lree_enriched', 'LREE_Note'),
):
    df[flag_name] = df[note_column].notna().astype(int)

def classify_ree_ratio(x):
    """Map a free-text REE ratio value onto a coarse class.

    Parameters
    ----------
    x : object
        Raw value; missing values (as judged by ``pd.isna``) are 'unknown'.

    Returns
    -------
    str
        'hree', 'lree' or 'mixed' - the first of these (in that order) that
        occurs in the lower-cased text - otherwise 'unknown'.
    """
    if not pd.isna(x):
        lowered = str(x).lower()
        # Order matters: 'hree' is checked before 'lree', then 'mixed'.
        for label in ('hree', 'lree', 'mixed'):
            if label in lowered:
                return label
    return 'unknown'

# Classify each REE_Ratio value, then one-hot encode the class ("ree_ratio_" prefix).
df['ree_ratio_class'] = df['REE_Ratio'].map(classify_ree_ratio)

ree_ratio_dummies = pd.get_dummies(df['ree_ratio_class'], prefix='ree_ratio')
df = pd.concat((df, ree_ratio_dummies), axis='columns')

# 1 when the Commods text contains "REE" (case-insensitive); missing text -> 0.
mentions_ree = df['Commods'].str.contains(r'REE', case=False, na=False)
df['has_ree'] = mentions_ree.astype(int)


In [None]:
# Class balance of the Commods-based has_ree flag.
df["has_ree"].value_counts()

In [None]:
def keyword_flag(series, keywords):
    """Flag entries that contain any of `keywords` as a whole word.

    Matching is case-insensitive and anchored on word boundaries, with an
    optional simple inflection suffix (s, d, es, ed). 'mine' therefore
    matches 'mine', 'mines' and 'mined' but no longer 'mineralization' or
    'determined', as the previous plain substring search did.

    Parameters
    ----------
    series : pandas.Series
        Text values; missing values count as no match.
    keywords : iterable of str
        Alternatives to look for. They are still interpreted as regular
        expression fragments. Empty strings are ignored.

    Returns
    -------
    pandas.Series
        Integer 0/1 flags aligned with `series`. All zeros when no usable
        keyword is given (an empty pattern would otherwise match every row).
    """
    terms = [kw for kw in keywords if kw]
    if not terms:
        return pd.Series(0, index=series.index).astype(int)
    pattern = r'\b(?:' + '|'.join(terms) + r')(?:s|d|es|ed)?\b'
    return series.str.contains(pattern, case=False, na=False).astype(int)

# Development-status flags derived from free-text columns.
# The two text fields are joined with a space so that the last word of the
# first field and the first word of the second cannot fuse into one token
# (which could hide a keyword or fabricate one across the field boundary).
df['is_producing'] = keyword_flag(
    df['Rec_Note'].fillna('') + ' ' + df['Stat_Note'].fillna(''),
    ['producer', 'producing', 'mine']
)

df['is_prospect'] = keyword_flag(
    df['Status'].fillna('') + ' ' + df['P_Status'].fillna(''),
    ['prospect']
)

df['is_occurrence_only'] = keyword_flag(
    df['Status'].fillna('') + ' ' + df['Rec_Note'].fillna(''),
    ['occurrence']
)


In [None]:
# Node feature columns for the GNN.
# `has_ree` is deliberately NOT listed: it is the prediction target
# (y = df["has_ree"]), so keeping it in the feature matrix would leak the
# label into the inputs.
gnn_features = (
    ['lat_z', 'lon_z',
     'is_composite_system', 'is_part_of_complex',
     'has_Nb', 'has_Ta', 'has_Th', 'has_P', 'commodity_count',
     'is_hree_enriched', 'is_lree_enriched',
     'is_producing', 'is_prospect', 'is_occurrence_only']
    + list(region_dummies.columns)
    + list(rec_type_dummies.columns)
    + list(system_class_dummies.columns)
    + list(dep_form_dummies.columns)
    + list(ree_ratio_dummies.columns)
)


In [None]:
# Feature matrix: the selected columns cast to floating point.
X = df.loc[:, gnn_features].astype('float64')


In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Sanity check: matrix dimensions and the total number of missing cells in X.
X.shape, X.isnull().sum().sum()

### Spatial k-NN edges (core graph backbone)

- Build a k-nearest-neighbour graph from each site's latitude/longitude, using great-circle (haversine) distance.

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Spatial k-NN edges: connect every site to its k nearest sites.
coords = df[['Latitude', 'Longitude']].values

k = 10  # recommended: 5–15

# The haversine metric works on [latitude, longitude] pairs given in radians.
coords_rad = np.radians(coords)

# Ask for one extra neighbour because a point is its own nearest neighbour;
# never ask for more neighbours than there are points.
nbrs = NearestNeighbors(n_neighbors=min(k + 1, len(coords)), metric='haversine')
nbrs.fit(coords_rad)

distances, indices = nbrs.kneighbors(coords_rad)

edge_list = []

for i, neighbours in enumerate(indices):
    # Drop the node itself by identity rather than by position: with
    # duplicated coordinates the first hit is not guaranteed to be `i`,
    # which would leave a self-loop and discard a genuine neighbour.
    targets = [j for j in neighbours if j != i][:k]
    edge_list.extend((i, j) for j in targets)


In [None]:
#Same mineral system (Part_of)
# part_groups = df.dropna(subset=['Part_of']).groupby('Part_of').groups

# for _, nodes in part_groups.items():
#     nodes = list(nodes)
#     for i in nodes:
#         for j in nodes:
#             if i != j:
#                 edge_list.append((i, j))


In [None]:
#Same system class (carbonatite, alkaline, etc.)
# system_groups = df.groupby('system_class').groups

# for system, nodes in system_groups.items():
#     if system == 'unknown':
#         continue
#     nodes = list(nodes)
#     for i in nodes:
#         for j in nodes:
#             if i != j:
#                 edge_list.append((i, j))


In [None]:
#Same REE fractionation (HREE/LREE)
# ree_groups = df.groupby('ree_ratio_class').groups

# for ratio, nodes in ree_groups.items():
#     if ratio == 'unknown':
#         continue
#     nodes = list(nodes)
#     for i in nodes:
#         for j in nodes:
#             if i != j:
#                 edge_list.append((i, j))

In [None]:
# Unique directed edges as a (2, n_edges) array; reshape keeps that shape
# even when edge_list is empty.
directed_edges = np.array(sorted(set(edge_list)), dtype=np.int64).reshape(-1, 2).T

# Average number of unique directed edges per node.
print(directed_edges.shape[1] / len(df))

# Make the graph undirected by adding every reversed edge, then de-duplicate
# again: mutual neighbours already have both (i, j) and (j, i), so a plain
# concatenation would store those edges twice.
edge_index = np.unique(
    np.concatenate([directed_edges, directed_edges[::-1]], axis=1),
    axis=1,
)


In [None]:
# Class balance of has_ree before building the target vector.
df["has_ree"].value_counts()

In [None]:
# Target vector: the has_ree flag of every record.
y = df["has_ree"]

In [None]:
# Region hold-out split: one whole region becomes the test set.
# Missing regions are excluded from the candidates; comparing against NaN is
# never true, so picking NaN would silently yield an empty test set.
regions = df['Region'].dropna().unique()
if len(regions) == 0:
    raise ValueError("No non-missing Region values available for a hold-out split")

region = regions[-1]  # choose region to hold out

is_test_row = (df['Region'] == region).to_numpy()

# Positional (0-based) row numbers of each split. Rows with a missing Region
# fall into the training split.
train_idx = np.where(~is_test_row)[0]
test_idx  = np.where(is_test_row)[0]


In [None]:
def _take_rows(obj, positions):
    """Select rows by 0-based position from a pandas object or a plain array."""
    return obj.iloc[positions] if hasattr(obj, "iloc") else obj[positions]

# train_idx / test_idx are positional, so both X and y must be indexed by
# position. Plain `y[train_idx]` on a Series looks up index *labels*, which
# only coincides with positions while the index is the default 0..n-1 range.
X_train = _take_rows(X, train_idx)
X_test  = _take_rows(X, test_idx)

y_train = _take_rows(y, train_idx)
y_test  = _take_rows(y, test_idx)


In [None]:
import torch

# Boolean node masks: one entry per row of df, True at the positions of each split.
num_nodes = len(df)

train_mask, test_mask = (
    torch.zeros(num_nodes, dtype=torch.bool) for _ in range(2)
)

train_mask[train_idx] = True
test_mask[test_idx] = True


In [None]:
# Class balance inside the held-out region.
y_test.value_counts()

In [None]:
# Split sizes and the share of positive labels in each split.
split_summary = (
    ("Train nodes:", len(train_idx)),
    ("Test nodes:", len(test_idx)),
    ("Positive rate (train):", y_train.mean()),
    ("Positive rate (test):", y_test.mean()),
)
for label, value in split_summary:
    print(label, value)
