# 🐍 Machine Learning Project Script

In [1]:
import pandas as pd
import ast
import networkx as nx
from sklearn.preprocessing import MinMaxScaler

# 1. Upload data

In [2]:
df_train_raw = pd.read_csv("train.csv", sep=',')
df_train_raw["edgelist"].head(1)
df_train_raw["edgelist"] = df_train_raw["edgelist"].apply(ast.literal_eval)

df_test_raw = pd.read_csv("test.csv", sep=',')
df_test_raw["edgelist"].head(1)
df_test_raw["edgelist"] = df_test_raw["edgelist"].apply(ast.literal_eval)

In [3]:
print(df_train_raw.columns)


Index(['language', 'sentence', 'n', 'edgelist', 'root'], dtype='object')


# 2. Pre-Processing


In [3]:
import networkx as nx
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from itertools import combinations
from community import community_louvain  # pip install python-louvain

# Feature processing and normalization
def normalize_group(df_group):
    numeric_cols = [
        'deg', 'degree', 'closeness', 'betweenness', 'pagerank',
        'eigenvector', 'harmonic', 'eccentricity', 
        'clustering', 'avg_neighbor_degree', 'community'
    ]
    scaler = MinMaxScaler()
    df_group[numeric_cols] = scaler.fit_transform(df_group[numeric_cols])
    return df_group

def pre_processing(data):
    training_data = []
    
    for idx, row in data.iterrows():
        edgelist = row["edgelist"]
        T = nx.Graph()
        T.add_edges_from(edgelist)

        if not nx.is_connected(T):
            continue
        
        root_node = row.get("root", None)
        
        # Essential centrality measures
        closeness = nx.closeness_centrality(T)
        betweenness = nx.betweenness_centrality(T)
        pagerank = nx.pagerank(T, max_iter=1000)
        deg_centrality = nx.degree_centrality(T)
        
        # Robust eigenvector computation
        try:
            eigenvector = nx.eigenvector_centrality(T, max_iter=10000, tol=1e-06)
        except nx.PowerIterationFailedConvergence:
            eigenvector = {n: 0.0 for n in T.nodes}
            
        harmonic = nx.harmonic_centrality(T)
        
        # Structural properties
        degree = dict(T.degree())
        eccentricity = nx.eccentricity(T)
        clustering = nx.clustering(T)
        avg_neighbor_degree = nx.average_neighbor_degree(T)
        
        # Community detection
        partition = community_louvain.best_partition(T)
        
        for v in T.nodes:
            features = {
                "sentence": row["sentence"],
                "language": row["language"],
                "n": row["n"],
                "node": v,
                "deg": deg_centrality[v],
                "degree": degree[v],
                "closeness": closeness[v],
                "betweenness": betweenness[v],
                "pagerank": pagerank[v],
                "eigenvector": eigenvector[v],
                "harmonic": harmonic[v],
                "eccentricity": eccentricity[v],
                "clustering": clustering[v],
                "avg_neighbor_degree": avg_neighbor_degree[v],
                "community": partition[v],
            }
            
            if "id" in row:
                features["id"] = row["id"]

            if root_node is not None:
                features["is_root"] = 1 if v == root_node else 0

            training_data.append(features)

    training_data = pd.DataFrame(training_data)
    
    df_normalized = training_data.groupby(["sentence", "language"], group_keys=True).apply(
        normalize_group, include_groups=False
    )
    df_normalized.reset_index(inplace=True)
    df_normalized.drop(columns=["level_2"], inplace=True)

    return df_normalized


In [4]:
# Load and preprocess data
df_train = pre_processing(df_train_raw.copy())
features = [col for col in df_train.columns if col not in ['sentence', 'language', 'node', 'is_root', 'n', 'group_id']]

# 3. Models

**K-Fold Cross Validation**

In [None]:
import networkx as nx
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier 


def enhanced_training_pipeline(df, features, n_folds=5):
    df['group_id'] = df["sentence"].astype(str) + '_' + df["language"]
    X = df[features]
    y = df['is_root']
    groups = df["group_id"]

    # Simplified hyperparameter spaces
    models = {
        "RandomForest": {
            "model": RandomForestClassifier(class_weight='balanced', random_state=42),
            "params": {
                'n_estimators': [100, 200],
                'max_depth': [10, 20],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            }
        },
        # "XGBoost": {
        #     "model": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        #     "params": {
        #         'n_estimators': [100, 200],
        #         'max_depth': [3, 6],
        #         'learning_rate': [0.05, 0.1],
        #         'subsample': [0.7, 1.0]
        #     }
        # },
        "DecisionTree": {  
            "model": DecisionTreeClassifier(class_weight='balanced', random_state=42),
            "params": {
                'max_depth': [5, 10, 20],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            }
        }
    }

    results = {}

    for model_name, config in models.items():
        print(f"\n=== Tuning {model_name} ===")
        # SMOTE resampling on entire dataset
        smote = SMOTE(sampling_strategy='minority', random_state=42)
        X_res, y_res = smote.fit_resample(X, y)

        # Feature selection on resampled data
        selector = SelectFromModel(
            estimator=RandomForestClassifier(n_estimators=100, random_state=42),
            max_features=10
        ).fit(X_res, y_res)

        X_fs = selector.transform(X_res)

        # Hyperparameter tuning on full resampled + selected features
        search = RandomizedSearchCV(
            config["model"],
            config["params"],
            n_iter=5,
            scoring='f1',
            cv=3,
            random_state=42,
            n_jobs=-1
        )
        search.fit(X_fs, y_res)
        best_params = search.best_params_

        print(f"Best params found: {best_params}")

        # Now perform cross-validation with fixed best params
        fold_metrics = {'acc': [], 'prec': [], 'rec': [], 'f1': []}
        cv = StratifiedGroupKFold(n_splits=n_folds)

        for train_idx, val_idx in cv.split(X, y, groups=groups):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # Apply SMOTE on training fold only
            smote_fold = SMOTE(sampling_strategy='minority', random_state=42)
            X_train_res, y_train_res = smote_fold.fit_resample(X_train, y_train)

            # Apply previously fit selector to training and validation
            X_train_fs = selector.transform(X_train_res)
            X_val_fs = selector.transform(X_val)

            # Train model with best hyperparameters
            model = config["model"].set_params(**best_params)
            model.fit(X_train_fs, y_train_res)

            # Prepare val_df for evaluation
            val_df = df.iloc[val_idx].copy()
            val_df['proba'] = model.predict_proba(X_val_fs)[:, 1]

            predicted_roots = val_df.loc[val_df.groupby(['sentence', 'language'])['proba'].idxmax()]
            true_roots = val_df[val_df['is_root'] == 1]

            merged = predicted_roots.merge(
                true_roots,
                on=['sentence', 'language'],
                suffixes=('_pred', '_true')
            )

            correct = merged['node_pred'] == merged['node_true']
            acc = correct.mean()
            prec, rec, f1, _ = precision_recall_fscore_support(
                correct, [True] * len(correct), average='binary'
            )

            fold_metrics['acc'].append(acc)
            fold_metrics['prec'].append(prec)
            fold_metrics['rec'].append(rec)
            fold_metrics['f1'].append(f1)

        results[model_name] = {
            'mean_f1': np.mean(fold_metrics['f1']),
            'mean_precision': np.mean(fold_metrics['prec']),
            'mean_recall': np.mean(fold_metrics['rec']),
            'best_params': best_params,
            'selector': selector
        }

        print(f"Mean F1: {results[model_name]['mean_f1']:.4f}")

    # Train final model on full dataset with best model and best params
    best_model_name = max(results.items(), key=lambda x: x[1]['mean_f1'])[0]
    best_config = models[best_model_name]
    best_params = results[best_model_name]['best_params']
    final_selector = results[best_model_name]['selector']

    print(f"\nTraining final {best_model_name} model...")

    smote = SMOTE(sampling_strategy='minority', random_state=42)
    X_res, y_res = smote.fit_resample(X, y)
    X_fs = final_selector.transform(X_res)

    final_model = best_config["model"].set_params(**best_params)
    final_model.fit(X_fs, y_res)

    return results, best_model_name, final_model, final_selector


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/tania_priv/Documents/ProjectML/.venv/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <89AD948E-E564-3266-867D-7AF89D6488F0> /Users/tania_priv/Documents/ProjectML/.venv/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file)"]


In [None]:
# Train model
results, best_model_name, best_model, feature_selector = enhanced_training_pipeline(df_train, features)


=== Tuning RandomForest ===


In [None]:
# Process test data
df_test_processed = pre_processing(df_test_raw.copy())
X_test = df_test_processed[features]
X_test_fs = feature_selector.transform(X_test)

# Predict
df_test_processed['pred_proba'] = best_model.predict_proba(X_test_fs)[:, 1]
predicted_roots = df_test_processed.loc[
    df_test_processed.groupby('id')['pred_proba'].idxmax()
]

# Format output
output_df = predicted_roots[['id', 'node']].rename(columns={'node': 'root'})
output_df = output_df.sort_values('id').reset_index(drop=True)
output_df.to_csv('predicted_roots.csv', index=False)

print(y_pred)

[0 0 0 ... 0 0 0]
