# 🐍 Machine Learning Project Script

In [3]:
import pandas as pd
import ast
import networkx as nx
from sklearn.preprocessing import MinMaxScaler

# 1. Upload data

In [4]:
df_train_raw = pd.read_csv("train.csv", sep=',')
df_train_raw["edgelist"].head(1)
df_train_raw["edgelist"] = df_train_raw["edgelist"].apply(ast.literal_eval)

# 2. Pre-Processing


In [5]:
import networkx as nx
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def normalize_group(df_group):
    # Normalize all numeric centrality and graph metrics
    numeric_cols = [
        'deg', 'degree', 'closeness', 'betweenness', 'pagerank',
        'eigenvector', 'katz', 'eccentricity', 'clustering', 'distance_to_root'
    ]
    scaler = MinMaxScaler()
    df_group[numeric_cols] = scaler.fit_transform(df_group[numeric_cols])
    return df_group

def pre_processing(data):
    training_data = []

    for idx, row in data.iterrows():
        edgelist = row["edgelist"]

        T = nx.from_edgelist(edgelist)

        # Skip disconnected graphs
        if not nx.is_connected(T):
            continue

        children = set(v for _, v in edgelist)
        parents = set(v for v, _ in edgelist)
        root_candidates = list(parents - children)
        root = root_candidates[0] if root_candidates else list(T.nodes)[0]

        # Compute centralities
        deg_centrality = nx.degree_centrality(T)
        closeness = nx.closeness_centrality(T)
        betweenness = nx.betweenness_centrality(T)
        pagerank = nx.pagerank(T, max_iter=1000)
        try:
            eigenvector = nx.eigenvector_centrality(T, max_iter=10000, tol=1e-06)
        except nx.PowerIterationFailedConvergence:
            eigenvector = {n: 0.0 for n in T.nodes}
        try:
            katz = nx.katz_centrality(T, alpha=0.1)
        except nx.NetworkXException:
            katz = {n: 0.0 for n in T.nodes}

        # Additional graph properties
        degree = dict(T.degree())
        eccentricity = nx.eccentricity(T)
        clustering = nx.clustering(T)
        shortest_paths = nx.shortest_path_length(T, source=root)

        for v in T.nodes:
            features = {
                "sentence": row["sentence"],
                "language": row["language"],
                "n": row["n"],
                "node": v,
                "deg": deg_centrality[v],
                "degree": degree[v],
                "closeness": closeness[v],
                "betweenness": betweenness[v],
                "pagerank": pagerank[v],
                "eigenvector": eigenvector[v],
                "katz": katz[v],
                "eccentricity": eccentricity[v],
                "clustering": clustering[v],
                "distance_to_root": shortest_paths[v],
                "is_root": 1 if v == root else 0
            }
            training_data.append(features)

    training_data = pd.DataFrame(training_data)

    # Normalize the numeric features by group
    df_normalized = training_data.groupby(["sentence", "language"], group_keys=True).apply(
        normalize_group, include_groups=False
    )
    df_normalized.reset_index(inplace=True)
    df_normalized.drop(columns=["level_2"], inplace=True)

    return df_normalized


In [6]:
df_train = pre_processing(df_train_raw)

KeyboardInterrupt: 

# 3. Models

**K-Fold Cross Validation**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# You may want to fix model key in MLPClassifier entry:
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(scale_pos_weight=None, use_label_encoder=False, eval_metric='logloss'),
    "SVM": SVC(class_weight='balanced', probability=True, kernel='rbf'),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(64, 32), early_stopping=True)
}

def enhanced_training_pipeline(df, features, n_folds=10):
    # Create group_id for grouping by sentence and language (adapt column names if needed)
    df['group_id'] = df["sentence"].astype(str) + '_' + df["language"]

    X = df[features]
    y = df['is_root']
    groups = df["group_id"]

    # Compute scale_pos_weight dynamically for XGBoost inside loop per fold (better for imbalance)
    results = {}

    for model_name, model in models.items():
        print(f"\n=== Evaluating {model_name} ===")
        fold_metrics = {'acc': [], 'prec': [], 'rec': [], 'f1': []}

        for fold, (train_idx, val_idx) in enumerate(GroupKFold(n_splits=n_folds).split(X, y, groups=groups)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # For XGBoost update scale_pos_weight based on training fold
            if model_name == "XGBoost":
                pos = y_train.sum()
                neg = len(y_train) - pos
                model.set_params(scale_pos_weight=neg / pos if pos > 0 else 1)
                X_train_fs, X_val_fs = X_train, X_val
            else:
                # Feature selection with logistic regression for other models
                max_feats = min(15, X_train.shape[1])
                selector = SelectFromModel(
                    estimator=LogisticRegression(max_iter=1000, class_weight='balanced'),
                    max_features=max_feats
                ).fit(X_train, y_train)
                X_train_fs = selector.transform(X_train)
                X_val_fs = selector.transform(X_val)

            sample_weights = compute_sample_weight(
                class_weight='balanced',
                y=y_train
            )

            # Fit model
            if model_name == "Neural Network":
                model.fit(X_train_fs, y_train)
            else:
                model.fit(X_train_fs, y_train, sample_weight=sample_weights)

            val_df = df.iloc[val_idx].copy()
            val_df['proba'] = model.predict_proba(X_val_fs)[:, 1]

            # For each sentence-language group, pick node with highest probability as predicted root
            predicted_roots = val_df.loc[val_df.groupby(['sentence', 'language'])['proba'].idxmax()]
            true_roots = val_df[val_df['is_root'] == 1]

            # Merge predicted roots with true roots by group
            merged = predicted_roots.merge(
                true_roots, 
                on=['sentence', 'language'], 
                suffixes=('_pred', '_true')
            )

            # Correct prediction if predicted node matches true root node
            correct = merged['node_pred'] == merged['node_true']
            acc = correct.mean()

            fold_metrics['acc'].append(acc)
            fold_metrics['prec'].append(precision_score(correct, [True]*len(correct)))
            fold_metrics['rec'].append(recall_score(correct, [True]*len(correct)))
            fold_metrics['f1'].append(f1_score(correct, [True]*len(correct)))

        results[model_name] = {
            'accuracy': np.mean(fold_metrics['acc']),
            'precision': np.mean(fold_metrics['prec']),
            'recall': np.mean(fold_metrics['rec']),
            'f1': np.mean(fold_metrics['f1'])
        }

        print(f"Average Accuracy: {results[model_name]['accuracy']:.4f}")
        print(f"Average F1: {results[model_name]['f1']:.4f}")

    best_model_name = max(results, key=lambda x: results[x]['f1'])
    print(f"\nBest model: {best_model_name}")

    # Fit best model on full data
    best_model = models[best_model_name]

    if best_model_name == "XGBoost":
        pos = y.sum()
        neg = len(y) - pos
        best_model.set_params(scale_pos_weight=neg / pos if pos > 0 else 1)

    best_model.fit(X[features], y)

    if best_model_name == "XGBoost":
        plt.figure(figsize=(10, 8))
        xgb.plot_importance(best_model, max_num_features=15)
        plt.title('Feature Importance')
        plt.show()

    return results, best_model_name, best_model

In [None]:
# === Run pipeline ===

features = [col for col in df_train.columns if col not in ['sentence', 'language', 'node', 'n', 'is_root', 'group_id']]

results, best_model_name, best_model = enhanced_training_pipeline(df_train, features)


NameError: name 'df_train' is not defined

In [None]:
print(features)
print(df_train[features].dtypes)



['deg', 'degree', 'closeness', 'betweenness', 'pagerank', 'eigenvector', 'katz', 'eccentricity', 'clustering', 'distance_to_root', 'group_id']
deg                 float64
degree              float64
closeness           float64
betweenness         float64
pagerank            float64
eigenvector         float64
katz                float64
eccentricity        float64
clustering          float64
distance_to_root    float64
group_id             object
dtype: object


In [None]:
# Cross-validation training function
def enhanced_GKFold_training(model, features, df, n=10):
    gkf = GroupKFold(n_splits=n)
    df['group_id'] = df["sentence"].astype(str) + '_' + df["language"]
    
    metrics = {
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1': [],
        'roc_auc': []
    }
    
    df['proba'] = 0  # Initialize column for storing probabilities
    
    for train_idx, val_idx in gkf.split(df, df['is_root'], groups=df["group_id"]):
        # Prepare data
        X_train = df.iloc[train_idx][features]
        y_train = df.iloc[train_idx]['is_root']
        X_val = df.iloc[val_idx][features]
        y_val = df.iloc[val_idx]['is_root']
        
        # Train and predict
        model.fit(X_train, y_train)
        y_proba = model.predict_proba(X_val)[:, 1]
        
        # Store probabilities for tree-level evaluation
        df.loc[val_idx, 'proba'] = y_proba
        
        # Node-level metrics
        y_pred = model.predict(X_val)
        metrics['accuracy'].append(accuracy_score(y_val, y_pred))
        metrics['precision'].append(precision_score(y_val, y_pred, zero_division=0))
        metrics['recall'].append(recall_score(y_val, y_pred))
        metrics['f1'].append(f1_score(y_val, y_pred))
        metrics['roc_auc'].append(roc_auc_score(y_val, y_proba))
    
    # Tree-level evaluation
    predicted_roots = df.loc[df.groupby(['sentence', 'language'])['proba'].idxmax()]
    true_roots = df[df['is_root'] == 1]
    merged = predicted_roots.merge(true_roots, on=['sentence', 'language'], 
                                   suffixes=('_pred', '_true'))
    tree_accuracy = (merged['node_pred'] == merged['node_true']).mean()
    
    # Print results
    print("Node-Level Metrics:")
    for metric, values in metrics.items():
        print(f"{metric.capitalize()}: {np.mean(values):.4f} ± {np.std(values):.4f}")
    
    print(f"\nTree-Level Accuracy: {tree_accuracy:.4f}")
    
    return metrics, tree_accuracy


In [None]:
# Train all models and store results
all_metrics = {}

for name, model in models.items():
    print(f"\n=== Training {name} ===")
    metrics, tree_acc = enhanced_GKFold_training(model, features_n, df_train)
    all_metrics[name] = {
        "metrics": metrics,
        "tree_accuracy": tree_acc
    }

# Optional: Plot performance
f1_scores = [np.mean(m['metrics']['f1']) for m in all_metrics.values()]
tree_accs = [m['tree_accuracy'] for m in all_metrics.values()]
model_names = list(all_metrics.keys())

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.barh(model_names, f1_scores, color='skyblue')
plt.title("Average F1 Score")
plt.xlabel("F1 Score")
plt.xlim(0, 1)

plt.subplot(1, 2, 2)
plt.barh(model_names, tree_accs, color='orange')
plt.title("Tree-Level Accuracy")
plt.xlabel("Accuracy")
plt.xlim(0, 1)

plt.tight_layout()
plt.show()


=== Training Logistic Regression ===


KeyError: "['deg', 'closeness', 'betweenness', 'n'] not in index"

In [None]:
from sklearn.ensemble import VotingClassifier

ensemble = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(class_weight='balanced')),
        ('rf', RandomForestClassifier(class_weight='balanced')),
        ('xgb', XGBClassifier(scale_pos_weight=...))
    ],
    voting='soft'  # Use probabilities
)

## Logistic Regression


In [None]:
model_lr = LogisticRegression(class_weight='balanced')
# model_lr_adv = LogisticRegression(penalty = 'l2', 
#                                   solver = 'saga',   # Sparse features: have mostly 0-values
#                                   C=1.0, 
#                                   class_weight='balanced',
#                                   max_iter=1000,
#                                   random_state=42)

#features = ['deg', 'closeness', 'betweenness', 'pagerank']
features_n = ['deg', 'closeness', 'betweenness', 'pagerank', 'n']
#features_add = ['deg', 'closeness', 'betweenness', 'pagerank', 'n','eigenvector', 'katz']

#acc_lr, prec_lr, recall_lr, f1_lr = GKFold_training(model_lr, features, df_normalized)
acc_lr, prec_lr, recall_lr, f1_lr = GKFold_training(model_lr, features_n, df_train)
#acc_lr, prec_lr, recall_lr, f1_lr = GKFold_training(model_lr_adv, features_n, df_normalized)

KeyError: "['deg', 'closeness', 'betweenness', 'n'] not in index"