# 🐍 Machine Learning Project Script

In [2]:
import pandas as pd
import ast
import networkx as nx
from sklearn.preprocessing import MinMaxScaler

# 1. Upload data

In [3]:
df_train_raw = pd.read_csv("train.csv", sep=',')
df_train_raw["edgelist"].head(1)
df_train_raw["edgelist"] = df_train_raw["edgelist"].apply(ast.literal_eval)

# 2. Pre-Processing


In [92]:

# Función para normalizar por grupo
def normalize_group(df_group):
    #centrality_cols = ['deg', 'closeness', 'betweenness', 'pagerank', 'eigenvector', 'katz']
    centrality_cols = ['deg', 'closeness', 'betweenness', 'pagerank']
    scaler = MinMaxScaler()
    df_group[centrality_cols] = scaler.fit_transform(df_group[centrality_cols])
    return df_group

def pre_processing(data):
    # Container for the final dataset
    training_data = []

    # Iterate over each sentence
    for idx, row in data.iterrows():
        edgelist = row["edgelist"]
        
        # Build tree
        T = nx.from_edgelist(edgelist)
        
        # Skip disconnected graphs
        if not nx.is_connected(T):
            continue

        # Assume root is the node that is never a child
        children = set(v for _, v in edgelist)
        parents = set(v for v, _ in edgelist)
        root_candidates = list(parents - children)
        root = root_candidates[0] if root_candidates else list(T.nodes)[0]  # fallback

        # Compute centralities
        deg_centrality = nx.degree_centrality(T)
        closeness = nx.closeness_centrality(T)
        betweenness = nx.betweenness_centrality(T)
        pagerank = nx.pagerank(T, max_iter=1000)
        eigenvector = nx.eigenvector_centrality(T, max_iter=10000, tol=1e-06)
        katz = nx.katz_centrality(T, alpha=0.1)
        
        # Generate a row for each vertex
        for v in T.nodes:
            # features = {
            #     "sentence": row["sentence"],
            #     "language": row["language"],
            #     "n": row["n"],
            #     "node": v,
            #     "deg": deg_centrality[v],
            #     "closeness": closeness[v],
            #     "betweenness": betweenness[v],
            #     "pagerank": pagerank[v],
            #     "eigenvector" : eigenvector[v],
            #     "katz" : katz[v],
            #     "is_root": 1 if v == root else 0
            # }
            features = {
                "sentence": row["sentence"],
                "language": row["language"],
                "n": row["n"],
                "node": v,
                "deg": deg_centrality[v],
                "closeness": closeness[v],
                "betweenness": betweenness[v],
                "pagerank": pagerank[v],
                "is_root": 1 if v == root else 0
            }
            training_data.append(features)
        
    training_data = pd.DataFrame(training_data)

    df_normalized = training_data.groupby(["sentence", "language"], group_keys=True).apply(normalize_group, include_groups=False)
    df_normalized.reset_index(inplace=True)
    df_normalized.drop(columns=["level_2"], inplace=True)
    return df_normalized


In [93]:
df_train = pre_processing(df_train_raw)

# 3. Models

**K-Fold Cross Validation**

In [66]:
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=RuntimeWarning)

GKFolf_training allowing multiple roots per sentence (we don't want that)

In [None]:
def GKFold_training(model, features, df, n = 10):
    gkf = GroupKFold(n_splits=n)

    X = df[features]
    Y = df['is_root']

    df['group_id'] = df["sentence"].astype(str) + '_' + df["language"]

    accs, precs, recalls, f1s = [], [], [], []

    for fold, (train_idx, val_idx) in enumerate(gkf.split(X, Y, groups = df["group_id"])):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

        
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)

        accs.append(accuracy_score(y_val, y_pred))
        precs.append(precision_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
        f1s.append(f1_score(y_val, y_pred))
    
    print(f"Average Accuracy:  {np.mean(accs):.4f}")
    print(f"Average Precision: {np.mean(precs):.4f}")
    print(f"Average Recall:    {np.mean(recalls):.4f}")
    print(f"Average F1 Score:  {np.mean(f1s):.4f}")
    
    return np.mean(accs), np.mean(precs), np.mean(recalls), np.mean(f1s)
    


GKFolf_training not allowing multiple roots per sentence

In [96]:
def GKFold_training(model, features, df, n=10):
    gkf = GroupKFold(n_splits=n)

    X = df[features]
    Y = df['is_root']
    df['group_id'] = df["sentence"].astype(str) + '_' + df["language"]

    accs, precs, recalls, f1s = [], [], [], []

    for fold, (train_idx, val_idx) in enumerate(gkf.split(X, Y, groups=df["group_id"])):
        # Training
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

        model.fit(X_train, y_train)

        # Predict probabilities
        val_df = df.iloc[val_idx].copy()
        val_df['proba'] = model.predict_proba(X_val)[:, 1]

        # Select node with highest proba per (sentence, language)
        predicted_roots = val_df.loc[val_df.groupby(['sentence', 'language'])['proba'].idxmax()]

        # True root per tree = where is_root == 1
        true_roots = val_df[val_df['is_root'] == 1]
        merged = predicted_roots.merge(true_roots, on=['sentence', 'language'], suffixes=('_pred', '_true'))

        # Compare predicted node to true root
        correct = merged['node_pred'] == merged['node_true']
        acc = correct.mean()

        # Classification metrics at sentence level
        precision = precision_score(correct, [True] * len(correct))
        recall = recall_score(correct, [True] * len(correct))
        f1 = f1_score(correct, [True] * len(correct))

        accs.append(acc)
        precs.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    print(f"Average Accuracy:  {np.mean(accs):.4f}")
    print(f"Average Precision: {np.mean(precs):.4f}")
    print(f"Average Recall:    {np.mean(recalls):.4f}")
    print(f"Average F1 Score:  {np.mean(f1s):.4f}")

    return np.mean(accs), np.mean(precs), np.mean(recalls), np.mean(f1s)


## Logistic Regression


In [97]:
model_lr = LogisticRegression(class_weight='balanced')
# model_lr_adv = LogisticRegression(penalty = 'l2', 
#                                   solver = 'saga',   # Sparse features: have mostly 0-values
#                                   C=1.0, 
#                                   class_weight='balanced',
#                                   max_iter=1000,
#                                   random_state=42)

#features = ['deg', 'closeness', 'betweenness', 'pagerank']
features_n = ['deg', 'closeness', 'betweenness', 'pagerank', 'n']
#features_add = ['deg', 'closeness', 'betweenness', 'pagerank', 'n','eigenvector', 'katz']

#acc_lr, prec_lr, recall_lr, f1_lr = GKFold_training(model_lr, features, df_normalized)
acc_lr, prec_lr, recall_lr, f1_lr = GKFold_training(model_lr, features_n, df_train)
#acc_lr, prec_lr, recall_lr, f1_lr = GKFold_training(model_lr_adv, features_n, df_normalized)

Average Accuracy:  0.2695
Average Precision: 0.2695
Average Recall:    1.0000
Average F1 Score:  0.4244
