# 🐍 Machine Learning Project Script

In [2]:
import pandas as pd
import ast
import networkx as nx
from sklearn.preprocessing import MinMaxScaler

# 1. Load data

In [3]:
def _load_split(csv_path):
    """Read one dataset split from CSV and parse its ``edgelist`` column.

    The ``edgelist`` column is read as text and converted with
    ``ast.literal_eval``, which only evaluates Python literals (no arbitrary
    code), so each cell becomes the Python object it spells out.

    Args:
        csv_path: Path of the comma-separated file to read.

    Returns:
        A DataFrame with the parsed ``edgelist`` column.
    """
    split = pd.read_csv(csv_path, sep=',')
    split["edgelist"] = split["edgelist"].apply(ast.literal_eval)
    return split


# Both splits go through exactly the same parsing.
df_train_raw = _load_split("train.csv")
df_test_raw = _load_split("test.csv")

# 2. Pre-Processing


In [4]:
import networkx as nx
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from itertools import combinations
from community import community_louvain  # pip install python-louvain

def normalize_group(df_group):
    """Min-max scale the graph features of a single group.

    A fresh ``MinMaxScaler`` is fitted on ``df_group`` only, so every feature
    is rescaled relative to the other rows of the same group.

    Args:
        df_group: DataFrame holding (at least) the feature columns listed in
            the body.

    Returns:
        The same ``df_group`` object; the scaled columns are overwritten on it.
    """
    centrality_cols = [
        'degree', 'closeness', 'betweenness', 'pagerank',
        'eigenvector', 'katz', 'load',
    ]
    structural_cols = ['eccentricity', 'avg_neighbor_degree', 'community', 'is_leaf']
    scaled_cols = centrality_cols + structural_cols

    df_group[scaled_cols] = MinMaxScaler().fit_transform(df_group[scaled_cols])
    return df_group

def _graph_measures(T, random_state=None):
    """Compute all per-node measures of one connected graph, keyed by feature name."""
    # Iterative measures may fail to converge; fall back to zeros so a single
    # difficult graph does not abort the whole preprocessing run.
    try:
        eigenvector = nx.eigenvector_centrality(T, max_iter=10000, tol=1e-06)
    except nx.PowerIterationFailedConvergence:
        eigenvector = dict.fromkeys(T.nodes, 0.0)

    try:
        katz = nx.katz_centrality(T, alpha=0.1)
    except nx.NetworkXException:
        katz = dict.fromkeys(T.nodes, 0.0)

    try:
        load = nx.load_centrality(T)
    except Exception:
        # Still best-effort, but no longer a bare ``except`` that would also
        # swallow KeyboardInterrupt / SystemExit.
        load = dict.fromkeys(T.nodes, 0.0)

    degree = dict(T.degree())

    # Insertion order here defines the column order of the output frame.
    return {
        "degree": degree,
        "closeness": nx.closeness_centrality(T),
        "betweenness": nx.betweenness_centrality(T),
        "pagerank": nx.pagerank(T, max_iter=1000),
        "eigenvector": eigenvector,
        "katz": katz,
        "load": load,
        "eccentricity": nx.eccentricity(T),
        "avg_neighbor_degree": nx.average_neighbor_degree(T),
        # Louvain is randomised; seeding it makes the labels reproducible.
        "community": community_louvain.best_partition(T, random_state=random_state),
        "is_leaf": {node: int(deg == 1) for node, deg in degree.items()},
    }


def pre_processing(data, random_state=42):
    """Turn a table of graphs into a table of normalised per-node features.

    For every row of ``data`` an undirected graph is built from ``edgelist``
    and one output row is produced per node, carrying centrality measures,
    structural properties and a Louvain community label. The numeric features
    are then min-max scaled within each ``(sentence, language)`` group by
    ``normalize_group``.

    Args:
        data: DataFrame with the columns ``edgelist``, ``sentence``,
            ``language`` and ``n``. ``root`` and ``id`` are optional: when
            ``root`` is present an ``is_root`` 0/1 column is produced, and
            ``id`` is copied through when present.
        random_state: Seed forwarded to the Louvain community detection so
            that repeated runs produce the same ``community`` feature. Pass
            ``None`` to keep unseeded behaviour.

    Returns:
        A DataFrame with one row per node and a fresh 0..N-1 index. An empty
        DataFrame is returned when no graph yields any node.

    Notes:
        Rows whose graph is empty or not connected are skipped silently, so
        the output may cover fewer sentences than ``data``.
    """
    records = []

    for _, row in data.iterrows():
        # Create undirected graph
        T = nx.Graph()
        T.add_edges_from(row["edgelist"])

        # nx.is_connected raises on a graph without nodes, so test that first.
        if T.number_of_nodes() == 0 or not nx.is_connected(T):
            continue

        root_node = row.get("root", None)
        measures = _graph_measures(T, random_state=random_state)

        for v in T.nodes:
            features = {
                "sentence": row["sentence"],
                "language": row["language"],
                "n": row["n"],
                "node": v,
            }
            for name, per_node in measures.items():
                features[name] = per_node[v]

            if "id" in row:
                features["id"] = row["id"]

            if root_node is not None:
                features["is_root"] = 1 if v == root_node else 0

            records.append(features)

    if not records:
        # Nothing to group or normalise.
        return pd.DataFrame()

    per_node = pd.DataFrame(records)

    # Normalize features by group
    normalized = per_node.groupby(["sentence", "language"], group_keys=True).apply(
        normalize_group, include_groups=False
    )

    # Bring the group keys back as leading columns and discard the leftover
    # per-row index level instead of relying on its auto-generated name.
    return normalized.reset_index(level=["sentence", "language"]).reset_index(drop=True)

In [5]:
# Build the per-node feature table for the training split, then display it.
df_train = pre_processing(data=df_train_raw)
df_train

Unnamed: 0,sentence,language,n,node,degree,closeness,betweenness,pagerank,eigenvector,katz,load,eccentricity,avg_neighbor_degree,community,is_leaf,is_root
0,2,Arabic,21,10,1.00,0.730183,0.724771,0.932971,0.990346,0.996388,0.724771,0.571429,0.555556,0.000000,0.0,1
1,2,Arabic,21,8,1.00,0.908084,0.990826,0.891309,1.000000,1.000000,0.990826,0.428571,0.555556,0.666667,0.0,0
2,2,Arabic,21,5,0.50,0.598665,0.174312,0.518343,0.547862,0.477072,0.174312,0.571429,0.333333,0.666667,0.0,0
3,2,Arabic,21,13,0.00,0.356589,0.000000,0.042182,0.236520,0.004953,0.000000,0.714286,0.333333,0.666667,1.0,0
4,2,Arabic,21,6,0.50,0.976170,0.908257,0.415764,0.703950,0.567473,0.908257,0.285714,1.000000,0.666667,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197474,995,Turkish,16,14,0.25,0.356543,0.200000,0.277926,0.417002,0.302740,0.200000,0.666667,0.411765,0.000000,0.0,0
197475,995,Turkish,16,10,0.00,0.061625,0.000000,0.029047,0.103774,0.004653,0.000000,1.000000,0.117647,0.000000,1.0,0
197476,995,Turkish,16,2,0.50,1.000000,0.885714,0.467346,0.842910,0.614483,0.885714,0.000000,0.509804,0.666667,0.0,0
197477,995,Turkish,16,1,0.00,0.304498,0.000000,0.007093,0.335177,0.074379,0.000000,0.666667,1.000000,0.000000,1.0,0


# 3. Models

**K-Fold Cross Validation**

Evaluate different models

In [6]:
# Random-forest grid, given as (max_depth, min_samples_split, n_estimators):
# a shallower forest, a more regularised one, one with more trees and one
# without a depth limit.
comparisons_rf = [
    {"max_depth": depth, "min_samples_split": split, "n_estimators": trees, "class_weight": "balanced"}
    for depth, split, trees in [(10, 5, 50), (20, 10, 50), (20, 5, 100), (None, 5, 50)]
]

# Decision-tree grid, given as (max_depth, min_samples_split).
comparisons_dt = [
    {"max_depth": depth, "min_samples_split": split, "class_weight": "balanced"}
    for depth, split in [(50, 5), (None, 10), (20, 5)]
]

# Logistic-regression grid: four saga configurations (strong L2, default L2,
# L1 and an L1/L2 elastic-net mix) followed by three lbfgs L2 configurations
# with increasing C (i.e. decreasing regularisation strength).
comparisons_logreg_large = [
    {"penalty": penalty, "C": c, "solver": "saga", **extra,
     "class_weight": "balanced", "max_iter": 1000, "n_jobs": -1}
    for penalty, c, extra in [
        ("l2", 0.1, {}),
        ("l2", 1.0, {}),
        ("l1", 1.0, {}),
        ("elasticnet", 1.0, {"l1_ratio": 0.5}),
    ]
] + [
    {"penalty": "l2", "C": c, "solver": "lbfgs", "class_weight": "balanced"}
    for c in (0.1, 1.0, 10.0)
]


# Model inputs: every column of df_train except identifiers, grouping keys
# and the target, kept in their original column order.
_excluded_cols = {'id', 'sentence', 'language', 'is_root', 'group_id'}
features = [name for name in df_train.columns if name not in _excluded_cols]

Compare the train and validation scores to check whether each model overfits.

He visto que en RandomForestClassifier se especifica n_estimators = n, que indica el número de árboles de decisión que quieres tener en tu random forest. El problema es que, cuando cada árbol coge los datos que le das, no respeta el group_id, así que puede ser que nodos de la misma frase estén en diferentes árboles de decisión. Por eso he creado la clase de random forest desde cero, teniendo en cuenta este group_id a la hora de hacer el bootstrap.

In [None]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from collections import defaultdict

# Custom bagging ensemble whose bootstrap respects the groups
class GroupAwareRandomForest:
    """Ensemble of decision trees whose bootstrap draws whole groups, not rows.

    Each tree is trained on the rows of a with-replacement sample of group
    ids, so all rows of a group enter (or miss) a tree's training set
    together. Prediction is a majority vote over the trees and assumes binary
    0/1 labels.

    Args:
        n_estimators: Number of trees to train.
        max_depth: Forwarded to every ``DecisionTreeClassifier``.
        min_samples_split: Forwarded to every ``DecisionTreeClassifier``.
        class_weight: Forwarded to every ``DecisionTreeClassifier``.
        random_state: Seed of the group bootstrap; also forwarded unchanged
            to every tree.
        group_sample_frac: Number of group draws per tree, as a fraction of
            the number of distinct groups.
    """

    def __init__(self, n_estimators=50, max_depth=None, min_samples_split=2, class_weight=None,
                 random_state=42, group_sample_frac=0.8):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.class_weight = class_weight
        self.random_state = random_state
        self.group_sample_frac = group_sample_frac
        self.trees = []

    def fit(self, X, y, group_ids):
        """Train ``n_estimators`` trees, each on a bootstrap sample of groups.

        Args:
            X: Feature table; rows are selected with ``X.iloc``.
            y: Labels aligned with ``X``; rows are selected with ``y.iloc``.
            group_ids: One group id per row, in the same order as ``X``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If ``group_ids`` is empty.
        """
        # A private generator draws the same sequence as seeding the global
        # NumPy RNG would, without changing global random state for other code.
        rng = np.random.RandomState(self.random_state)
        self.trees = []

        unique_groups = np.unique(group_ids)
        if len(unique_groups) == 0:
            raise ValueError("group_ids is empty; cannot draw a bootstrap sample of groups.")

        # Map every group id to the positional indices of its rows.
        group_to_indices = defaultdict(list)
        for position, group in enumerate(group_ids):
            group_to_indices[group].append(position)

        # Always draw at least one group so a tiny input cannot produce an
        # empty training sample.
        n_draws = max(1, int(self.group_sample_frac * len(unique_groups)))

        for _ in range(self.n_estimators):
            sampled_groups = rng.choice(unique_groups, size=n_draws, replace=True)
            sampled_indices = np.array(
                [position for group in sampled_groups for position in group_to_indices[group]]
            )

            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                class_weight=self.class_weight,
                random_state=self.random_state
            )
            tree.fit(X.iloc[sampled_indices], y.iloc[sampled_indices])
            self.trees.append(tree)

        return self

    def predict(self, X):
        """Return the majority vote of the trees as an array of 0/1 ints.

        A row is predicted 1 only when strictly more than half of the trees
        vote 1; an exact tie yields 0.

        Raises:
            ValueError: If called before ``fit``.
        """
        if not self.trees:
            raise ValueError("GroupAwareRandomForest is not fitted yet; call fit() before predict().")

        votes = np.zeros((len(self.trees), len(X)))
        for i, tree in enumerate(self.trees):
            votes[i] = tree.predict(X)
        return (votes.mean(axis=0) > 0.5).astype(int)

# General evaluation routine
def evaluate_models(df, features, comparisons_rf, comparisons_dt, comparisons_group_rf=None,
                    comparisons_logreg=None):
    """Cross-validate several model families and summarise train/validation F1.

    Every configuration is evaluated with a 10-fold ``StratifiedGroupKFold``
    where a group is one ``sentence``/``language`` pair, so all rows of a
    group fall into the same fold.

    Args:
        df: DataFrame with the ``features`` columns plus ``sentence``,
            ``language`` and the target ``is_root``. A ``group_id`` column is
            added to (or overwritten on) this frame as a side effect.
        features: Names of the columns used as model inputs.
        comparisons_rf: Configurations for ``RandomForestClassifier`` (keys
            ``max_depth``, ``min_samples_split``, ``n_estimators``,
            ``class_weight``).
        comparisons_dt: Configurations for ``DecisionTreeClassifier`` (keys
            ``max_depth``, ``min_samples_split``, ``class_weight``).
        comparisons_group_rf: Optional configurations for
            ``GroupAwareRandomForest`` (same keys as ``comparisons_rf``);
            skipped when ``None``.
        comparisons_logreg: Optional configurations for ``LogisticRegression``
            (keys ``penalty``, ``solver``, ``C`` and optionally ``l1_ratio``,
            ``class_weight``, ``max_iter``). When ``None`` the module-level
            ``comparisons_logreg_large`` list is used.

    Returns:
        A list with one dict per evaluated configuration holding the model
        name, its parameters, mean and standard deviation of the weighted F1
        on train and validation folds, and the mean train-minus-validation gap.
    """
    if comparisons_logreg is None:
        # Resolved up front so a missing global fails before the long run,
        # not after it.
        comparisons_logreg = comparisons_logreg_large

    df['group_id'] = df["sentence"].astype(str) + '_' + df["language"]
    X = df[features]
    y = df['is_root']
    groups = df["group_id"]

    results = []
    cv = StratifiedGroupKFold(n_splits=10)

    def evaluate_model(name, model, config, use_group_fit=False):
        """Fit ``model`` on every fold and append its summary row to ``results``."""
        train_scores = []
        val_scores = []

        for train_idx, val_idx in cv.split(X, y, groups):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            if use_group_fit:
                # The group-aware forest needs the group ids for its bootstrap.
                model.fit(X_train, y_train, groups.iloc[train_idx])
            else:
                model.fit(X_train, y_train)

            # NOTE(review): 'weighted' F1 weights each class by its support;
            # confirm this is the intended metric if is_root is imbalanced.
            train_scores.append(f1_score(y_train, model.predict(X_train), average='weighted'))
            val_scores.append(f1_score(y_val, model.predict(X_val), average='weighted'))

        train_scores = np.array(train_scores)
        val_scores = np.array(val_scores)

        results.append({
            "Model": name,
            "Params": config,
            "Train_F1_mean": round(np.mean(train_scores), 4),
            "Train_F1_std": round(np.std(train_scores), 4),
            "Val_F1_mean": round(np.mean(val_scores), 4),
            "Val_F1_std": round(np.std(val_scores), 4),
            "Gap_Train-Val": round(np.mean(train_scores - val_scores), 4)
        })

    # Standard random forest
    for cfg in comparisons_rf:
        model = RandomForestClassifier(
            max_depth=cfg["max_depth"],
            min_samples_split=cfg["min_samples_split"],
            n_estimators=cfg["n_estimators"],
            class_weight=cfg["class_weight"],
            random_state=42,
            n_jobs=-1
        )
        evaluate_model("Random Forest", model, cfg)

    # Standard decision tree
    for cfg in comparisons_dt:
        model = DecisionTreeClassifier(
            max_depth=cfg["max_depth"],
            min_samples_split=cfg["min_samples_split"],
            class_weight=cfg["class_weight"],
            random_state=42
        )
        evaluate_model("Decision Tree", model, cfg)

    # Random forest whose bootstrap respects group_id
    if comparisons_group_rf is not None:
        for cfg in comparisons_group_rf:
            model = GroupAwareRandomForest(
                n_estimators=cfg["n_estimators"],
                max_depth=cfg["max_depth"],
                min_samples_split=cfg["min_samples_split"],
                class_weight=cfg["class_weight"],
                random_state=42
            )
            evaluate_model("GroupAware Random Forest", model, cfg, use_group_fit=True)

    # Logistic regression
    for cfg in comparisons_logreg:
        # l1_ratio is forwarded only when configured: the elastic-net penalty
        # needs it, while the other penalties are left on the library default.
        optional = {"l1_ratio": cfg["l1_ratio"]} if "l1_ratio" in cfg else {}
        model = LogisticRegression(
            penalty=cfg["penalty"],
            solver=cfg["solver"],
            C=cfg["C"],
            class_weight=cfg.get("class_weight", "balanced"),
            max_iter=cfg.get("max_iter", 1000),
            random_state=42,  # fixed seed, consistent with the tree-based models
            **optional
        )
        evaluate_model("Logistic Regression", model, cfg)

    return results


In [9]:
# Run the full comparison; the random-forest grid is reused for the
# group-aware forest so both are evaluated on identical settings.
results = evaluate_models(
    df=df_train,
    features=features,
    comparisons_rf=comparisons_rf,
    comparisons_dt=comparisons_dt,
    comparisons_group_rf=comparisons_rf,
)


KeyboardInterrupt: 

In [22]:
# Persist the cross-validation summary, best validation F1 first.
ranking = pd.DataFrame(results).sort_values(by="Val_F1_mean", ascending=False)
ranking.to_csv("best_parameters.csv")

In [23]:
# Display the configurations ordered by mean validation F1 (best first).
(
    pd.DataFrame(results)
    .sort_values(by="Val_F1_mean", ascending=False)
)


Unnamed: 0,Model,Params,Train_F1_mean,Train_F1_std,Val_F1_mean,Val_F1_std,Gap_Train-Val
3,Random Forest,"{'max_depth': None, 'min_samples_split': 5, 'n...",0.9901,0.0002,0.9323,0.0011,0.0577
10,GroupAware Random Forest,"{'max_depth': None, 'min_samples_split': 5, 'n...",0.9892,0.0002,0.9311,0.0008,0.0581
2,Random Forest,"{'max_depth': 20, 'min_samples_split': 5, 'n_e...",0.9576,0.0008,0.9234,0.0016,0.0341
1,Random Forest,"{'max_depth': 20, 'min_samples_split': 10, 'n_...",0.9524,0.0007,0.9216,0.0016,0.0307
9,GroupAware Random Forest,"{'max_depth': 20, 'min_samples_split': 5, 'n_e...",0.9512,0.0008,0.9205,0.0012,0.0307
8,GroupAware Random Forest,"{'max_depth': 20, 'min_samples_split': 10, 'n_...",0.9483,0.0009,0.9191,0.0015,0.0291
4,Decision Tree,"{'max_depth': 50, 'min_samples_split': 5, 'cla...",0.981,0.0002,0.9141,0.0018,0.0669
5,Decision Tree,"{'max_depth': None, 'min_samples_split': 10, '...",0.9601,0.0003,0.9046,0.0018,0.0555
6,Decision Tree,"{'max_depth': 20, 'min_samples_split': 5, 'cla...",0.9226,0.0024,0.8877,0.0024,0.0349
0,Random Forest,"{'max_depth': 10, 'min_samples_split': 5, 'n_e...",0.8666,0.001,0.8608,0.0023,0.0059
