# 🐍 Machine Learning Project Script

In [1]:
import pandas as pd
import ast
import networkx as nx
from sklearn.preprocessing import MinMaxScaler

# 1. Upload data

In [2]:
df_train_raw = pd.read_csv("train.csv", sep=',')
df_train_raw["edgelist"].head(1)
df_train_raw["edgelist"] = df_train_raw["edgelist"].apply(ast.literal_eval)

df_test_raw = pd.read_csv("test.csv", sep=',')
df_test_raw["edgelist"].head(1)
df_test_raw["edgelist"] = df_test_raw["edgelist"].apply(ast.literal_eval)

# 2. Pre-Processing


In [3]:
import networkx as nx
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from itertools import combinations
from community import community_louvain  # pip install python-louvain

def normalize_group(df_group):
    numeric_cols = [
        'degree', 'closeness', 'betweenness', 'pagerank',
        'eigenvector', 'katz', 'load',
        'eccentricity', 'avg_neighbor_degree',
         'community', 'is_leaf'
        #'shortest_path_length', 'is_leaf', 'neighbor_connectivity'
    ]
    scaler = MinMaxScaler()
    df_group[numeric_cols] = scaler.fit_transform(df_group[numeric_cols])
    return df_group

def pre_processing(data):
    training_data = []

    for idx, row in data.iterrows():
        edgelist = row["edgelist"]
        
        # Create undirected graph
        T = nx.Graph()
        T.add_edges_from(edgelist)

        if not nx.is_connected(T):
            continue
        
        root_node = row.get("root", None)
        
        # Compute centralities
        closeness = nx.closeness_centrality(T)
        betweenness = nx.betweenness_centrality(T)
        pagerank = nx.pagerank(T, max_iter=1000)
        
        # Additional centrality measures with fallbacks
        try:
            eigenvector = nx.eigenvector_centrality(T, max_iter=10000, tol=1e-06)
        except nx.PowerIterationFailedConvergence:
            eigenvector = {n: 0.0 for n in T.nodes}
            
        try:
            katz = nx.katz_centrality(T, alpha=0.1)
        except nx.NetworkXException:
            katz = {n: 0.0 for n in T.nodes}
            
        try:
            load = nx.load_centrality(T)
        except:
            load = {n: 0.0 for n in T.nodes}

        # Structural properties
        degree = dict(T.degree())
        eccentricity = nx.eccentricity(T)
        avg_neighbor_degree = nx.average_neighbor_degree(T)
        
        # Community detection
        partition = community_louvain.best_partition(T)
        
        for v in T.nodes:
            features = {
                "sentence": row["sentence"],
                "language": row["language"],
                "n": row["n"],
                "node": v,

                # Centrality measures
                "degree": degree[v],
                "closeness": closeness[v],
                "betweenness": betweenness[v],
                "pagerank": pagerank[v],
                "eigenvector": eigenvector[v],
                "katz": katz[v],
                "load": load[v],

                # Structural properties
                "eccentricity": eccentricity[v],
                "avg_neighbor_degree": avg_neighbor_degree[v],

                # Community information
                "community": partition[v],

                "is_leaf": 1 if T.degree(v) == 1 else 0,
            }

            if "id" in row:
                features["id"] = row["id"]

            if root_node is not None:
                features["is_root"] = 1 if v == root_node else 0

            training_data.append(features)

    training_data = pd.DataFrame(training_data)
    
    # Normalize features by group
    df_normalized = training_data.groupby(["sentence", "language"], group_keys=True).apply(
        normalize_group, include_groups=False
    )
    df_normalized.reset_index(inplace=True)
    df_normalized.drop(columns=["level_2"], inplace=True)

    return df_normalized

In [4]:
df_train = pre_processing(df_train_raw)

df_train

Unnamed: 0,sentence,language,n,node,degree,closeness,betweenness,pagerank,eigenvector,katz,load,eccentricity,avg_neighbor_degree,community,is_leaf,is_root
0,2,Arabic,21,10,1.00,0.730183,0.724771,0.932971,0.990346,0.996388,0.724771,0.571429,0.555556,0.000000,0.0,1
1,2,Arabic,21,8,1.00,0.908084,0.990826,0.891309,1.000000,1.000000,0.990826,0.428571,0.555556,0.666667,0.0,0
2,2,Arabic,21,5,0.50,0.598665,0.174312,0.518343,0.547862,0.477072,0.174312,0.571429,0.333333,0.666667,0.0,0
3,2,Arabic,21,13,0.00,0.356589,0.000000,0.042182,0.236520,0.004953,0.000000,0.714286,0.333333,0.666667,1.0,0
4,2,Arabic,21,6,0.50,0.976170,0.908257,0.415764,0.703950,0.567473,0.908257,0.285714,1.000000,0.666667,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197474,995,Turkish,16,14,0.25,0.356543,0.200000,0.277926,0.417002,0.302740,0.200000,0.666667,0.411765,0.666667,0.0,0
197475,995,Turkish,16,10,0.00,0.061625,0.000000,0.029047,0.103774,0.004653,0.000000,1.000000,0.117647,0.666667,1.0,0
197476,995,Turkish,16,2,0.50,1.000000,0.885714,0.467346,0.842910,0.614483,0.885714,0.000000,0.509804,0.000000,0.0,0
197477,995,Turkish,16,1,0.00,0.304498,0.000000,0.007093,0.335177,0.074379,0.000000,0.666667,1.000000,0.666667,1.0,0


# 3. Models

**K-Fold Cross Validation**

In [None]:
from sklearn.model_selection import StratifiedGroupKFold, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

def f1_weighted(y_true, y_pred):
    from sklearn.metrics import classification_report
    report = classification_report(y_true, y_pred, output_dict=True)
    return report['weighted avg']['f1-score']

def grid_search_models(df, features, n_folds=5):
    """
    Perform grid search on three models and return best parameters for each.
    
    Args:
        df: DataFrame containing the data
        features: List of feature columns to use
        n_folds: Number of cross-validation folds
        
    Returns:
        Dictionary with model names as keys and their best parameters as values
    """
    df['group_id'] = df["sentence"].astype(str) + '_' + df["language"]
    X = df[features]
    y = df['is_root']
    groups = df["group_id"]
    
    # Define parameter grids for each model
    param_grids = {
        "Random Forest": {
            'classifier__max_depth': [10, 20, 30],
            'classifier__min_samples_split': [5, 10, 15],
            'classifier__min_samples_leaf': [2, 5, 10],
            'classifier__n_estimators': [50, 100],
            'classifier__class_weight': ['balanced']
        },
        "Decision Tree": {
            'classifier__max_depth': [50, 100, None],
            'classifier__min_samples_split': [5, 10, 20],
            'classifier__class_weight': ['balanced']
        }
        # "XGB Classifier": {
        #     'classifier__max_depth': [3, 4, 5],
        #     'classifier__learning_rate': [0.01, 0.1],
        #     'classifier__subsample': [0.7, 0.8],
        #     'classifier__n_estimators': [50, 100],
        #     'classifier__reg_alpha': [0, 0.5],
        #     'classifier__reg_lambda': [0.5, 1]
        # }
    }
    
    # Base models
    base_models = {
        "Random Forest": RandomForestClassifier(random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42)
        # "XGB Classifier": XGBClassifier(random_state=42, eval_metric='logloss')
    }
    
    best_params = {}
    scorer = make_scorer(f1_weighted, greater_is_better=True)

    for model_name, model in base_models.items():
        print(f"\n=== Grid Search for {model_name} ===")
        
        # Create pipeline
        pipeline = Pipeline([
            ('feature_selector', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))),
            ('classifier', model)
        ])
        
        # Custom CV that preserves groups
        cv = StratifiedGroupKFold(n_splits=n_folds)
        
        # Grid search
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grids[model_name],
            cv=cv,
            scoring=scorer,
            n_jobs=-1,
            verbose=1
        )
        
        # Fit grid search
        grid_search.fit(X, y, groups=groups)
        
        # Store best parameters
        best_params[model_name] = grid_search.best_params_
        print(f"Best parameters: {grid_search.best_params_}")
    
    return best_params

# Example usage:
# best_params = grid_search_models(df, features)
# print("\nBest parameters for all models:")
# for model, params in best_params.items():
#     print(f"{model}: {params}")

Grid Search optimizado

In [8]:
from sklearn.model_selection import StratifiedGroupKFold, GridSearchCV
from sklearn.metrics import make_scorer, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

def f1_weighted(y_true, y_pred):
    report = classification_report(y_true, y_pred, output_dict=True)
    return report['weighted avg']['f1-score']

def grid_search_models(df, features, n_folds=5):
    """
    Fast grid search for Random Forest and Decision Tree on large datasets.

    Args:
        df: DataFrame with the dataset
        features: List of feature column names
        n_folds: Number of folds for cross-validation

    Returns:
        Dictionary with best parameters for each model
    """
    df['group_id'] = df["sentence"].astype(str) + '_' + df["language"]
    X = df[features]
    y = df['is_root']
    groups = df["group_id"]

    # Simplified hyperparameter grids
    param_grids = {
        "Random Forest": {
            'classifier__max_depth': [10, 20],
            'classifier__min_samples_split': [5, 10],
            'classifier__n_estimators': [50],
            'classifier__class_weight': ['balanced']
        },
        "Decision Tree": {
            'classifier__max_depth': [50, None],
            'classifier__min_samples_split': [5, 10],
            'classifier__class_weight': ['balanced']
        }
    }

    # Base models
    base_models = {
        "Random Forest": RandomForestClassifier(random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42)
    }

    best_params = {}
    scorer = make_scorer(f1_weighted, greater_is_better=True)

    for model_name, model in base_models.items():
        print(f"\n=== Grid Search for {model_name} ===")

        # Pipeline (sin SelectFromModel para acelerar)
        pipeline = Pipeline([
            ('classifier', model)
        ])

        cv = StratifiedGroupKFold(n_splits=n_folds)

        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grids[model_name],
            cv=cv,
            scoring=scorer,
            n_jobs=-1,  # Usa todos los núcleos disponibles
            verbose=1
        )

        grid_search.fit(X, y, groups=groups)

        best_params[model_name] = grid_search.best_params_
        print(f"Best parameters: {grid_search.best_params_}")

    return best_params


In [9]:
# === Run pipeline ===

features = [col for col in df_train.columns if col not in ['id', 'sentence', 'language', 'is_root', 'group_id']]

best_params = grid_search_models(df_train, features)

print("\nBest parameters for all models:")
for model, params in best_params.items():
     print(f"{model}: {params}")



=== Grid Search for Random Forest ===
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters: {'classifier__class_weight': 'balanced', 'classifier__max_depth': 20, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}

=== Grid Search for Decision Tree ===
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters: {'classifier__class_weight': 'balanced', 'classifier__max_depth': None, 'classifier__min_samples_split': 5}

Best parameters for all models:
Random Forest: {'classifier__class_weight': 'balanced', 'classifier__max_depth': 20, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}
Decision Tree: {'classifier__class_weight': 'balanced', 'classifier__max_depth': None, 'classifier__min_samples_split': 5}


Evaluate different models

In [13]:
comparisons_rf = [
    {"max_depth": 10, "min_samples_split": 5, "n_estimators": 50, "class_weight": "balanced"},  # árbol más pequeño
    {"max_depth": 20, "min_samples_split": 10, "n_estimators": 50, "class_weight": "balanced"}, # más regularizado
    {"max_depth": 20, "min_samples_split": 5, "n_estimators": 100, "class_weight": "balanced"}, # más árboles
    {"max_depth": None, "min_samples_split": 5, "n_estimators": 50, "class_weight": "balanced"}, # sin límite de profundidad
]


comparisons_dt = [
    {"max_depth": 50, "min_samples_split": 5, "class_weight": "balanced"},   # más limitado que el óptimo
    {"max_depth": None, "min_samples_split": 10, "class_weight": "balanced"}, # menos sobreajuste
    {"max_depth": 20, "min_samples_split": 5, "class_weight": "balanced"},    # muy limitado
]


from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

def evaluate_models(df, features, comparisons_rf, comparisons_dt):
    df['group_id'] = df["sentence"].astype(str) + '_' + df["language"]
    X = df[features]
    y = df['is_root']
    groups = df["group_id"]

    results = []
    cv = StratifiedGroupKFold(n_splits=10)

    def evaluate_model(name, model, config):
        f1s = []
        for train_idx, test_idx in cv.split(X, y, groups):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            report = classification_report(y_test, y_pred, output_dict=True)
            f1 = report['weighted avg']['f1-score']
            f1s.append(f1)

        avg_f1 = sum(f1s) / len(f1s)
        results.append({
            "Model": name,
            "Params": config,
            "F1_score": round(avg_f1, 4)
        })

    # Evaluate Random Forest configs
    for cfg in comparisons_rf:
        model = RandomForestClassifier(
            max_depth=cfg["max_depth"],
            min_samples_split=cfg["min_samples_split"],
            n_estimators=cfg["n_estimators"],
            class_weight=cfg["class_weight"],
            random_state=42,
            n_jobs=-1
        )
        evaluate_model("Random Forest", model, cfg)

    # Evaluate Decision Tree configs
    for cfg in comparisons_dt:
        model = DecisionTreeClassifier(
            max_depth=cfg["max_depth"],
            min_samples_split=cfg["min_samples_split"],
            class_weight=cfg["class_weight"],
            random_state=42
        )
        evaluate_model("Decision Tree", model, cfg)

    return results


In [14]:
results = evaluate_models(df_train, features, comparisons_rf, comparisons_dt)
import pandas as pd
pd.DataFrame(results).sort_values(by="F1_score", ascending=False)


Unnamed: 0,Model,Params,F1_score
3,Random Forest,"{'max_depth': None, 'min_samples_split': 5, 'n...",0.9323
2,Random Forest,"{'max_depth': 20, 'min_samples_split': 5, 'n_e...",0.9234
1,Random Forest,"{'max_depth': 20, 'min_samples_split': 10, 'n_...",0.9216
4,Decision Tree,"{'max_depth': 50, 'min_samples_split': 5, 'cla...",0.9141
5,Decision Tree,"{'max_depth': None, 'min_samples_split': 10, '...",0.9046
6,Decision Tree,"{'max_depth': 20, 'min_samples_split': 5, 'cla...",0.8877
0,Random Forest,"{'max_depth': 10, 'min_samples_split': 5, 'n_e...",0.8608


To check if it overfits

In [15]:
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np

def evaluate_models(df, features, comparisons_rf, comparisons_dt):
    df['group_id'] = df["sentence"].astype(str) + '_' + df["language"]
    X = df[features]
    y = df['is_root']
    groups = df["group_id"]

    results = []
    cv = StratifiedGroupKFold(n_splits=10)

    def evaluate_model(name, model, config):
        train_scores = []
        val_scores = []

        for train_idx, val_idx in cv.split(X, y, groups):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            model.fit(X_train, y_train)

            y_train_pred = model.predict(X_train)
            y_val_pred = model.predict(X_val)

            f1_train = f1_score(y_train, y_train_pred, average='weighted')
            f1_val = f1_score(y_val, y_val_pred, average='weighted')

            train_scores.append(f1_train)
            val_scores.append(f1_val)

        avg_train = np.mean(train_scores)
        avg_val = np.mean(val_scores)
        std_train = np.std(train_scores)
        std_val = np.std(val_scores)
        avg_diff = np.mean(np.array(train_scores) - np.array(val_scores))

        results.append({
            "Model": name,
            "Params": config,
            "Train_F1_mean": round(avg_train, 4),
            "Train_F1_std": round(std_train, 4),
            "Val_F1_mean": round(avg_val, 4),
            "Val_F1_std": round(std_val, 4),
            "Gap_Train-Val": round(avg_diff, 4)
        })

    # Evaluate Random Forest configs
    for cfg in comparisons_rf:
        model = RandomForestClassifier(
            max_depth=cfg["max_depth"],
            min_samples_split=cfg["min_samples_split"],
            n_estimators=cfg["n_estimators"],
            class_weight=cfg["class_weight"],
            random_state=42,
            n_jobs=-1
        )
        evaluate_model("Random Forest", model, cfg)

    # Evaluate Decision Tree configs
    for cfg in comparisons_dt:
        model = DecisionTreeClassifier(
            max_depth=cfg["max_depth"],
            min_samples_split=cfg["min_samples_split"],
            class_weight=cfg["class_weight"],
            random_state=42
        )
        evaluate_model("Decision Tree", model, cfg)

    return results


In [None]:
results = evaluate_models(df_train, features, comparisons_rf, comparisons_dt)
import pandas as pd
pd.DataFrame(results).sort_values(by="Val_F1_mean", ascending=False)


KeyError: 'F1_score'