# 🐍 Machine Learning Project Script

In [58]:
import pandas as pd
import ast
import networkx as nx
from sklearn.preprocessing import MinMaxScaler

# 1. Upload data

In [59]:
df_train_raw = pd.read_csv("train.csv", sep=',')
df_train_raw["edgelist"].head(1)
df_train_raw["edgelist"] = df_train_raw["edgelist"].apply(ast.literal_eval)

# 2. Pre-Processing


In [62]:
import networkx as nx
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from itertools import combinations
from community import community_louvain  # pip install python-louvain

def normalize_group(df_group):
    numeric_cols = [
        'deg', 'degree', 'closeness', 'betweenness', 'pagerank',
        'eigenvector', 'katz', 'harmonic', 'load',
        'eccentricity', 'clustering', 'avg_neighbor_degree',
        'triangles', 'community', 'square_clustering',
        #'shortest_path_length', 'is_leaf', 'neighbor_connectivity'
    ]
    scaler = MinMaxScaler()
    df_group[numeric_cols] = scaler.fit_transform(df_group[numeric_cols])
    return df_group

def pre_processing(data):
    training_data = []

    for idx, row in data.iterrows():
        edgelist = row["edgelist"]
        
        # Create undirected graph
        T = nx.Graph()
        T.add_edges_from(edgelist)

        if not nx.is_connected(T):
            continue
        
        # New root detection without directionality
        deg_centrality = nx.degree_centrality(T)
        #root = max(deg_centrality.items(), key=lambda x: x[1])[0]  # Most central node
        
        # Compute centralities
        closeness = nx.closeness_centrality(T)
        betweenness = nx.betweenness_centrality(T)
        pagerank = nx.pagerank(T, max_iter=1000)
        
        # Additional centrality measures with fallbacks
        try:
            eigenvector = nx.eigenvector_centrality(T, max_iter=10000, tol=1e-06)
        except nx.PowerIterationFailedConvergence:
            eigenvector = {n: 0.0 for n in T.nodes}
            
        try:
            katz = nx.katz_centrality(T, alpha=0.1)
        except nx.NetworkXException:
            katz = {n: 0.0 for n in T.nodes}
            
        harmonic = nx.harmonic_centrality(T)
        try:
            load = nx.load_centrality(T)
        except:
            load = {n: 0.0 for n in T.nodes}

        # Structural properties
        degree = dict(T.degree())
        eccentricity = nx.eccentricity(T)
        clustering = nx.clustering(T)
        avg_neighbor_degree = nx.average_neighbor_degree(T)
        triangles = nx.triangles(T)
        square_clustering = nx.square_clustering(T)
        
        # Community detection
        partition = community_louvain.best_partition(T)
        
        # New features
        #shortest_path_length = nx.shortest_path_length(T, root)
        
        for v in T.nodes:
            features = {
                "sentence": row["sentence"],
                "language": row["language"],
                "n": row["n"],
                "node": v,
                # Centrality measures
                "deg": deg_centrality[v],
                "degree": degree[v],
                "closeness": closeness[v],
                "betweenness": betweenness[v],
                "pagerank": pagerank[v],
                "eigenvector": eigenvector[v],
                "katz": katz[v],
                "harmonic": harmonic[v],
                "load": load[v],
                # Structural properties
                "eccentricity": eccentricity[v],
                "clustering": clustering[v],
                "avg_neighbor_degree": avg_neighbor_degree[v],
                "triangles": triangles[v],
                "square_clustering": square_clustering[v],
                # Community information
                "community": partition[v],
                # New features
                #"shortest_path_length": shortest_path_length[v],
                #"is_leaf": 1 if T.degree(v) == 1 else 0,
                #"neighbor_connectivity": sum(1 for u in T.neighbors(v) for _ in nx.common_neighbors(T, v, u)),
                # Target variable
                #"is_root": 1 if v == root else 0
            }
            training_data.append(features)

    training_data = pd.DataFrame(training_data)
    
    # Normalize features by group
    df_normalized = training_data.groupby(["sentence", "language"], group_keys=True).apply(
        normalize_group, include_groups=False
    )
    df_normalized.reset_index(inplace=True)
    df_normalized.drop(columns=["level_2"], inplace=True)

    return df_normalized

In [63]:
df_train = pre_processing(df_train_raw)

# 3. Models

**K-Fold Cross Validation**

In [None]:
from sklearn.model_selection import StratifiedGroupKFold
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import precision_recall_fscore_support, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from imblearn.ensemble import BalancedRandomForestClassifier

def enhanced_training_pipeline(df, features, n_folds=5):
    df['group_id'] = df["sentence"].astype(str) + '_' + df["language"]
    X = df[features]
    y = df['is_root']
    groups = df["group_id"]
    
    # Initialize models
    models = {
        "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
        "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42),
        "Balanced RF": BalancedRandomForestClassifier(random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    }
    
    results = {}
    
    for model_name, model in models.items():
        print(f"\n=== Evaluating {model_name} ===")
        fold_metrics = {'acc': [], 'prec': [], 'rec': [], 'f1': []}
        simple_metrics = {'precision': [], 'recall': [], 'f1': [], 'support': []}
        
        cv = StratifiedGroupKFold(n_splits=n_folds)
        
        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups=groups)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            # Handle class imbalance
            if model_name not in ["XGBoost", "Balanced RF"]:
                smote = SMOTE(sampling_strategy='minority', random_state=42)
                X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
            else:
                X_train_res, y_train_res = X_train, y_train
                if model_name == "XGBoost":
                    pos = y_train.sum()
                    neg = len(y_train) - pos
                    model.set_params(scale_pos_weight=neg/pos if pos > 0 else 1)
            
            # Feature selection for non-tree models
            if model_name in ["Logistic Regression"]:
                selector = SelectFromModel(
                    estimator=LogisticRegression(max_iter=1000, class_weight='balanced'),
                    max_features=15
                ).fit(X_train_res, y_train_res)
                X_train_fs = selector.transform(X_train_res)
                X_val_fs = selector.transform(X_val)
            else:
                X_train_fs, X_val_fs = X_train_res, X_val
            
            # Fit model
            model.fit(X_train_fs, y_train_res)
            
            # ===== Original Evaluation =====
            #val_df = df.iloc[val_idx].copy()
            #val_df['proba'] = model.predict_proba(X_val_fs)[:, 1]
            
            #predicted_roots = val_df.loc[val_df.groupby(['sentence', 'language'])['proba'].idxmax()]
            #true_roots = val_df[val_df['is_root'] == 1]
            
            #merged = predicted_roots.merge(
            #    true_roots, 
            #    on=['sentence', 'language'], 
            #    suffixes=('_pred', '_true')
            #)
            #
            #correct = merged['node_pred'] == merged['node_true']
            #acc = correct.mean()
            #prec, rec, f1, _ = precision_recall_fscore_support(
            #    correct, [True]*len(correct), average='binary'
            #)
            
            #fold_metrics['acc'].append(acc)
            #fold_metrics['prec'].append(prec)
            #fold_metrics['rec'].append(rec)
            #fold_metrics['f1'].append(f1)
            
            # ===== Simple Evaluation =====
            y_pred = model.predict(X_val_fs)
            report = classification_report(y_val, y_pred, output_dict=True)
            simple_metrics['precision'].append(report['weighted avg']['precision'])
            simple_metrics['recall'].append(report['weighted avg']['recall'])
            simple_metrics['f1'].append(report['weighted avg']['f1-score'])
            simple_metrics['support'].append(report['weighted avg']['support'])
        
        # Store results
        results[model_name] = {
            'simple': {
                'precision': np.mean(simple_metrics['precision']),
                'recall': np.mean(simple_metrics['recall']),
                'f1': np.mean(simple_metrics['f1']),
                'support': np.mean(simple_metrics['support'])
            }
        }
        
        print(f"\nSimple Evaluation:")
        print(f"Precision: {results[model_name]['simple']['precision']:.4f}")
        print(f"Recall: {results[model_name]['simple']['recall']:.4f}")
        print(f"F1: {results[model_name]['simple']['f1']:.4f}")
    
    best_model_name = max(
        [(name, res['simple']['f1']) for name, res in results.items()],
        key=lambda x: x[1]
    )[0]
    
    print(f"\nBest model: {best_model_name}")
    
    best_model = models[best_model_name]
    
    if best_model_name not in ["XGBoost", "Balanced RF"]:
        smote = SMOTE(sampling_strategy='minority', random_state=42)
        X_res, y_res = smote.fit_resample(X, y)
        best_model.fit(X_res, y_res)
    else:
        best_model.fit(X, y)
    
    return results, best_model_name, best_model

In [None]:
# === Run pipeline ===

features = [col for col in df_train.columns if col not in ['sentence', 'language', 'node', 'n', 'is_root', 'group_id']]

results, best_model_name, best_model = enhanced_training_pipeline(df_train, features)




=== Evaluating Logistic Regression ===

Simple Evaluation:
Precision: 0.9772
Recall: 0.9605
F1: 0.9654

=== Evaluating Random Forest ===

Simple Evaluation:
Precision: 0.9804
Recall: 0.9759
F1: 0.9774

=== Evaluating Balanced RF ===

Simple Evaluation:
Precision: 0.9783
Recall: 0.9641
F1: 0.9682

=== Evaluating XGBoost ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Simple Evaluation:
Precision: 0.9798
Recall: 0.9713
F1: 0.9738

=== Evaluating Gradient Boosting ===

Simple Evaluation:
Precision: 0.9788
Recall: 0.9662
F1: 0.9698

Best model: Random Forest
