# 🐍 Machine Learning Project Script

In [47]:
import pandas as pd
import ast
import networkx as nx
from sklearn.preprocessing import MinMaxScaler

# 1. Load data

In [48]:
def load_edgelist_csv(path):
    """Read a CSV file and parse its "edgelist" column into Python objects.

    Each cell of the "edgelist" column is converted with
    ``ast.literal_eval``, which evaluates Python literals only (no arbitrary
    code execution).

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents with "edgelist" holding the parsed values.
    """
    frame = pd.read_csv(path, sep=',')
    frame["edgelist"] = frame["edgelist"].apply(ast.literal_eval)
    return frame


df_train_raw = load_edgelist_csv("train.csv")
df_test_raw = load_edgelist_csv("test.csv")

# 2. Pre-Processing


In [None]:
import networkx as nx
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from itertools import combinations
from community import community_louvain  # pip install python-louvain

def normalize_group(df_group):
    """Min-max scale the numeric graph features of one group of rows.

    Parameters
    ----------
    df_group : pandas.DataFrame
        Rows of a single group (``pre_processing`` calls this once per
        (sentence, language) group). The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_group`` in which every column named in
        ``numeric_cols`` that is present has been rescaled with
        ``MinMaxScaler``. All other columns are returned unchanged.
    """
    numeric_cols = [
        'degree', 'closeness', 'betweenness', 'pagerank',
        'eigenvector', 'katz', 'load',
        'eccentricity', 'clustering', 'avg_neighbor_degree',
        'triangles', 'community', 'square_clustering',
        #'shortest_path_length', 'is_leaf', 'neighbor_connectivity'
    ]
    # Scale only the columns that exist in this frame: 'clustering',
    # 'triangles' and 'square_clustering' are not produced by pre_processing,
    # and indexing with a missing column name raises KeyError.
    present_cols = [col for col in numeric_cols if col in df_group.columns]

    # Work on a copy so the object handed in by groupby.apply is not mutated.
    scaled = df_group.copy()
    if present_cols and not scaled.empty:
        scaled[present_cols] = MinMaxScaler().fit_transform(scaled[present_cols])
    return scaled

def _graph_node_metrics(graph, random_state=None):
    """Compute the per-node graph metrics used as features.

    Returns a dict mapping feature name to a ``{node: value}`` dict. The key
    order is the column order of the metric features built by
    ``pre_processing``.
    """
    zeros = dict.fromkeys(graph.nodes, 0.0)

    # Centralities that can fail fall back to all-zero scores instead of
    # aborting the whole pre-processing run.
    try:
        eigenvector = nx.eigenvector_centrality(graph, max_iter=10000, tol=1e-06)
    except nx.PowerIterationFailedConvergence:
        eigenvector = dict(zeros)

    try:
        katz = nx.katz_centrality(graph, alpha=0.1)
    except nx.NetworkXException:
        katz = dict(zeros)

    try:
        load = nx.load_centrality(graph)
    except Exception:  # best-effort fallback; do not swallow KeyboardInterrupt/SystemExit
        load = dict(zeros)

    return {
        # Centrality measures
        "degree": dict(graph.degree()),
        "closeness": nx.closeness_centrality(graph),
        "betweenness": nx.betweenness_centrality(graph),
        "pagerank": nx.pagerank(graph, max_iter=1000),
        "eigenvector": eigenvector,
        "katz": katz,
        "load": load,
        # Structural properties
        "eccentricity": nx.eccentricity(graph),
        "avg_neighbor_degree": nx.average_neighbor_degree(graph),
        # Community information (Louvain); seeded so repeated runs agree
        "community": community_louvain.best_partition(graph, random_state=random_state),
    }


def pre_processing(data, random_state=42):
    """Turn a table of graphs into a table of per-node features.

    For every row of ``data`` an undirected graph is built from the row's
    "edgelist". Rows whose graph is empty or not connected are skipped. For
    each node of the remaining graphs one output row is produced, holding the
    row's "sentence", "language" and "n" values, the node label, the graph
    metrics from ``_graph_node_metrics`` and an "is_leaf" flag (degree 1).
    "id" is copied when the input has that column, and "is_root" is set
    (1 for the node equal to the row's "root", else 0) when the input has a
    non-None "root" value.

    The numeric metrics are then rescaled per (sentence, language) group by
    ``normalize_group``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "edgelist", "sentence", "language" and "n"; "root" and
        "id" are optional.
    random_state : int or None, default 42
        Seed forwarded to the Louvain community detection. ``None`` leaves it
        unseeded.

    Returns
    -------
    pandas.DataFrame
        One row per node. Empty if no input row yielded a connected graph.
    """
    rows = []

    for _, row in data.iterrows():
        # Create undirected graph
        graph = nx.Graph()
        graph.add_edges_from(row["edgelist"])

        # nx.is_connected raises on a graph with no nodes, so test that first.
        if graph.number_of_nodes() == 0 or not nx.is_connected(graph):
            continue

        metrics = _graph_node_metrics(graph, random_state=random_state)
        degree = metrics["degree"]
        root_node = row.get("root", None)
        has_id = "id" in row

        for v in graph.nodes:
            features = {
                "sentence": row["sentence"],
                "language": row["language"],
                "n": row["n"],
                "node": v,
            }
            features.update((name, values[v]) for name, values in metrics.items())
            features["is_leaf"] = 1 if degree[v] == 1 else 0

            if has_id:
                features["id"] = row["id"]

            if root_node is not None:
                features["is_root"] = 1 if v == root_node else 0

            rows.append(features)

    node_table = pd.DataFrame(rows)
    if node_table.empty:
        # Nothing to group or scale; grouping an empty, column-less frame would fail.
        return node_table

    # Normalize features by group
    normalized = node_table.groupby(["sentence", "language"], group_keys=True).apply(
        normalize_group, include_groups=False
    )
    # apply() puts the group keys in front of the original (unnamed) row index;
    # reset_index() turns that unnamed third level into a "level_2" column.
    return normalized.reset_index().drop(columns=["level_2"])

In [66]:
# Build the per-node feature table for the training set (one row per graph node).
df_train = pre_processing(df_train_raw)

# 3. Models

**K-Fold Cross Validation**

In [70]:
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier

def enhanced_training_pipeline(df, features, n_folds=5, average_key="weighted avg"):
    """Cross-validate the candidate classifiers and refit the best one.

    Every model is evaluated with ``StratifiedGroupKFold``. All rows that
    share the same "sentence" and "language" values form one group, so a
    group is never split between the training and validation part of a fold.
    The model with the highest mean F1 is then refitted on all rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "sentence", "language", "is_root" and every column named
        in ``features``. The frame is not modified.
    features : list of str
        Columns used as model inputs.
    n_folds : int, default 5
        Number of cross-validation folds.
    average_key : str, default "weighted avg"
        Row of ``classification_report(..., output_dict=True)`` from which
        precision, recall, F1 and support are read. The default weights each
        class by its support, so the majority class dominates the score. Pass
        a class label as a string (presumably "1" for the root class when
        ``is_root`` is 0/1) to score a single class. An unknown key raises
        ``KeyError``.

    Returns
    -------
    tuple
        ``(results, best_model_name, best_model)``. ``results`` maps each
        model name to its mean "precision", "recall", "f1" and "support" over
        the folds; ``best_model`` is fitted on the full data.
    """
    # Built as a local Series rather than a new column, so the caller's frame
    # is left untouched.
    groups = df["sentence"].astype(str) + '_' + df["language"]
    X = df[features]
    y = df['is_root']

    # Initialize models. The other classifiers imported by this notebook
    # (XGBClassifier, LGBMClassifier, LogisticRegression, SVC, MLPClassifier,
    # DummyClassifier) can be added to this dict to compare them as well.
    models = {
        "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42),
        "Decision Tree": DecisionTreeClassifier(
            class_weight='balanced',
            max_depth=100,
            min_samples_split=10,
            random_state=42
        ),
    }
    results = {}

    for model_name, model in models.items():
        print(f"\n=== Evaluating {model_name} ===")
        metrics = {'precision': [], 'recall': [], 'f1': [], 'support': []}

        cv = StratifiedGroupKFold(n_splits=n_folds)

        for train_idx, val_idx in cv.split(X, y, groups=groups):
            # fit() retrains the estimator from scratch on each fold
            model.fit(X.iloc[train_idx], y.iloc[train_idx])

            # Evaluate
            y_val = y.iloc[val_idx]
            y_pred = model.predict(X.iloc[val_idx])
            summary = classification_report(y_val, y_pred, output_dict=True)[average_key]
            metrics['precision'].append(summary['precision'])
            metrics['recall'].append(summary['recall'])
            metrics['f1'].append(summary['f1-score'])
            metrics['support'].append(summary['support'])

        # Store results (mean over folds)
        results[model_name] = {name: np.mean(values) for name, values in metrics.items()}

        print("\nSimple Evaluation:")
        print(f"Precision: {results[model_name]['precision']:.4f}")
        print(f"Recall:    {results[model_name]['recall']:.4f}")
        print(f"F1 Score:  {results[model_name]['f1']:.4f}")

    # Final model training
    best_model_name = max(results.items(), key=lambda x: x[1]['f1'])[0]
    print(f"\nBest model: {best_model_name}")
    best_model = models[best_model_name]
    best_model.fit(X, y)

    return results, best_model_name, best_model


In [71]:
# === Run pipeline ===

# Identifier, grouping and target columns are kept out of the model inputs.
non_feature_cols = {'id', 'sentence', 'language', 'is_root', 'group_id'}
features = [
    col
    for col in df_train.columns
    if col not in non_feature_cols
]

results, best_model_name, best_model = enhanced_training_pipeline(df_train, features)



=== Evaluating Random Forest ===

Simple Evaluation:
Precision: 0.9252
Recall:    0.9435
F1 Score:  0.9308

=== Evaluating Decision Tree ===

Simple Evaluation:
Precision: 0.9227
Recall:    0.8886
F1 Score:  0.9038

Best model: Random Forest


In [72]:
# Process test data
df_test_processed = pre_processing(df_test_raw.copy())
X_test = df_test_processed[features]

# Predict: score every node, then keep the highest-scoring node of each id.
# Column 1 of predict_proba is the second entry of best_model.classes_
# (presumably is_root == 1 when the labels are 0/1).
df_test_processed['pred_proba'] = best_model.predict_proba(X_test)[:, 1]
predicted_roots = df_test_processed.loc[
    df_test_processed.groupby('id')['pred_proba'].idxmax()
]

# pre_processing skips graphs that are not connected, so such ids would be
# missing from the output file; report them instead of dropping them silently.
missing_ids = sorted(set(df_test_raw['id']) - set(predicted_roots['id']))
if missing_ids:
    print(f"Warning: no prediction for {len(missing_ids)} test id(s), e.g. {missing_ids[:10]}")

# Format output
output_df = predicted_roots[['id', 'node']].rename(columns={'node': 'root'})
output_df = output_df.sort_values('id').reset_index(drop=True)
output_df.to_csv('predicted_roots.csv', index=False)