## Initialization

In [None]:
%pip install h2o matplotlib seaborn

In [None]:
import h2o
# Connect to an H2O cluster (h2o.init starts a local one when none is reachable).
h2o.init()

## Importing and Understanding the CSV

In [None]:
# Load the scientists CSV into an H2OFrame and preview its first rows.
csv_path = "Top_scientists_2023.csv"
data = h2o.import_file(csv_path)
data.head()

In [None]:
# List the column names of the imported frame.
data.columns

In [None]:
# Show H2O's per-column summary of the frame.
data.describe()

## Preprocessing

In [None]:
# Convert every string-typed column into a factor (categorical) column.
# isstring() returns a one-element list for a single-column frame, hence [0].
string_cols = [name for name in data.columns if data[name].isstring()[0]]
for name in string_cols:
    data[name] = data[name].asfactor()

In [None]:
# Standardize every numeric column (centered and scaled) and write the
# standardized values back into `data` column by column.
# NOTE(review): this rescales *all* numeric columns, including any that are
# later thresholded to build labels (e.g. `h23`, if it is numeric) -- confirm
# that downstream thresholds are meant to be in standardized units.
numeric_cols = [name for name in data.columns if data[name].isnumeric()[0]]
normalized = data[numeric_cols].scale(center=True, scale=True)
for name in numeric_cols:
    data[name] = normalized[name]

## Model Creation, Training & Cross-Validation

In [None]:
# Derive the binary target: rows whose `h23` value is at least 5 become "1".
# NOTE(review): the preprocessing cell standardizes numeric columns before
# this point, so if `h23` is numeric the cutoff of 5 is applied to scaled
# values rather than raw ones -- confirm this is intended.
impact_flag = data["h23"] >= 5
data["high_impact"] = impact_flag.asfactor()
# Display the class counts of the new target.
data["high_impact"].table()

In [None]:
# Split the frame into one sub-frame per target class so each class can be
# partitioned separately afterwards.
label_text = data["high_impact"].ascharacter()
class_true = data[label_text == "1"]
class_false = data[label_text == "0"]
print("TRUE total:", class_true.nrows)
print("FALSE total:", class_false.nrows)

In [None]:
# Per-class split into train / validation / test: 0.7 of each class goes to
# train, and the remainder is halved into validation and test. The same seed
# is used for every split so the partition is reproducible.
split_seed = 42
true_train, true_rest = class_true.split_frame(ratios=[0.7], seed=split_seed)
false_train, false_rest = class_false.split_frame(ratios=[0.7], seed=split_seed)
true_valid, true_test = true_rest.split_frame(ratios=[0.5], seed=split_seed)
false_valid, false_test = false_rest.split_frame(ratios=[0.5], seed=split_seed)

# Stack the positive and negative parts of each partition (positives first).
train, valid, test = (
    positives.rbind(negatives)
    for positives, negatives in (
        (true_train, false_train),
        (true_valid, false_valid),
        (true_test, false_test),
    )
)

# Display the class counts that ended up in the training partition.
train["high_impact"].table()

In [None]:
# Feature columns passed to the model as `x`.
predictors = ["sm-subfield-1-frac", "sm-subfield-2-frac", "sm-field-frac"]

In [None]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# Cross-validation options: 5 folds, keeping the per-fold models, their
# hold-out predictions and the fold assignment for later inspection.
cv_settings = {
    "nfolds": 5,
    "keep_cross_validation_models": True,
    "keep_cross_validation_predictions": True,
    "keep_cross_validation_fold_assignment": True,
}

# Binomial GLM with lambda search enabled and a fixed seed.
glm_cv = H2OGeneralizedLinearEstimator(
    family="binomial",
    lambda_search=True,
    seed=42,
    **cv_settings,
)

# Fit on the training partition only; no validation frame is supplied here.
glm_cv.train(x=predictors, y="high_impact", training_frame=train)

In [None]:
# Fetch the cross-validation performance metrics (xval=True) and display them.
perf = glm_cv.model_performance(xval=True)
perf

In [None]:
# Report the cross-validated AUC, then display the matching confusion matrix.
cv_auc = perf.auc()
print("AUC (CV):", cv_auc)
perf.confusion_matrix()

In [None]:
# Score the held-out test partition with the trained model.
pred = glm_cv.predict(test)

# Pull the true labels and the predicted labels out of H2O as flat arrays
# for use by the plotting cells.
actual_df = test["high_impact"].as_data_frame()
predicted_df = pred["predict"].as_data_frame()
actual = actual_df.values.flatten()
predicted = predicted_df.values.flatten()

## Plots

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion matrix of test-set labels vs. predicted labels, drawn as a heatmap.
class_names = ["Low Impact", "High Impact"]
cm = confusion_matrix(actual, predicted)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Flatten the `p1` prediction column into a 1-D array; it is used as the score
# for the ROC curve (presumably the probability of class "1" -- confirm).
probs = pred["p1"].as_data_frame().values.flatten()

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve of the test-set scores, with its area under the curve.
fpr, tpr, _ = roc_curve(actual, probs)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
# Dashed diagonal as the reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

## Predictions (Final Output)

In [None]:
# Persist the trained model, overwriting any existing file (force=True).
# NOTE(review): absolute, machine-specific directory -- adjust when running
# this notebook elsewhere.
model_dir = r"C:\users\matei\Desktop\Semester 4\PBDAS\Projects\GLM"
model_path = h2o.save_model(model=glm_cv, path=model_dir, force=True)
print("Model saved at:", model_path)

In [None]:
# Put the true label next to the model's prediction columns and export the
# combined frame as CSV, overwriting any existing file (force=True).
# NOTE(review): absolute, machine-specific path.
pred_df = pred.cbind(test["high_impact"])
predictions_csv = r"C:\users\matei\Desktop\Semester 4\PBDAS\Projects\GLM\predictions.csv"
h2o.export_file(pred_df, path=predictions_csv, force=True)

In [None]:
# Export the processed frame (`data`, as modified by the earlier cells) as
# CSV, overwriting any existing file (force=True).
# NOTE(review): absolute, machine-specific path.
processed_csv = r"C:\users\matei\Desktop\Semester 4\PBDAS\Projects\GLM\processed_dataset.csv"
h2o.export_file(data, path=processed_csv, force=True)