In [16]:
import os
import pandas as pd
import joblib
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Step 1: Load dataset
# Read the CSV and discard every row that contains a missing value, in a single
# expression so `df` is only ever bound to the cleaned frame.
df = pd.read_csv("ai_human_content_detection_dataset.csv").dropna()

# Step 2: Split features and labels
# Target is the "label" column; the feature matrix is everything except the
# two text/category columns and the target itself.
y = df["label"]
X = df.drop(["text_content", "content_type", "label"], axis="columns")

# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3: Create output folder
# exist_ok=True makes this a no-op when the folder is already present, so the
# cell can be re-run safely and there is no gap between an existence check
# and the creation call.
os.makedirs("model_outputs", exist_ok=True)

# Step 4: Train and save XGBoost model
# Booster settings: binary logistic objective, log-loss evaluation metric,
# and a fixed seed.
params = dict(objective="binary:logistic", eval_metric="logloss", seed=42)

# Wrap the training split in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)

# Run 100 boosting rounds, then write the booster to disk as JSON.
xgb_model = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)
xgb_model.save_model("model_outputs/xgboost_model.json")

# Step 5: Train and save Logistic Regression model
# This fit uses the raw (unscaled) training features. The recorded output of
# this cell shows a warning that the iteration limit was reached.
# fit() returns the estimator itself, so `lr` is the fitted model.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Persist the fitted estimator; a later cell writes to this same path.
joblib.dump(lr, "model_outputs/logistic_model.pkl")



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['model_outputs/logistic_model.pkl']

In [18]:
from sklearn.preprocessing import StandardScaler

# Scale the training and testing data
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Retrain Logistic Regression on the standardized training features, with a
# higher iteration cap than the first attempt.
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train_scaled, y_train)

# The model was fit on scaled inputs, so anyone loading it later needs the
# exact same fitted scaler to transform raw features before predicting.
# Persist the scaler next to the model so the saved artifacts are usable
# together.
joblib.dump(scaler, "model_outputs/scaler.pkl")

# Save the model last so the cell's displayed output is unchanged.
joblib.dump(lr, "model_outputs/logistic_model.pkl")

['model_outputs/logistic_model.pkl']