In [32]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

from catboost import CatBoostClassifier
import joblib


In [33]:
base_path = "/home/rguktongole/Downloads/Major_pro/AI-VS-Human-main/Datasets/"

train_files = [
    "train_drcat_04.csv",
    "train_v2_drcat_02.csv",
    "train_essays_RDizzl3_seven_v1.csv",
    "train_drcat_01.csv"
]

test_file = "test_essays.csv"

def fix_dataframe(df):
    df = df.rename(columns={"text": "text", "label": "label", "generated": "label"})
    df["label"] = df["label"].astype(int)
    return df[["text", "label"]]

dfs = []
for f in train_files:
    df_temp = pd.read_csv(base_path + f)
    dfs.append(fix_dataframe(df_temp))
    print("Loaded:", f)

train_df = pd.concat(dfs).drop_duplicates("text").reset_index(drop=True)
test_df = pd.read_csv(base_path + test_file)[["id", "text"]]

print("\nTraining Samples:", len(train_df))
print(train_df["label"].value_counts())


Loaded: train_drcat_04.csv
Loaded: train_v2_drcat_02.csv
Loaded: train_essays_RDizzl3_seven_v1.csv
Loaded: train_drcat_01.csv

Training Samples: 56233
label
0    38411
1    17822
Name: count, dtype: int64


In [34]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=50000)

X_tfidf = vectorizer.fit_transform(train_df["text"])
y = train_df["label"]

X_tfidf_test = vectorizer.transform(test_df["text"])


In [35]:
cat = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    loss_function="Logloss",
    text_features=[0],  # column index of the text
    random_seed=42,
    verbose=200
)

X_cat = train_df[["text"]]
X_cat_test = test_df[["text"]]

# Train on raw text
cat.fit(X_cat, y)


0:	learn: 0.6004722	total: 248ms	remaining: 2m 3s
200:	learn: 0.0284563	total: 40.4s	remaining: 1m
400:	learn: 0.0190655	total: 1m 20s	remaining: 19.8s
499:	learn: 0.0164680	total: 1m 39s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fc0e3879460>

In [36]:
lr = LogisticRegression(max_iter=500, random_state=42)

xgb = XGBClassifier(
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=42
)

lr.fit(X_tfidf, y)
xgb.fit(X_tfidf, y)


Parameters: { "use_label_encoder" } are not used.



In [37]:
class HybridModel:
    def __init__(self, cat, xgb, lr, vectorizer):
        self.cat = cat
        self.xgb = xgb
        self.lr = lr
        self.vectorizer = vectorizer

    def predict_proba(self, texts):
        # Prepare features
        tfidf_features = self.vectorizer.transform(texts["text"])
        cat_features = texts[["text"]]
        
        # Predictions
        p_cat = self.cat.predict_proba(cat_features)[:,1]
        p_xgb = self.xgb.predict_proba(tfidf_features)[:,1]
        p_lr  = self.lr.predict_proba(tfidf_features)[:,1]

        # Weighted ensemble
        final = (p_cat * 0.4) + (p_xgb * 0.3) + (p_lr * 0.3)
        return final


In [38]:
hybrid = HybridModel(cat, xgb, lr, vectorizer)

train_pred = hybrid.predict_proba(train_df[["text"]])
print("Train ROC-AUC:", roc_auc_score(y, train_pred))


Train ROC-AUC: 0.9999378214507922


In [39]:
test_pred = hybrid.predict_proba(test_df[["text"]])

submission = pd.DataFrame({
    "id": test_df["id"],
    "generated": test_pred
})

submission.to_csv(base_path + "submission_catboost_ensemble.csv", index=False)
print("Saved submission!")


Saved submission!


In [41]:
joblib.dump(cat, "model/catboost_raw_text.pkl")
joblib.dump(xgb, "model/xgb_tfidf.pkl")
joblib.dump(lr, "model/logreg_tfidf.pkl")
joblib.dump(vectorizer, "model/tfidf_vectorizer1.pkl")
joblib.dump(hybrid, "model/hybrid_model.pkl")

print("All models saved successfully!")


All models saved successfully!
