<a href="https://colab.research.google.com/github/vengottip/CS598_practical_statistical_learning/blob/main/CS598_PSL_Project3_xgboost_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import os
import pandas as pd
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

In [11]:
!pip install lime



In [12]:
!pip install nltk



In [13]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lime.lime_text import LimeTextExplainer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
import numpy as np
from sklearn.impute import SimpleImputer

In [14]:
# Fetch the NLTK data packages this notebook relies on: the 'punkt' tokenizer
# data (for word_tokenize) and the 'stopwords' corpus (for the English stop-word list).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Also fetch 'punkt_tab'. Presumably needed by word_tokenize on newer NLTK
# releases -- TODO confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Register the NLTK data directory on NLTK's search path.
# NOTE: '/root/nltk_data' is environment-specific; update it to a valid path
# when running somewhere else.
# The membership test keeps repeated runs of this cell from adding duplicates.
if '/root/nltk_data' not in nltk.data.path:
    nltk.data.path.append('/root/nltk_data')

In [16]:
# Ensure the NLTK resources needed for tokenization and stop-word removal are
# present, downloading only the ones that cannot be found locally.
# 'punkt_tab' is checked as well because this notebook downloads it for the
# tokenizer; if a download fails NLTK reports it without raising.
for resource_path, package in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        print(f"Downloading NLTK '{package}' resource...")
        download(package)

# English stop words, stored as a set so membership checks are cheap.
stop_words = set(stopwords.words('english'))

In [17]:
def remove_stopwords(text):
    """
    Return ``text`` with stop words removed.

    The text is split with ``word_tokenize``; every token whose lowercase form
    is in the module-level ``stop_words`` set is dropped, and the surviving
    tokens are joined back together with single spaces.

    Parameters:
        text (str): Raw text to clean.

    Returns:
        str: Space-joined tokens that are not stop words.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [18]:
def select_important_features(X_train, y_train, alpha=0.0001):
    """
    Pick the features that receive a non-zero coefficient from a Lasso fit.

    The features are standardized with ``StandardScaler`` first, then a
    ``Lasso`` model with the given ``alpha`` is fitted on the standardized
    matrix. The caller's ``X_train`` is not modified.

    Parameters:
        X_train (array): Training features.
        y_train (array): Training labels.
        alpha (float): Regularization strength for Lasso.

    Returns:
        array: Indices of the features whose fitted coefficient is non-zero.
    """
    # Put every feature on the same scale before fitting.
    standardized = StandardScaler().fit_transform(X_train)

    # Fit Lasso and keep the positions of the coefficients that were not zeroed out.
    fitted = Lasso(alpha=alpha).fit(standardized, y_train)
    nonzero_mask = fitted.coef_ != 0
    selected_features = np.where(nonzero_mask)[0]

    print(f"Number of features selected: {len(selected_features)}")
    return selected_features

In [19]:
def preprocess_text_column(df, column_name):
    """
    Strip stop words from every entry of one text column.

    Each value of ``df[column_name]`` is passed through ``remove_stopwords``
    and the column is overwritten with the results. The DataFrame passed in is
    modified in place and is also returned.

    Parameters:
        df (DataFrame): Input DataFrame.
        column_name (str): Name of the text column to preprocess.

    Returns:
        DataFrame: The same DataFrame object, with the column replaced.
    """
    cleaned_column = df[column_name].apply(remove_stopwords)
    df[column_name] = cleaned_column
    return df

In [20]:
def train_and_evaluate_xgboost_with_split_number(
    train_file, test_file, test_y_file, split_number, model_save_path="xgboost_model"
):
    """
    Train and evaluate an XGBoost model and save it with a suffix for the split number.

    Pipeline: read the three CSVs, strip stop words from the "review" column,
    take the feature columns by position, mean-impute missing values, drop
    near-constant columns, keep the columns chosen by Lasso, fit the
    classifier, and score it with ROC AUC on the test set.

    Files written:
        selected_features_{split_number}.npy: Lasso-selected column indices.
            These index the matrix that remains AFTER the low-variance filter,
            not the original CSV columns.
        {model_save_path}_{split_number}.json: The fitted XGBoost model.
        X_test_{split_number}.npy: The final test feature matrix.
        test_reviews_{split_number}.npy: The stop-word-stripped test reviews.

    Parameters:
        train_file (str): Path to the training CSV file.
        test_file (str): Path to the test CSV file.
        test_y_file (str): Path to the test labels CSV file.
        split_number (int): The split number to use as a suffix for saving the model.
        model_save_path (str): Base path for saving the model (suffix added automatically).

    Returns:
        float: AUC score of the model.

    Raises:
        ValueError: If the test file and the test label file have different
            row counts, if Lasso selects no features, or if the final test
            matrix and the test reviews disagree in length.
    """
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)
    test_y = pd.read_csv(test_y_file)

    # Fail fast: a row-count mismatch would otherwise only surface when the AUC
    # is computed, i.e. after the (long) training run has already completed.
    if len(test_data) != len(test_y):
        raise ValueError(
            f"Mismatch between test rows ({len(test_data)}) and test labels ({len(test_y)})."
        )

    # Preprocess review text to remove stop words
    train_data = preprocess_text_column(train_data, "review")
    test_data = preprocess_text_column(test_data, "review")

    # Extract features and labels. Columns are taken by position:
    # assumes train starts with 'id', 'sentiment', 'review' and test starts
    # with 'id', 'review' -- TODO confirm against the CSV layout.
    X_train = train_data.iloc[:, 3:].values
    y_train = train_data['sentiment'].values
    X_test = test_data.iloc[:, 2:].values
    y_test = test_y['sentiment'].values

    # Impute missing values; the means come from the training data only.
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Remove low-variance features. The mask is derived from the training data
    # once and applied to both matrices so their columns stay aligned.
    keep_mask = np.var(X_train, axis=0) > 1e-5
    X_train = X_train[:, keep_mask]
    X_test = X_test[:, keep_mask]

    # Feature selection using Lasso
    selected_features = select_important_features(X_train, y_train, alpha=0.0001)
    if len(selected_features) == 0:
        raise ValueError("No features were selected by Lasso. Adjust the regularization strength or check the dataset.")
    X_train = X_train[:, selected_features]
    X_test = X_test[:, selected_features]

    # Save selected features indices for later use.
    # NOTE: the indices refer to the variance-filtered matrix above.
    selected_features_file = f"selected_features_{split_number}.npy"
    np.save(selected_features_file, selected_features)
    print(f"Selected features saved to {selected_features_file}")

    # Train the XGBoost classifier
    model = XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        n_estimators=2300,
        learning_rate=0.02,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.6,
        reg_lambda=0.1,
        reg_alpha=0.01,
        gamma=0,
        scale_pos_weight=3,
        random_state=42
    )

    model.fit(X_train, y_train)

    # Save the model with a suffix for the split number
    model_file_path = f"{model_save_path}_{split_number}.json"
    model.save_model(model_file_path)
    print(f"Model saved to {model_file_path}")

    # Predict probabilities (positive-class column) and calculate AUC
    y_prob = model.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_prob)
    print(f"AUC Score: {auc_score:.5f}")
    print(f"Number of selected features: {len(selected_features)}")

    # Save additional data needed for interpretation. Raise on inconsistency
    # rather than returning None, which callers formatting the score as a
    # float cannot handle.
    if X_test.shape[0] != len(test_data["review"]):
        raise ValueError("Mismatch between X_test and test reviews.")
    np.save(f"X_test_{split_number}.npy", X_test)
    # Presumably an object array of strings, in which case loading it back
    # needs np.load(..., allow_pickle=True) -- verify where it is consumed.
    np.save(f"test_reviews_{split_number}.npy", test_data["review"].values)
    return auc_score


In [23]:
# Sanity check: reload the selected-feature indices saved for split 1.
# NOTE: this requires selected_features_1.npy to exist already; it is written
# by train_and_evaluate_xgboost_with_split_number when split_number is 1.
test_loaded_features = np.load("selected_features_1.npy")
print(test_loaded_features)


[   0    1    2 ... 1530 1531 1532]


In [9]:
import os
import time
import platform
import psutil

# Snapshot of the host machine: OS name and release, processor string, and
# total memory in GB (bytes / 1024**3, rounded to two decimals).
specs = dict(
    OS=f"{platform.system()} {platform.release()}",
    Processor=platform.processor(),
    RAM=f"{round(psutil.virtual_memory().total / 1024**3, 2)} GB",
)
print(specs)

{'OS': 'Linux 6.1.85+', 'Processor': 'x86_64', 'RAM': '83.48 GB'}


In [21]:
import os
import time
import platform
import psutil

def get_system_specs():
    """
    Collect basic information about the host machine.

    Returns:
        dict: Three string entries -- "OS" (platform name and release joined
        by a space), "Processor" (value of ``platform.processor()``) and
        "RAM" (total memory as bytes / 1024**3, rounded to two decimals and
        suffixed with " GB").
    """
    total_ram_gb = round(psutil.virtual_memory().total / 1024**3, 2)
    return {
        "OS": f"{platform.system()} {platform.release()}",
        "Processor": platform.processor(),
        "RAM": f"{total_ram_gb} GB",
    }

def evaluate_all_splits_dynamic(base_dir=".", model_save_path="xgboost_model", num_splits=5):
    """
    Evaluate and save XGBoost models for all splits in the provided directory.

    For each split ``k`` in ``1..num_splits`` the files ``train.csv``,
    ``test.csv`` and ``test_y.csv`` are read from ``<base_dir>/split_<k>`` and
    handed to ``train_and_evaluate_xgboost_with_split_number``. System
    specifications, per-split timings, per-split AUCs and their average are
    printed.

    Parameters:
        base_dir (str): Base directory containing split folders (e.g., split_1, split_2, etc.).
        model_save_path (str): Base name for saving models (suffix with split number will be added).
        num_splits (int): Number of split folders to process, numbered from 1.
            Defaults to 5.

    Returns:
        list: AUC scores for all splits, in split order. An entry is ``None``
        if the per-split routine returned no score.
    """
    system_specs = get_system_specs()
    print("\nSystem Specifications:")
    for key, value in system_specs.items():
        print(f"{key}: {value}")

    # perf_counter is a monotonic clock, so elapsed times cannot be distorted
    # by system clock adjustments during the long run.
    total_start_time = time.perf_counter()
    auc_scores = []

    for split_number in range(1, num_splits + 1):
        split_dir = os.path.join(base_dir, f"split_{split_number}")
        train_file = os.path.join(split_dir, "train.csv")
        test_file = os.path.join(split_dir, "test.csv")
        test_y_file = os.path.join(split_dir, "test_y.csv")

        print(f"\nEvaluating Split {split_number}...")

        # Measure time for the current split
        split_start_time = time.perf_counter()

        # Train and evaluate the model, save results
        auc = train_and_evaluate_xgboost_with_split_number(
            train_file=train_file,
            test_file=test_file,
            test_y_file=test_y_file,
            split_number=split_number,
            model_save_path=model_save_path
        )
        split_execution_time = time.perf_counter() - split_start_time

        print(f"Execution Time for Split {split_number}: {split_execution_time:.2f} seconds")
        auc_scores.append(auc)

    total_execution_time = time.perf_counter() - total_start_time

    print("\nAUC Scores for all splits:")
    for i, auc in enumerate(auc_scores, 1):
        # A missing score must not crash the summary after hours of training.
        if auc is None:
            print(f"Split {i}: AUC unavailable")
        else:
            print(f"Split {i}: AUC = {auc:.5f}")

    # Average only over the splits that actually produced a score.
    valid_scores = [score for score in auc_scores if score is not None]
    if not valid_scores:
        print("\nAverage AUC across all splits: unavailable (no split produced a score)")
    elif len(valid_scores) == len(auc_scores):
        avg_auc = sum(valid_scores) / len(valid_scores)
        print(f"\nAverage AUC across all splits: {avg_auc:.5f}")
    else:
        avg_auc = sum(valid_scores) / len(valid_scores)
        print(f"\nAverage AUC across {len(valid_scores)} of {len(auc_scores)} splits: {avg_auc:.5f}")
    print(f"\nTotal Execution Time: {total_execution_time:.2f} seconds")

    return auc_scores


In [22]:
# Run evaluation across all splits.
# The __main__ guard means the (long) run only starts when this code is
# executed directly, not when it is imported from another module.
if __name__ == "__main__":
    base_directory = "."  # Replace with your base directory containing split folders
    evaluate_all_splits_dynamic(base_dir=base_directory)


System Specifications:
OS: Linux 6.1.85+
Processor: x86_64
RAM: 83.48 GB

Evaluating Split 1...


  model = cd_fast.enet_coordinate_descent(


Number of features selected: 1395
Selected features saved to selected_features_1.npy
Model saved to xgboost_model_1.json
AUC Score: 0.98600
Number of selected features: 1395
Execution Time for Split 1: 1109.97 seconds

Evaluating Split 2...


  model = cd_fast.enet_coordinate_descent(


Number of features selected: 1382
Selected features saved to selected_features_2.npy
Model saved to xgboost_model_2.json
AUC Score: 0.98518
Number of selected features: 1382
Execution Time for Split 2: 1129.41 seconds

Evaluating Split 3...


  model = cd_fast.enet_coordinate_descent(


Number of features selected: 1385
Selected features saved to selected_features_3.npy
Model saved to xgboost_model_3.json
AUC Score: 0.98525
Number of selected features: 1385
Execution Time for Split 3: 1143.96 seconds

Evaluating Split 4...


  model = cd_fast.enet_coordinate_descent(


Number of features selected: 1389
Selected features saved to selected_features_4.npy
Model saved to xgboost_model_4.json
AUC Score: 0.98535
Number of selected features: 1389
Execution Time for Split 4: 1139.28 seconds

Evaluating Split 5...


  model = cd_fast.enet_coordinate_descent(


Number of features selected: 1389
Selected features saved to selected_features_5.npy
Model saved to xgboost_model_5.json
AUC Score: 0.98535
Number of selected features: 1389
Execution Time for Split 5: 1138.78 seconds

AUC Scores for all splits:
Split 1: AUC = 0.98600
Split 2: AUC = 0.98518
Split 3: AUC = 0.98525
Split 4: AUC = 0.98535
Split 5: AUC = 0.98535

Average AUC across all splits: 0.98543

Total Execution Time: 5661.40 seconds
