# Global Surrogate Modeling using SVM on the Cervical Cancer Risk Factors Dataset

A. Setting Up the Environment

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_text


B. Loading and Preliminary Exploration

In [2]:
# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00383/risk_factors_cervical_cancer.csv"
data = pd.read_csv(url)

# Checking the first few rows of the dataset
data.head()


Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [3]:
# Convert '?' to NaN
data = data.replace('?', np.nan)

# Convert columns to numeric
for col in data.columns:
    data[col] = pd.to_numeric(data[col], errors='ignore')

# Impute missing values
imputer = SimpleImputer(strategy="most_frequent")
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Encode categorical columns if any and split data
X = data.drop('Biopsy', axis=1)
y = data['Biopsy']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [4]:
# Initialize and train SVM
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train, y_train)

# Output SVM accuracy on test data
print(f"SVM Model Accuracy: {svm.score(X_test, y_test)*100:.2f}%")


SVM Model Accuracy: 96.51%


In [5]:
# Train a decision tree as the surrogate model on SVM's predictions
y_train_predictions = svm.predict(X_train)

dt_surrogate = DecisionTreeClassifier(max_depth=5)
dt_surrogate.fit(X_train, y_train_predictions)

# Evaluate the surrogate model
print(f"Surrogate Decision Tree Accuracy: {dt_surrogate.score(X_test, y_test)*100:.2f}%")


Surrogate Decision Tree Accuracy: 96.12%


In [6]:
# Extracting decision rules from the surrogate model
rules = export_text(dt_surrogate, feature_names=list(X.columns))
print(rules)


|--- Schiller <= 1.45
|   |--- STDs:genital herpes <= 12.22
|   |   |--- class: 0.0
|   |--- STDs:genital herpes >  12.22
|   |   |--- class: 1.0
|--- Schiller >  1.45
|   |--- STDs: Number of diagnosis <= 5.99
|   |   |--- IUD (years) <= 7.12
|   |   |   |--- class: 1.0
|   |   |--- IUD (years) >  7.12
|   |   |   |--- class: 0.0
|   |--- STDs: Number of diagnosis >  5.99
|   |   |--- class: 0.0



# Global Surrogate Modeling using Random Forest on the Bike Sharing Dataset

A. Setting Up the Environment

In this cell, we initialize our Python environment with necessary libraries to manipulate data, train models, and evaluate their performance.

In [7]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_text


B. Loading and Preliminary Exploration

Load the dataset to get a sense of its structure and the initial rows.

In [9]:
# Load dataset
data = pd.read_csv('day.csv')

# Display the first few rows to understand the data structure
data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,1,0,1,0,1,1,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,1,0,1,0,2,1,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,1,0,1,0,3,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,1,0,1,0,4,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,1,0,1,0,5,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


C. Data Preprocessing

This step involves preparing our dataset for modeling by handling missing values, encoding categorical variables, and binning our target into classes.

In [10]:
# As 'dteday' is a timestamp and won't be directly used in modeling, we drop it
data = data.drop(columns=['dteday'])

# Encode categorical variables
label_encoders = {}
for col in ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Binning 'cnt' into classes to convert the problem into classification
bins = [0, 1000, 5000, 9000]
labels = [0, 1, 2]
data['cnt_class'] = pd.cut(data['cnt'], bins=bins, labels=labels)

X = data.drop(columns=['cnt', 'casual', 'registered', 'cnt_class'])
y = data['cnt_class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the dataset for consistent training
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


D. Training the Random Forest Model

We train a Random Forest model to predict our binned bike rental counts.

In [11]:
# Initialize and train a Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Output Random Forest accuracy on test data
print(f"Random Forest Model Accuracy: {rf.score(X_test, y_test)*100:.2f}%")


Random Forest Model Accuracy: 94.06%


E. Building the Global Surrogate Model

A Decision Tree will act as our surrogate model to mimic the decisions made by the Random Forest.

In [12]:
# Using the Random Forest predictions to train a surrogate decision tree
y_train_predictions = rf.predict(X_train)

dt_surrogate = DecisionTreeClassifier(max_depth=5)
dt_surrogate.fit(X_train, y_train_predictions)

# Evaluate the accuracy of the surrogate model on test data
print(f"Surrogate Decision Tree Accuracy: {dt_surrogate.score(X_test, y_test)*100:.2f}%")


Surrogate Decision Tree Accuracy: 90.41%


F. Extracting and Displaying Decision Rules

This step involves translating the surrogate model's decisions into human-readable rules.

In [13]:
# Extract decision rules from the surrogate model
rules = export_text(dt_surrogate, feature_names=list(X.columns))
print(rules)


|--- instant <= 0.27
|   |--- instant <= -1.63
|   |   |--- hum <= 0.06
|   |   |   |--- hum <= -1.42
|   |   |   |   |--- class: 0
|   |   |   |--- hum >  -1.42
|   |   |   |   |--- holiday <= 2.90
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- holiday >  2.90
|   |   |   |   |   |--- class: 0
|   |   |--- hum >  0.06
|   |   |   |--- class: 0
|   |--- instant >  -1.63
|   |   |--- temp <= -0.16
|   |   |   |--- weathersit <= 2.00
|   |   |   |   |--- hum <= 2.18
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- hum >  2.18
|   |   |   |   |   |--- class: 0
|   |   |   |--- weathersit >  2.00
|   |   |   |   |--- windspeed <= 0.80
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- windspeed >  0.80
|   |   |   |   |   |--- class: 0
|   |   |--- temp >  -0.16
|   |   |   |--- hum <= -0.90
|   |   |   |   |--- temp <= 1.12
|   |   |   |   |   |--- class: 2
|   |   |   |   |--- temp >  1.12
|   |   |   |   |   |--- class: 1
|   |   |   |--- hum >  -0.90
|   |   |   |   |