In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the training CSV (path is relative to the working directory)
csv_path = "aps_failure_training_set.csv"
data = pd.read_csv(csv_path)





In [8]:
# Treat the literal string "na" as a missing value and keep only the rows
# that are complete in every column.
data = data.replace(['na'], [None])
data = data.dropna()

# Any column that contained "na" still holds its remaining values as
# strings (object dtype). Convert every feature column to a real numeric
# dtype so later steps operate on numbers; the 'class' label column is
# left as text. pd.to_numeric raises if a non-numeric value is still
# present, which surfaces bad data here rather than at model-fit time.
data = data.apply(
    lambda column: column if column.name == 'class' else pd.to_numeric(column)
)


In [9]:
# Show the frame as the cell output (the bare last expression is displayed)
data

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
16,neg,31300,0,784,740,0,0,0,0,0,...,798872,112724,51736,7054,6628,27600,2,2,0,0
179,neg,97000,0,378,160,0,0,0,0,0,...,1078982,313334,511330,552328,871528,871104,1980,42,0,0
225,neg,124656,2,278,170,0,0,0,0,0,...,1205696,866148,697610,700400,1900386,437532,3680,0,0,0
394,pos,281324,2,3762,2346,0,0,4808,215720,967572,...,624606,269976,638838,1358354,819918,262804,2824,0,0,0
413,pos,43482,0,1534,1388,0,0,0,0,40024,...,497196,121166,202272,232636,645690,50,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59432,neg,118028,0,740,714,618,690,0,0,0,...,838952,631338,541036,1285274,1832658,165838,3022,0,0,0
59562,neg,229916,0,616,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59843,neg,224084,0,912,766,0,0,0,0,0,...,413576,209524,469894,2233992,5933084,364450,12422,0,0,0
59870,neg,197332,0,658,616,216,346,0,0,0,...,73940,49896,90454,575264,104600,10352,36,0,0,0


In [10]:
# Separate the label column from the predictor columns
y = data['class']
X = data.drop(columns=['class'])

# Hold out 20% of the rows for testing. The split is stratified on the
# label and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the dimensions of each partition
print(f"Train set: {X_train.shape}, {y_train.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")

Train set: (476, 170), (476,)
Test set: (119, 170), (119,)


In [11]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search spaces, one per classifier
param_grid_svc = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']
}

param_grid_logreg = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10],
    'solver': ['liblinear']
}

param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimators. The liblinear solver and the decision tree both draw
# random numbers while fitting, so they are seeded to make the search
# (and the selected parameters) reproducible from run to run.
svc = SVC()
logreg = LogisticRegression(random_state=42)
dt = DecisionTreeClassifier(random_state=42)

# One 5-fold grid search per classifier, selected on accuracy.
# NOTE(review): accuracy is dominated by the majority class on imbalanced
# labels; consider scoring='f1_macro' if minority-class performance is the
# goal -- confirm with the analysis owner before changing.
grid_svc, grid_logreg, grid_dt = (
    GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
    for estimator, param_grid in (
        (svc, param_grid_svc),
        (logreg, param_grid_logreg),
        (dt, param_grid_dt),
    )
)

# Fit every search on the training partition
for search in (grid_svc, grid_logreg, grid_dt):
    search.fit(X_train, y_train)

# Print the best parameters and best cross-validated score of each search
for label, search in (
    ("SVC", grid_svc),
    ("Logistic Regression", grid_logreg),
    ("Decision Tree", grid_dt),
):
    print(f"Best parameters for {label}:", search.best_params_)
    print(f"Best score for {label}:", search.best_score_)

Best parameters for SVC: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best score for SVC: 0.9474561403508771
Best parameters for Logistic Regression: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best score for Logistic Regression: 0.9495833333333333
Best parameters for Decision Tree: {'max_depth': None, 'min_samples_leaf': 1}
Best score for Decision Tree: 0.939078947368421


In [12]:
# prompt: predict the test data and print the scores

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def _positive_class_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, F1) for one set of predictions.

    Precision, recall and F1 are computed with 'pos' as the positive label.
    """
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, pos_label='pos'),
        recall_score(y_true, y_pred, pos_label='pos'),
        f1_score(y_true, y_pred, pos_label='pos'),
    )


def _print_scores(heading, accuracy, precision, recall, f1):
    """Print a heading line followed by one line per metric."""
    print(heading)
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")


# Label the held-out rows with each fitted grid search
y_pred_svc, y_pred_logreg, y_pred_dt = (
    search.predict(X_test) for search in (grid_svc, grid_logreg, grid_dt)
)

# Score each model's predictions against the true test labels
accuracy_svc, precision_svc, recall_svc, f1_svc = _positive_class_scores(
    y_test, y_pred_svc
)
accuracy_logreg, precision_logreg, recall_logreg, f1_logreg = _positive_class_scores(
    y_test, y_pred_logreg
)
accuracy_dt, precision_dt, recall_dt, f1_dt = _positive_class_scores(
    y_test, y_pred_dt
)

# Report the scores, one group per model
_print_scores("SVC:", accuracy_svc, precision_svc, recall_svc, f1_svc)
_print_scores(
    "\nLogistic Regression:",
    accuracy_logreg, precision_logreg, recall_logreg, f1_logreg,
)
_print_scores("\nDecision Tree:", accuracy_dt, precision_dt, recall_dt, f1_dt)


SVC:
Accuracy: 0.9327731092436975
Precision: 0.7692307692307693
Recall: 0.6666666666666666
F1-score: 0.7142857142857143

Logistic Regression:
Accuracy: 0.957983193277311
Precision: 0.8125
Recall: 0.8666666666666667
F1-score: 0.8387096774193549

Decision Tree:
Accuracy: 0.9327731092436975
Precision: 0.7692307692307693
Recall: 0.6666666666666666
F1-score: 0.7142857142857143


In [13]:
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import f1_score
import numpy as np

def _new_models(class_weight=None):
    """Return a fresh, unfitted (SVC, LogisticRegression, DecisionTree) triple.

    class_weight is forwarded to all three estimators (None keeps their
    default, unweighted behaviour). The logistic regression (liblinear) and
    the decision tree are seeded because both draw random numbers while
    fitting; without a seed their scores change from run to run.
    """
    return (
        SVC(class_weight=class_weight),
        LogisticRegression(class_weight=class_weight, solver='liblinear', random_state=42),
        DecisionTreeClassifier(class_weight=class_weight, random_state=42),
    )


def _fit_predict_score(model, X_fit, y_fit, sample_weight=None):
    """Fit model on (X_fit, y_fit) and evaluate it on the test partition.

    Returns a (predictions, macro_f1) pair, where predictions are the labels
    predicted for X_test and macro_f1 is the macro-averaged F1 score of
    those predictions against y_test.
    """
    model.fit(X_fit, y_fit, sample_weight=sample_weight)
    predictions = model.predict(X_test)
    return predictions, f1_score(y_test, predictions, average='macro')


# Approach a: Undersampling the majority class and/or oversampling the minority class
# (only the training partition is resampled; the test partition is untouched)
rus = RandomUnderSampler(random_state=42)
ros = RandomOverSampler(random_state=42)
smote = SMOTE(random_state=42)

X_resampled_rus, y_resampled_rus = rus.fit_resample(X_train, y_train)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train, y_train)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)

svc_rus, logreg_rus, dt_rus = _new_models()
svc_ros, logreg_ros, dt_ros = _new_models()
svc_smote, logreg_smote, dt_smote = _new_models()

# Approach b: Using class_weight
svc_weighted, logreg_weighted, dt_weighted = _new_models(class_weight='balanced')

# Approach c: Using sample_weights
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
svc_sample_weighted, logreg_sample_weighted, dt_sample_weighted = _new_models()

# Fit each model, predict on the test data and compute its macro average F1
y_pred_svc_rus, f1_svc_rus = _fit_predict_score(svc_rus, X_resampled_rus, y_resampled_rus)
y_pred_logreg_rus, f1_logreg_rus = _fit_predict_score(logreg_rus, X_resampled_rus, y_resampled_rus)
y_pred_dt_rus, f1_dt_rus = _fit_predict_score(dt_rus, X_resampled_rus, y_resampled_rus)

y_pred_svc_ros, f1_svc_ros = _fit_predict_score(svc_ros, X_resampled_ros, y_resampled_ros)
y_pred_logreg_ros, f1_logreg_ros = _fit_predict_score(logreg_ros, X_resampled_ros, y_resampled_ros)
y_pred_dt_ros, f1_dt_ros = _fit_predict_score(dt_ros, X_resampled_ros, y_resampled_ros)

y_pred_svc_smote, f1_svc_smote = _fit_predict_score(svc_smote, X_resampled_smote, y_resampled_smote)
y_pred_logreg_smote, f1_logreg_smote = _fit_predict_score(logreg_smote, X_resampled_smote, y_resampled_smote)
y_pred_dt_smote, f1_dt_smote = _fit_predict_score(dt_smote, X_resampled_smote, y_resampled_smote)

y_pred_svc_weighted, f1_svc_weighted = _fit_predict_score(svc_weighted, X_train, y_train)
y_pred_logreg_weighted, f1_logreg_weighted = _fit_predict_score(logreg_weighted, X_train, y_train)
y_pred_dt_weighted, f1_dt_weighted = _fit_predict_score(dt_weighted, X_train, y_train)

y_pred_svc_sample_weighted, f1_svc_sample_weighted = _fit_predict_score(
    svc_sample_weighted, X_train, y_train, sample_weight=sample_weights
)
y_pred_logreg_sample_weighted, f1_logreg_sample_weighted = _fit_predict_score(
    logreg_sample_weighted, X_train, y_train, sample_weight=sample_weights
)
y_pred_dt_sample_weighted, f1_dt_sample_weighted = _fit_predict_score(
    dt_sample_weighted, X_train, y_train, sample_weight=sample_weights
)

# Print the macro average F1 scores
print("Macro average F1 scores:")
for label, score in (
    ("SVC with RandomUnderSampler:", f1_svc_rus),
    ("Logistic Regression with RandomUnderSampler:", f1_logreg_rus),
    ("Decision Tree with RandomUnderSampler:", f1_dt_rus),
    ("SVC with RandomOverSampler:", f1_svc_ros),
    ("Logistic Regression with RandomOverSampler:", f1_logreg_ros),
    ("Decision Tree with RandomOverSampler:", f1_dt_ros),
    ("SVC with SMOTE:", f1_svc_smote),
    ("Logistic Regression with SMOTE:", f1_logreg_smote),
    ("Decision Tree with SMOTE:", f1_dt_smote),
    ("SVC with class_weight='balanced':", f1_svc_weighted),
    ("Logistic Regression with class_weight='balanced':", f1_logreg_weighted),
    ("Decision Tree with class_weight='balanced':", f1_dt_weighted),
    ("SVC with sample weights:", f1_svc_sample_weighted),
    ("Logistic Regression with sample weights:", f1_logreg_sample_weighted),
    ("Decision Tree with sample weights:", f1_dt_sample_weighted),
):
    print(label, score)

Macro average F1 scores:
SVC with RandomUnderSampler: 0.6911764705882353
Logistic Regression with RandomUnderSampler: 0.8416851441241685
Decision Tree with RandomUnderSampler: 0.6735837805605247
SVC with RandomOverSampler: 0.6994949494949495
Logistic Regression with RandomOverSampler: 0.8380952380952381
Decision Tree with RandomOverSampler: 0.9237179487179488
SVC with SMOTE: 0.7181578947368421
Logistic Regression with SMOTE: 0.8380952380952381
Decision Tree with SMOTE: 0.8146417445482865
SVC with class_weight='balanced': 0.7086715629429197
Logistic Regression with class_weight='balanced': 0.8380952380952381
Decision Tree with class_weight='balanced': 0.8537826926452519
SVC with sample weights: 0.7086715629429197
Logistic Regression with sample weights: 0.8380952380952381
Decision Tree with sample weights: 0.8272859216255443
