problem 1 Iris

In [2]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.metrics import recall_score,precision_score,classification_report

In [3]:
# Load the dataset
iris = load_iris()

In [4]:
# Convert to DataFrame
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

# Optional: Map target to species names for readability
df['species'] = df['target'].map(lambda x: iris.target_names[x])

print(df.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target species  
0       0  setosa  
1       0  setosa  
2       0  setosa  
3       0  setosa  
4       0  setosa  


In [5]:
X = df[iris.feature_names]
y_true=df['target']
print("y_true:")
print(y_true.head())

y_true:
0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int32


In [6]:
# Train decision trees with different max_depth values 
recall_results = {}
for depth in range(1, 6):
    # Initialize the decision tree classifier
    clf = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_leaf=2,
        min_samples_split=5,
        random_state=42  # for reproducibility
    )
    
    # Train the model
    clf.fit(X, y_true)
    
    # Make predictions
    y_predictions = clf.predict(X)

    # Store the classification report
    report = classification_report(y_true, y_predictions,output_dict=True, zero_division=0)#avoid warning messages
    recall_results[depth] = report

# Print results for each depth
for depth in recall_results:
    print(f"Max Depth: {depth}")
    print("Classification Report:")
    print(pd.DataFrame(recall_results[depth]).transpose())
    print("\n" + "="*50 + "\n")

Max Depth: 1
Classification Report:
              precision    recall  f1-score     support
0              1.000000  1.000000  1.000000   50.000000
1              0.500000  1.000000  0.666667   50.000000
2              0.000000  0.000000  0.000000   50.000000
accuracy       0.666667  0.666667  0.666667    0.666667
macro avg      0.500000  0.666667  0.555556  150.000000
weighted avg   0.500000  0.666667  0.555556  150.000000


Max Depth: 2
Classification Report:
              precision  recall  f1-score  support
0              1.000000    1.00  1.000000    50.00
1              0.907407    0.98  0.942308    50.00
2              0.978261    0.90  0.937500    50.00
accuracy       0.960000    0.96  0.960000     0.96
macro avg      0.961889    0.96  0.959936   150.00
weighted avg   0.961889    0.96  0.959936   150.00


Max Depth: 3
Classification Report:
              precision    recall  f1-score     support
0              1.000000  1.000000  1.000000   50.000000
1              0.979167  0.

In [7]:
best_depth = max(recall_results, key=lambda k: recall_results[k]['macro avg']['recall'])
best_recall = recall_results[best_depth]['macro avg']['recall']
print(f"Best depth: {best_depth}, with Recall: {best_recall:.2f}")

Best depth: 4, with Recall: 0.98


In [8]:
best_depth = min(recall_results, key=lambda k: recall_results[k]['macro avg']['precision'])
best_precision = recall_results[best_depth]['macro avg']['precision']
print(f"Lowest Precision depth: {best_depth}, with Precision: {best_precision:.2f}")

Lowest Precision depth: 1, with Precision: 0.50


In [9]:
best_depth = max(recall_results, key=lambda k: recall_results[k]['macro avg']['f1-score'])
best_f1score = recall_results[best_depth]['macro avg']['f1-score']
print(f"Best F1 score depth: {best_depth}, with F1 score: {best_f1score:.2f}")

Best F1 score depth: 4, with F1 score: 0.98


Problem 2 Breast Cancer Wisconsin

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.model_selection import train_test_split

In [43]:
column_names = [
    'id', 'clump_thickness', 'uniformity_cell_size',
    'uniformity_cell_shape', 'marginal_adhesion',
    'single_epithelial_cell_size', 'bare_nuclei',
    'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class'
]

In [44]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
df = pd.read_csv(url, names=column_names, na_values='?')

In [45]:
# Drop rows with missing values
df.dropna(inplace=True)

In [46]:
# Convert 'bare Nuclei' to numeric
df['bare_nuclei'] = df['bare_nuclei'].astype(int)
# Drop the 'id' column as it is not useful for training
data = df.drop(columns=['id'])

# Convert 'class' to binary (2 -> 1, 4 -> 0)
data['Class'] = data['class'].map({2: 0, 4: 1})

In [47]:
print(df.head())

        id  clump_thickness  uniformity_cell_size  uniformity_cell_shape  \
0  1000025                5                     1                      1   
1  1002945                5                     4                      4   
2  1015425                3                     1                      1   
3  1016277                6                     8                      8   
4  1017023                4                     1                      1   

   marginal_adhesion  single_epithelial_cell_size  bare_nuclei  \
0                  1                            2            1   
1                  5                            7           10   
2                  1                            2            2   
3                  1                            3            4   
4                  3                            2            1   

   bland_chromatin  normal_nucleoli  mitoses  class  
0                3                1        1      2  
1                3                2   

In [48]:
# Split the data into features and target
X = data.drop(columns=['class'])
y = data['class']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the decision tree
clf = DecisionTreeClassifier(
    min_samples_leaf=2,# Minimum 2 instances in leaves
    min_samples_split=5,# No splits of subsets 
    max_depth=2,  # Maximum tree depth of 2
    criterion='gini'# Gini criterion
)
clf.fit(X_train, y_train)

In [49]:
# Calculate Gini index for the entire dataset
def gini_index(y): # 1−∑(pi)^2
    p = y.value_counts(normalize=True)
    return 1 - np.sum(np.square(p))

# Calculate metrics for the entire dataset
gini_before_split = gini_index(y_train)
print(f"Gini before split: {gini_before_split}")

Gini before split: 0.44321673442552567


In [50]:
# Calculate Entropy for the entire dataset
def entropy(y): #−∑pi*log2*pi*
    p = y.value_counts(normalize=True)
    return -np.sum([p_i * np.log2(p_i) for p_i in p])

entropy_before_split = entropy(y_train)
print(f"Entropy before split: {entropy_before_split}")

Entropy before split: 0.9164534336173732


In [51]:
# Calculate Misclassification Error for the entire dataset
def misclassification_error(y):# 1−max(pi)
    p = y.value_counts(normalize=True)
    return 1 - np.max(p)

misclassification_error_before_split = misclassification_error(y_train)
print(f"Misclassification Error before split: {misclassification_error_before_split}")

Misclassification Error before split: 0.33150183150183155


In [53]:
# Get the tree structure
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
feature = clf.tree_.feature
threshold = clf.tree_.threshold

# Get the first split
first_split_feature = feature[0]#gives the index of the feature used for the first split.
first_split_threshold = threshold[0]#gives the threshold value used for the first split.

In [57]:
# Split the data based on the first split
left_split = X_train[X_train.iloc[:, first_split_feature] <= first_split_threshold]
right_split = X_train[X_train.iloc[:, first_split_feature] > first_split_threshold]

# Calculate Gini index for the subsets
gini_left = gini_index(y_train[left_split.index])
gini_right = gini_index(y_train[right_split.index])

# Calculate Entropy for each subset
entropy_left = entropy(y_train[left_split.index])
entropy_right = entropy(y_train[right_split.index])

# Calculate Misclassification Error for each subset
misclassification_error_left = misclassification_error(y_train[left_split.index])
misclassification_error_right= misclassification_error(y_train[right_split.index])

In [59]:
# Calculate the weighted average of the metrics
weight_left = len(y_train[left_split.index]) / len(y_train)
weight_right = len(y_train[right_split.index]) / len(y_train)

gini_after_split = weight_left * gini_left + weight_right * gini_right
print(f"Gini after split: {gini_after_split}")         
                
entropy_after_split = weight_left * entropy_left + weight_right * entropy_right
print(f"Entropy after split: {entropy_after_split}")
                   
misclassification_error_after_split = weight_left* misclassification_error_left + weight_right * misclassification_error_right
print(f"Misclassification Error after split: {misclassification_error_after_split}")

Gini after split: 0.0
Entropy after split: -0.0
Misclassification Error after split: 0.0


In [60]:
# Calculate Information Gain
information_gain = gini_before_split - (len(left_split) / len(X_train)) * gini_left - (len(right_split) / len(X_train)) * gini_right

print(f"Information Gain for the first split: {information_gain}")

Information Gain for the first split: 0.44321673442552567


In [62]:
# Map feature index to feature name
first_split_feature_name = X.columns[first_split_feature]

print(f"The feature selected for the first split is: {first_split_feature_name}")
print(f"The decision boundary value for the first split is: {first_split_threshold}")

The feature selected for the first split is: Class
The decision boundary value for the first split is: 0.5


Problem 3 Breast Cancer Wisconsin

In [23]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [69]:
# Define column names for the dataset
column_names = [
    'id', 'Diagnosis', 'Radius Mean', 'Texture Mean', 'Perimeter Mean',
    'Area Mean', 'Smoothness Mean', 'Compactness Mean', 'Concavity Mean', 'Concave points Mean',
    'Symmetry Mean', 'Fractal Dimension Mean', 'Radius SE', 'Texture SE', 'Perimeter SE',
    'Area SE', 'Smoothness SE', 'Compactness SE', 'Concavity SE', 'Concave points SE',
    'Symmetry SE', 'Fractal Dimension SE', 'Radius Worst', 'Texture Worst', 'Perimeter Worst',
    'Area Worst', 'Smoothness Worst', 'Compactness Worst', 'Concavity Worst', 'Concave points Worst',
    'Symmetry Worst', 'Fractal Dimension Worst'
]


In [70]:
# Load the dataset from the UCI Machine Learning Repository
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
data = pd.read_csv(url, header=None, names=column_names)


In [71]:
# Display the first few rows of the dataset to inspect it
print(data.head())

         id Diagnosis  Radius Mean  Texture Mean  Perimeter Mean  Area Mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   Smoothness Mean  Compactness Mean  Concavity Mean  Concave points Mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  Radius Worst  Texture Worst  Perimeter Worst  Area Wor

In [72]:
# Drop the 'id' column as it is not useful for training
data = data.drop(columns=['id'])

# Convert 'Diagnosis' to binary values (M -> 1, B -> 0)
data['Diagnosis'] = data['Diagnosis'].map({'M': 1, 'B': 0})

In [73]:
# Split the data into features and target
X = data.drop(columns=['Diagnosis'])
y = data['Diagnosis']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [74]:
# Perform PCA to extract the first principal component
pca = PCA(n_components=1)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [75]:
# Train the decision tree using the first principal component
clf_pca = DecisionTreeClassifier(
    min_samples_leaf=2, 
    min_samples_split=5, 
    max_depth=2, 
    criterion='gini'
)

clf_pca.fit(X_train_pca, y_train)

# Predict on the test set
y_pred_pca = clf_pca.predict(X_test_pca)

# Calculate F1 score, Precision, and Recall
f1_pca = f1_score(y_test, y_pred_pca)
print(f"F1 Score (PCA-based model): {f1_pca}")

precision_pca = precision_score(y_test, y_pred_pca)
print(f"Precision (PCA-based model): {precision_pca}")

recall_pca = recall_score(y_test, y_pred_pca)
print(f"Recall (PCA-based model): {recall_pca}")

F1 Score (PCA-based model): 0.9523809523809523
Precision (PCA-based model): 0.975609756097561
Recall (PCA-based model): 0.9302325581395349


In [31]:
# Train the decision tree using the original data
clf_original = DecisionTreeClassifier(
    min_samples_leaf=2, 
    min_samples_split=5,
    max_depth=2, 
    criterion='gini'
)

clf_original.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred_original = clf_original.predict(X_test_scaled)

# Calculate F1 score, Precision, and Recall
f1_original = f1_score(y_test, y_pred_original)
print(f"F1 Score (Original model): {f1_original}")

precision_original = precision_score(y_test, y_pred_original)
print(f"Precision (Original model): {precision_original}")

recall_original = recall_score(y_test, y_pred_original)
print(f"Recall (Original model): {recall_original}")

F1 Score (Original model): 0.9024390243902439
Precision (Original model): 0.9487179487179487
Recall (Original model): 0.8604651162790697


In [32]:
print("Comparison of Performance Metrics:")
print(f"F1 Score (PCA-based model): {f1_pca:.4f} vs Original model: {f1_original:.4f}")
print(f"Precision (PCA-based model): {precision_pca:.4f} vs Original model: {precision_original:.4f}")
print(f"Recall (PCA-based model): {recall_pca:.4f} vs Original model: {recall_original:.4f}")

Comparison of Performance Metrics:
F1 Score (PCA-based model): 0.9524 vs Original model: 0.9024
Precision (PCA-based model): 0.9756 vs Original model: 0.9487
Recall (PCA-based model): 0.9302 vs Original model: 0.8605


In [33]:
# Perform PCA to extract the first two principal components
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [34]:
# Train the decision tree using the first two principal components
clf_pca = DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5, max_depth=2, criterion='gini')
clf_pca.fit(X_train_pca, y_train)

# Predict on the test set
y_pred_pca = clf_pca.predict(X_test_pca)

# Calculate the Confusion Matrix
cm_pca = confusion_matrix(y_test, y_pred_pca)

In [35]:
# Extract FP, TP, FN, TN from the Confusion Matrix
FP = cm_pca[0, 1]
TP = cm_pca[1, 1]
FN = cm_pca[1, 0]
TN = cm_pca[0, 0]

# Calculate FPR and TPR
FPR = FP / (FP + TN)
TPR = TP / (TP + FN)

print(f"False Positives (FP): {FP}")
print(f"True Positives (TP): {TP}")
print(f"False Positive Rate (FPR): {FPR}")
print(f"True Positive Rate (TPR): {TPR}")

False Positives (FP): 1
True Positives (TP): 38
False Positive Rate (FPR): 0.014084507042253521
True Positive Rate (TPR): 0.8837209302325582


In [76]:
# Train the decision tree using the original data
clf_original = DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5, max_depth=2, criterion='gini')
clf_original.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred_original = clf_original.predict(X_test_scaled)

# Calculate the Confusion Matrix for the original model
cm_original = confusion_matrix(y_test, y_pred_original)

# Extract FP, TP, FN, TN from the Confusion Matrix
FP_original = cm_original[0, 1]
TP_original = cm_original[1, 1]
FN_original = cm_original[1, 0]
TN_original = cm_original[0, 0]

# Calculate FPR and TPR for the original model
FPR_original = FP_original / (FP_original + TN_original)
TPR_original = TP_original / (TP_original + FN_original)

print(f"False Positives (FP) - Original model: {FP_original}")
print(f"True Positives (TP) - Original model: {TP_original}")
print(f"False Positive Rate (FPR) - Original model: {FPR_original}")
print(f"True Positive Rate (TPR) - Original model: {TPR_original}")

False Positives (FP) - Original model: 2
True Positives (TP) - Original model: 37
False Positive Rate (FPR) - Original model: 0.028169014084507043
True Positive Rate (TPR) - Original model: 0.8604651162790697
