# 1.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [3]:
file_path = '/Users/anantagarwal/Downloads/breast+cancer+coimbra/dataR2.csv'
df = pd.read_csv(file_path)

In [7]:
print("First 5 rows of the dataset:")
print(df.head())

First 5 rows of the dataset:
   Age        BMI  Glucose  Insulin      HOMA   Leptin  Adiponectin  Resistin  \
0   48  23.500000       70    2.707  0.467409   8.8071     9.702400   7.99585   
1   83  20.690495       92    3.115  0.706897   8.8438     5.429285   4.06405   
2   82  23.124670       91    4.498  1.009651  17.9393    22.432040   9.27715   
3   68  21.367521       77    3.226  0.612725   9.8827     7.169560  12.76600   
4   86  21.111111       92    3.549  0.805386   6.6994     4.819240  10.57635   

     MCP.1  Classification  
0  417.114               1  
1  468.786               1  
2  554.697               1  
3  928.220               1  
4  773.920               1  


In [9]:
print("\nChecking for missing values:")
print(df.isnull().sum())


Checking for missing values:
Age               0
BMI               0
Glucose           0
Insulin           0
HOMA              0
Leptin            0
Adiponectin       0
Resistin          0
MCP.1             0
Classification    0
dtype: int64


In [15]:
df['Classification'] = df['Classification'].replace({1: 0, 2: 1})

In [17]:
X = df.drop('Classification', axis=1)
y = df['Classification']

In [19]:
from sklearn.feature_selection import SelectKBest
chi_scores = chi2(X, y)
chi2_df = pd.DataFrame({'Feature': X.columns, 'Chi2 Score': chi_scores[0]})
print("\nChi-Square Scores for each feature:")
print(chi2_df)


Chi-Square Scores for each feature:
       Feature  Chi2 Score
0          Age    0.988417
1          BMI    1.847119
2      Glucose   88.125373
3      Insulin   89.203820
4         HOMA   45.656784
5       Leptin    0.001849
6  Adiponectin    0.200949
7     Resistin   61.949833
8        MCP.1  214.917039


In [21]:
least_important_feature = chi2_df.loc[chi2_df['Chi2 Score'].idxmin(), 'Feature']
X = X.drop(least_important_feature, axis=1)
print(f"\nDropping least important feature: {least_important_feature}")


Dropping least important feature: Leptin


# 2

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3.

In [26]:
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)

# 4.

In [30]:
y_pred = dt_clf.predict(X_test)
print("\nDefault Decision Tree Evaluation:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))


Default Decision Tree Evaluation:
Confusion Matrix:
 [[9 3]
 [3 9]]
Accuracy: 0.75
Precision: 0.75
Recall: 0.75


# 5.

In [34]:
for criterion in ['entropy', 'log_loss']:
    dt_clf = DecisionTreeClassifier(criterion=criterion, random_state=42)
    dt_clf.fit(X_train, y_train)
    y_pred = dt_clf.predict(X_test)
    print(f"\nDecision Tree with {criterion} criterion Evaluation:")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))


Decision Tree with entropy criterion Evaluation:
Confusion Matrix:
 [[11  1]
 [ 1 11]]
Accuracy: 0.9166666666666666
Precision: 0.9166666666666666
Recall: 0.9166666666666666

Decision Tree with log_loss criterion Evaluation:
Confusion Matrix:
 [[11  1]
 [ 1 11]]
Accuracy: 0.9166666666666666
Precision: 0.9166666666666666
Recall: 0.9166666666666666


# 6.

In [37]:
from sklearn.model_selection import GridSearchCV

In [39]:
param_grid = {
    'max_depth': [10, 100],
    'min_samples_split': [4, 6, 8],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [41]:
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                           param_grid=param_grid,
                           scoring='accuracy', cv=5, n_jobs=-1)

In [43]:
grid_search.fit(X_train, y_train)

30 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
17 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

# 7.

In [46]:
print("\nBest parameters found by GridSearchCV:")
print(grid_search.best_params_)



Best parameters found by GridSearchCV:
{'max_depth': 10, 'max_features': 'log2', 'min_samples_split': 6}


In [48]:
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
print("\nBest Model Evaluation:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("Precision:", precision_score(y_test, y_pred_best))
print("Recall:", recall_score(y_test, y_pred_best))


Best Model Evaluation:
Confusion Matrix:
 [[10  2]
 [ 4  8]]
Accuracy: 0.75
Precision: 0.8
Recall: 0.6666666666666666
