# 1

In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

In [67]:
data = pd.read_excel('/Users/anantagarwal/Downloads/Raisin_Dataset/Raisin_Dataset.xlsx')  # Using the provided path
print("First 5 rows of the dataset:")
print(data.head())

First 5 rows of the dataset:
    Area  MajorAxisLength  MinorAxisLength  Eccentricity  ConvexArea  \
0  87524       442.246011       253.291155      0.819738       90546   
1  75166       406.690687       243.032436      0.801805       78789   
2  90856       442.267048       266.328318      0.798354       93717   
3  45928       286.540559       208.760042      0.684989       47336   
4  79408       352.190770       290.827533      0.564011       81463   

     Extent  Perimeter    Class  
0  0.758651   1184.040  Kecimen  
1  0.684130   1121.786  Kecimen  
2  0.637613   1208.575  Kecimen  
3  0.699599    844.162  Kecimen  
4  0.792772   1073.251  Kecimen  


In [68]:
print("Missing values check:")
print(data.isnull().sum())


Missing values check:
Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
Extent             0
Perimeter          0
Class              0
dtype: int64


In [69]:
le = LabelEncoder()
data['Class'] = le.fit_transform(data['Class'])
print("After converting class to discrete values:")
print(data['Class'].value_counts())

After converting class to discrete values:
Class
1    450
0    450
Name: count, dtype: int64


In [70]:
X = data.drop('Class', axis=1)
y = data['Class']
chi2_values, p_values = chi2(X, y)

In [71]:
chi2_df = pd.DataFrame({'Feature': X.columns, 'Chi2': chi2_values, 'p-value': p_values})
print("Chi-Square values and p-values:")
print(chi2_df)

Chi-Square values and p-values:
           Feature          Chi2   p-value
0             Area  6.097822e+06  0.000000
1  MajorAxisLength  1.272952e+04  0.000000
2  MinorAxisLength  2.234351e+03  0.000000
3     Eccentricity  1.804260e+00  0.179198
4       ConvexArea  6.412753e+06  0.000000
5           Extent  8.791728e-02  0.766842
6        Perimeter  2.563142e+04  0.000000


In [72]:
important_features = chi2_df[chi2_df['p-value'] <= 0.05]['Feature'].values
X = data[important_features]
print("Selected important features:")
print(important_features)

Selected important features:
['Area' 'MajorAxisLength' 'MinorAxisLength' 'ConvexArea' 'Perimeter']


# 2

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3

In [76]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# 4

In [78]:
y_pred = clf.predict(X_test)

In [79]:
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

In [80]:
print("Confusion Matrix:\n", cm)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Confusion Matrix:
 [[68 18]
 [25 69]]
Accuracy: 0.7611111111111111
Precision: 0.7931034482758621
Recall: 0.7340425531914894


# 5

In [117]:
dt_log_loss=DecisionTreeClassifier(criterion="log_loss", random_state=42)
dt_log_loss.fit(X_train, y_train)
y_pred_log_loss=dt_log_loss.predict(X_test)

In [119]:
accuracy_log_loss=accuracy_score(y_test, y_pred_log_loss)
precision_log_loss=precision_score(y_test, y_pred_log_loss)
recall_log_loss=recall_score(y_test, y_pred_log_loss)

In [123]:
print(f"\nLog Loss - Accuracy: {accuracy_log_loss}")
print(f"Precision: {precision_log_loss}")
print(f"Recall: {recall_log_loss}")


Log Loss - Accuracy: 0.7333333333333333
Precision: 0.7804878048780488
Recall: 0.6808510638297872


# 6

In [102]:
param_grid={
    "max_depth": [10, 100],
    "min_samples_split": [4, 6, 8],
    "max_features": ["auto", "sqrt", "log2"]
}

In [104]:
grid_search=GridSearchCV(estimator=dt_entropy, param_grid=param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)

30 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

In [106]:
best_params=grid_search.best_params_
best_score=grid_search.best_score_

In [108]:
print(f"Best Parameters: {best_params}")
print(f"Best Accuracy Score: {best_score}")

Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 8}
Best Accuracy Score: 0.8388888888888889


# 7

In [111]:
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)


In [113]:
best_accuracy = accuracy_score(y_test, y_pred_best)
best_precision = precision_score(y_test, y_pred_best)
best_recall = recall_score(y_test, y_pred_best)


In [115]:
print(f"\nBest Model - Accuracy: {best_accuracy}")
print(f"Precision: {best_precision}")
print(f"Recall: {best_recall}")


Best Model - Accuracy: 0.7944444444444444
Precision: 0.7938144329896907
Recall: 0.8191489361702128
