In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## Logistic Regression

In [3]:
df = pd.read_table('D:\\5.1_Machine_Learning_Code\\Classification\\fruit_data_with_colors.txt')
df.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [4]:
#Input and Output Variables
feature_names = ['mass', 'width', 'height', 'color_score']
X = df[feature_names]
y = df['fruit_label']

In [52]:
df.shape

(59, 7)

In [5]:
#Scaling X and y
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

#Splitting dataset into Train Test 70/30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1, stratify = y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(41, 4) (18, 4) (41,) (18,)


In [6]:
#Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)

#Predicting
y_pred = model.predict(X_test)
y_pred

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([3, 1, 1, 3, 4, 3, 4, 1, 3, 3, 4, 3, 3, 4, 3, 1, 2, 4], dtype=int64)

In [7]:
#Accuracy of Prediction
print('Accuracy of Logistic Regression classifier on training set: {}'.format(model.score(X_train, y_train)))
print('Accuracy of Logistic Regression classifier on test set: {}'.format(model.score(X_test, y_test)))

# Combine y_test and y_pred into a DataFrame
test_cal = pd.concat([pd.DataFrame(y_test).reset_index(drop=True),pd.DataFrame(y_pred).reset_index(drop=True)],axis=1)
print('test_cal')

# Rename the predicted column
test_cal.rename(columns={0: 'predicted'}, inplace=True)
print(test_cal)

# Confusion Matrix
conf_df = confusion_matrix(test_cal['fruit_label'], test_cal['predicted'])
print(conf_df)

# Classification Report
metric_report = classification_report(test_cal['fruit_label'], test_cal['predicted'])
print(metric_report)

Accuracy of Logistic Regression classifier on training set: 0.8292682926829268
Accuracy of Logistic Regression classifier on test set: 0.8888888888888888
test_cal
    fruit_label  predicted
0             3          3
1             1          1
2             1          1
3             3          3
4             4          4
5             3          3
6             4          4
7             1          1
8             3          3
9             3          3
10            4          4
11            3          3
12            1          3
13            4          4
14            1          3
15            1          1
16            2          2
17            4          4
[[4 0 2 0]
 [0 1 0 0]
 [0 0 6 0]
 [0 0 0 5]]
              precision    recall  f1-score   support

           1       1.00      0.67      0.80         6
           2       1.00      1.00      1.00         1
           3       0.75      1.00      0.86         6
           4       1.00      1.00      1.00         5

    acc

## GRIDSEARCH CV

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Hyperparameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'lbfgs', 'newton-cg'],  # Solvers to try
    'penalty': ['l2'],  # Regularization type (l1 and elasticnet are less commonly used for logistic regression)
    'max_iter': [100, 200, 500]  # Number of iterations
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit grid search
grid_search.fit(X_train, y_train)

# Best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Best model
best_model = grid_search.best_estimator_

# Predicting with the best model
y_pred_best = best_model.predict(X_test)

# Accuracy of Prediction with best model
print('Accuracy of Logistic Regression classifier on training set: {}'.format(best_model.score(X_train, y_train)))
print('Accuracy of Logistic Regression classifier on test set: {}'.format(best_model.score(X_test, y_test)))

# Combine y_test and y_pred into a DataFrame
test_cal_best = pd.concat([pd.DataFrame(y_test).reset_index(drop=True), pd.DataFrame(y_pred_best).reset_index(drop=True)], axis=1)
test_cal_best.rename(columns={0: 'predicted'}, inplace=True)

# Confusion Matrix with the best model
conf_df_best = confusion_matrix(test_cal_best['fruit_label'], test_cal_best['predicted'])
print(conf_df_best)

# Classification Report with the best model
metric_report_best = classification_report(test_cal_best['fruit_label'], test_cal_best['predicted'])
print(metric_report_best)



Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best hyperparameters: {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy of Logistic Regression classifier on training set: 0.8292682926829268
Accuracy of Logistic Regression classifier on test set: 0.8333333333333334
[[3 0 3 0]
 [0 1 0 0]
 [0 0 6 0]
 [0 0 0 5]]
              precision    recall  f1-score   support

           1       1.00      0.50      0.67         6
           2       1.00      1.00      1.00         1
           3       0.67      1.00      0.80         6
           4       1.00      1.00      1.00         5

    accuracy                           0.83        18
   macro avg       0.92      0.88      0.87        18
weighted avg       0.89      0.83      0.82        18



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## RANDOMIZED SEARCH CV

In [11]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Hyperparameter distributions
param_dist = {
    'C': uniform(0.001, 100),  # Uniform distribution for C
    'solver': ['liblinear', 'lbfgs', 'newton-cg'],
    'penalty': ['l2'],
    'max_iter': [100, 200, 500]
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=LogisticRegression(), param_distributions=param_dist, n_iter=100, cv=5, n_jobs=-1, verbose=1)

# Fit random search
random_search.fit(X_train, y_train)

# Best hyperparameters
print("Best hyperparameters (Randomized):", random_search.best_params_)

# Best model
best_model_random = random_search.best_estimator_

# Predicting with the best model
y_pred_random = best_model_random.predict(X_test)

# Accuracy of Prediction with best model
print('Accuracy of Logistic Regression classifier on training set: {}'.format(best_model_random.score(X_train, y_train)))
print('Accuracy of Logistic Regression classifier on test set: {}'.format(best_model_random.score(X_test, y_test)))




Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best hyperparameters (Randomized): {'C': 18.053840579652107, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy of Logistic Regression classifier on training set: 0.8292682926829268
Accuracy of Logistic Regression classifier on test set: 0.8333333333333334


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Naive Bayes

In [13]:
df = pd.read_table('D:\\5.1_Machine_Learning_Code\\Classification\\fruit_data_with_colors.txt')
df.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [14]:
#Input and Output Variables
feature_names = ['mass', 'width', 'height', 'color_score']
X = df[feature_names]
y = df['fruit_label']

#Scaling X and y
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

#Splitting dataset into Train Test 70/30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1, stratify = y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(41, 4) (18, 4) (41,) (18,)


In [15]:
#Naive Bayes Model
nb =  GaussianNB()
nb.fit(X_train, y_train)

#Predicting
y_pred = nb.predict(X_test)

In [16]:
y_pred

array([3, 1, 1, 3, 4, 3, 4, 1, 3, 3, 4, 4, 1, 4, 1, 1, 2, 4], dtype=int64)

In [17]:
#Accuracy of Prediction
print('Accuracy of Logistic Regression classifier on training set: {}'.format(nb.score(X_train, y_train)))
print('Accuracy of Logistic Regression classifier on test set: {}'.format(nb.score(X_test, y_test)))

# Combine y_test and y_pred into a DataFrame
test_calc = pd.concat([pd.DataFrame(y_test).reset_index(drop=True),pd.DataFrame(y_pred).reset_index(drop=True)],axis=1)
print('test_cal')

# Rename the predicted column
test_calc.rename(columns={0: 'predicted'}, inplace=True)
print(test_cal)

# Confusion Matrix
conf_df = confusion_matrix(test_calc['fruit_label'], test_calc['predicted'])
print(conf_df)

# Classification Report
metric_report = classification_report(test_calc['fruit_label'], test_calc['predicted'])
print(metric_report)

Accuracy of Logistic Regression classifier on training set: 0.8536585365853658
Accuracy of Logistic Regression classifier on test set: 0.9444444444444444
test_cal
    fruit_label  predicted
0             3          3
1             1          1
2             1          1
3             3          3
4             4          4
5             3          3
6             4          4
7             1          1
8             3          3
9             3          3
10            4          4
11            3          3
12            1          3
13            4          4
14            1          3
15            1          1
16            2          2
17            4          4
[[6 0 0 0]
 [0 1 0 0]
 [0 0 5 1]
 [0 0 0 5]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         1
           3       1.00      0.83      0.91         6
           4       0.83      1.00      0.91         5

    acc

## Stochastic Gradient Descent

In [19]:
df = pd.read_table('D:\\5.1_Machine_Learning_Code\\Classification\\fruit_data_with_colors.txt')
df.head()

#Input and Output Variables
feature_names = ['mass', 'width', 'height', 'color_score']
X = df[feature_names]
y = df['fruit_label']

#Scaling X and y
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

#Splitting dataset into Train Test 70/30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1, stratify = y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#Stochastic Gradient Descent Model
sgd = SGDClassifier(shuffle=True,random_state=101)
sgd.fit(X_train, y_train)

#Predicting
y_pred = sgd.predict(X_test)

#Accuracy of Prediction
print('Accuracy of Logistic Regression classifier on training set: {}'.format(sgd.score(X_train, y_train)))
print('Accuracy of Logistic Regression classifier on test set: {}'.format(sgd.score(X_test, y_test)))

# Rename the predicted column
test_calc = pd.concat([pd.DataFrame(y_test).reset_index(drop=True),pd.DataFrame(y_pred).reset_index(drop=True)],axis=1)
test_calc.rename(columns={0: 'predicted'}, inplace=True)

conf_df = confusion_matrix(test_calc['fruit_label'],test_calc['predicted'])
print (conf_df)

metric_report = classification_report(test_calc['fruit_label'], test_calc['predicted'])
print(metric_report)

(41, 4) (18, 4) (41,) (18,)
Accuracy of Logistic Regression classifier on training set: 0.3170731707317073
Accuracy of Logistic Regression classifier on test set: 0.3333333333333333
[[0 0 6 0]
 [0 0 1 0]
 [0 0 6 0]
 [0 0 5 0]]
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         1
           3       0.33      1.00      0.50         6
           4       0.00      0.00      0.00         5

    accuracy                           0.33        18
   macro avg       0.08      0.25      0.12        18
weighted avg       0.11      0.33      0.17        18



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## GRID SEARCH CV

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
import warnings
warnings.filterwarnings('ignore')

# Hyperparameter grid for SGD
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1],  # Regularization strength
    'penalty': ['l2', 'l1', 'elasticnet'],  # Types of regularization
    'learning_rate': ['constant', 'optimal', 'invscaling'],  # Learning rate schedule
    'max_iter': [1000, 2000, 3000],  # Number of iterations
    'tol': [1e-4, 1e-3, 1e-2]  # Tolerance for stopping criteria
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=SGDClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit grid search to training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
print("Best hyperparameters found:", grid_search.best_params_)

# Best model after hyperparameter tuning
best_sgd = grid_search.best_estimator_

# Predict using the best model
y_pred_best = best_sgd.predict(X_test)

# Accuracy of Prediction with best model
print('Accuracy of SGD classifier on training set: {}'.format(best_sgd.score(X_train, y_train)))
print('Accuracy of SGD classifier on test set: {}'.format(best_sgd.score(X_test, y_test)))

# Combine y_test and y_pred into a DataFrame for evaluation
test_calc_best = pd.concat([pd.DataFrame(y_test).reset_index(drop=True), pd.DataFrame(y_pred_best).reset_index(drop=True)], axis=1)
test_calc_best.rename(columns={0: 'predicted'}, inplace=True)

# Confusion Matrix with the best model
conf_df_best = confusion_matrix(test_calc_best['fruit_label'], test_calc_best['predicted'])
print(conf_df_best)

# Classification Report with the best model
metric_report_best = classification_report(test_calc_best['fruit_label'], test_calc_best['predicted'])
print(metric_report_best)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits
Best hyperparameters found: {'alpha': 0.01, 'learning_rate': 'optimal', 'max_iter': 1000, 'penalty': 'elasticnet', 'tol': 0.0001}
Accuracy of SGD classifier on training set: 0.3170731707317073
Accuracy of SGD classifier on test set: 0.3333333333333333
[[0 0 6 0]
 [0 0 1 0]
 [0 0 6 0]
 [0 0 5 0]]
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         1
           3       0.33      1.00      0.50         6
           4       0.00      0.00      0.00         5

    accuracy                           0.33        18
   macro avg       0.08      0.25      0.12        18
weighted avg       0.11      0.33      0.17        18



## Randomized SearchCV

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Hyperparameter distributions
param_dist = {
    'alpha': uniform(0.0001, 1),  # Uniform distribution for alpha
    'penalty': ['l2', 'l1', 'elasticnet'],
    'learning_rate': ['constant', 'optimal', 'invscaling'],
    'max_iter': [1000, 2000, 3000],
    'tol': [1e-4, 1e-3, 1e-2]
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=SGDClassifier(), param_distributions=param_dist, n_iter=100, cv=5, n_jobs=-1, verbose=1)

# Fit randomized search to training data
random_search.fit(X_train, y_train)

# Best hyperparameters from randomized search
print("Best hyperparameters (Randomized):", random_search.best_params_)

# Best model after hyperparameter tuning
best_sgd_random = random_search.best_estimator_

# Predict using the best model
y_pred_random = best_sgd_random.predict(X_test)

# Accuracy of Prediction with best model
print('Accuracy of SGD classifier on training set: {}'.format(best_sgd_random.score(X_train, y_train)))
print('Accuracy of SGD classifier on test set: {}'.format(best_sgd_random.score(X_test, y_test)))


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best hyperparameters (Randomized): {'alpha': 0.14935264752060717, 'learning_rate': 'optimal', 'max_iter': 3000, 'penalty': 'elasticnet', 'tol': 0.0001}
Accuracy of SGD classifier on training set: 0.3170731707317073
Accuracy of SGD classifier on test set: 0.3333333333333333


## K Nearest Neighbours

In [25]:
df = pd.read_table('D:\\5.1_Machine_Learning_Code\\Classification\\fruit_data_with_colors.txt')
df.head()

#Input and Output Variables
feature_names = ['mass', 'width', 'height', 'color_score']
X = df[feature_names]
y = df['fruit_label']

#Scaling X and y
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

#Splitting dataset into Train Test 70/30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1, stratify = y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#K Nearest Neighbour Model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

#Predicting
y_pred = knn.predict(X_test)

#Accuracy of Prediction
print('Accuracy of Logistic Regression classifier on training set: {}'.format(knn.score(X_train, y_train)))
print('Accuracy of Logistic Regression classifier on test set: {}'.format(knn.score(X_test, y_test)))

# Rename the predicted column
test_calc = pd.concat([pd.DataFrame(y_test).reset_index(drop=True),pd.DataFrame(y_pred).reset_index(drop=True)],axis=1)
test_calc.rename(columns={0: 'predicted'}, inplace=True)

conf_df = confusion_matrix(test_calc['fruit_label'],test_calc['predicted'])
print (conf_df)

metric_report = classification_report(test_calc['fruit_label'], test_calc['predicted'])
print(metric_report)

(41, 4) (18, 4) (41,) (18,)
Accuracy of Logistic Regression classifier on training set: 0.7804878048780488
Accuracy of Logistic Regression classifier on test set: 0.7777777777777778
[[4 0 2 0]
 [0 1 0 0]
 [0 0 4 2]
 [0 0 0 5]]
              precision    recall  f1-score   support

           1       1.00      0.67      0.80         6
           2       1.00      1.00      1.00         1
           3       0.67      0.67      0.67         6
           4       0.71      1.00      0.83         5

    accuracy                           0.78        18
   macro avg       0.85      0.83      0.83        18
weighted avg       0.81      0.78      0.78        18



## GridSearchCV

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Input data (assuming the dataframe `df` is already loaded as per your code)
feature_names = ['mass', 'width', 'height', 'color_score']
X = df[feature_names]
y = df['fruit_label']

# Scaling X
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

# Splitting dataset into Train Test 70/30
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.30, random_state=1, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Define KNN model
knn = KNeighborsClassifier()

# Define hyperparameter grid to search
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
    'leaf_size': [10, 20, 30, 40, 50],
    'p': [1, 2]
}

# Set up the GridSearchCV
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)

# Predict using the best model
y_pred = grid_search.best_estimator_.predict(X_test)

# Accuracy of Prediction
print('Accuracy on training set: {}'.format(grid_search.best_estimator_.score(X_train, y_train)))
print('Accuracy on test set: {}'.format(grid_search.best_estimator_.score(X_test, y_test)))

# Create a DataFrame for comparison
test_calc = pd.concat([pd.DataFrame(y_test).reset_index(drop=True), pd.DataFrame(y_pred).reset_index(drop=True)], axis=1)
test_calc.rename(columns={0: 'predicted'}, inplace=True)

# Confusion Matrix
conf_df = confusion_matrix(test_calc['fruit_label'], test_calc['predicted'])
print(conf_df)

# Classification Report
metric_report = classification_report(test_calc['fruit_label'], test_calc['predicted'])
print(metric_report)


(41, 4) (18, 4) (41,) (18,)
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Best Hyperparameters: {'leaf_size': 10, 'metric': 'minkowski', 'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}
Accuracy on training set: 0.9512195121951219
Accuracy on test set: 0.7777777777777778
[[3 0 3 0]
 [0 1 0 0]
 [1 0 5 0]
 [0 0 0 5]]
              precision    recall  f1-score   support

           1       0.75      0.50      0.60         6
           2       1.00      1.00      1.00         1
           3       0.62      0.83      0.71         6
           4       1.00      1.00      1.00         5

    accuracy                           0.78        18
   macro avg       0.84      0.83      0.83        18
weighted avg       0.79      0.78      0.77        18



## Support Vector Machine

In [29]:
from sklearn.svm import SVC

df = pd.read_table('D:\\5.1_Machine_Learning_Code\\Classification\\fruit_data_with_colors.txt')
df.head()

#Input and Output Variables
feature_names = ['mass', 'width', 'height', 'color_score']
X = df[feature_names]
y = df['fruit_label']

#Scaling X and y
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

#Splitting dataset into Train Test 70/30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1, stratify = y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#Support Vector Machine Model
svm = SVC()
svm.fit(X_train, y_train)

#Predicting
y_pred = svm.predict(X_test)

#Accuracy of Prediction
print('Accuracy of Logistic Regression classifier on training set: {}'.format(svm.score(X_train, y_train)))
print('Accuracy of Logistic Regression classifier on test set: {}'.format(svm.score(X_test, y_test)))

# Rename the predicted column
test_calc = pd.concat([pd.DataFrame(y_test).reset_index(drop=True),pd.DataFrame(y_pred).reset_index(drop=True)],axis=1)
test_calc.rename(columns={0: 'predicted'}, inplace=True)

conf_df = confusion_matrix(test_calc['fruit_label'],test_calc['predicted'])
print (conf_df)

metric_report = classification_report(test_calc['fruit_label'], test_calc['predicted'])
print(metric_report)

(41, 4) (18, 4) (41,) (18,)
Accuracy of Logistic Regression classifier on training set: 0.43902439024390244
Accuracy of Logistic Regression classifier on test set: 0.5
[[6 0 0 0]
 [0 1 0 0]
 [4 0 2 0]
 [5 0 0 0]]
              precision    recall  f1-score   support

           1       0.40      1.00      0.57         6
           2       1.00      1.00      1.00         1
           3       1.00      0.33      0.50         6
           4       0.00      0.00      0.00         5

    accuracy                           0.50        18
   macro avg       0.60      0.58      0.52        18
weighted avg       0.52      0.50      0.41        18



## XG BOOST

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report
from xgboost import XGBClassifier  # Importing XGBClassifier from xgboost

# Load dataset
df = pd.read_table('D:\\5.1_Machine_Learning_Code\\Classification\\fruit_data_with_colors.txt')
df.head()

# Input and Output Variables
feature_names = ['mass', 'width', 'height', 'color_score']
X = df[feature_names]
y = df['fruit_label']

# Scaling X
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

# Splitting dataset into Train and Test (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.20, random_state=1, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Encode the labels to start from 0 using LabelEncoder
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# XGBoost Model
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')  # Initialize the XGBClassifier
xgb.fit(X_train, y_train_encoded)  # Fit the model

# Predicting
y_pred = xgb.predict(X_test)

# Accuracy of Prediction
print('Accuracy of XGBoost classifier on training set: {}'.format(xgb.score(X_train, y_train_encoded)))
print('Accuracy of XGBoost classifier on test set: {}'.format(xgb.score(X_test, y_test_encoded)))

# Confusion Matrix
conf_df = confusion_matrix(y_test_encoded, y_pred)
print("Confusion Matrix:")
print(conf_df)

# Classification Report
metric_report = classification_report(y_test_encoded, y_pred)
print("Classification Report:")
print(metric_report)


(47, 4) (12, 4) (47,) (12,)
Accuracy of XGBoost classifier on training set: 1.0
Accuracy of XGBoost classifier on test set: 0.6666666666666666
Confusion Matrix:
[[2 0 0 2]
 [0 1 0 0]
 [0 0 3 1]
 [0 0 1 2]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       1.00      1.00      1.00         1
           2       0.75      0.75      0.75         4
           3       0.40      0.67      0.50         3

    accuracy                           0.67        12
   macro avg       0.79      0.73      0.73        12
weighted avg       0.77      0.67      0.68        12



## Random Forest Classifier

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report

# Read the dataset
df = pd.read_table('D:\\5.1_Machine_Learning_Code\\Classification\\fruit_data_with_colors.txt')
df.head()

# Input and Output Variables
feature_names = ['mass', 'width', 'height', 'color_score']
X = df[feature_names]
y = df['fruit_label']

# Scaling X
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

# Splitting dataset into Train and Test (70/30)
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.30, random_state=1, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Random Forest Classifier Model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predicting
y_pred = rf.predict(X_test)

# Accuracy of Prediction
print('Accuracy of Random Forest classifier on training set: {}'.format(rf.score(X_train, y_train)))
print('Accuracy of Random Forest classifier on test set: {}'.format(rf.score(X_test, y_test)))

# Combine y_test and y_pred into a DataFrame for evaluation
test_calc = pd.concat([pd.DataFrame(y_test).reset_index(drop=True), pd.DataFrame(y_pred).reset_index(drop=True)], axis=1)
test_calc.rename(columns={0: 'predicted'}, inplace=True)

# Confusion Matrix
conf_df = confusion_matrix(test_calc['fruit_label'], test_calc['predicted'])
print(conf_df)

# Classification Report
metric_report = classification_report(test_calc['fruit_label'], test_calc['predicted'])
print(metric_report)


(41, 4) (18, 4) (41,) (18,)
Accuracy of Random Forest classifier on training set: 1.0
Accuracy of Random Forest classifier on test set: 0.8888888888888888
[[5 0 0 1]
 [0 1 0 0]
 [0 0 5 1]
 [0 0 0 5]]
              precision    recall  f1-score   support

           1       1.00      0.83      0.91         6
           2       1.00      1.00      1.00         1
           3       1.00      0.83      0.91         6
           4       0.71      1.00      0.83         5

    accuracy                           0.89        18
   macro avg       0.93      0.92      0.91        18
weighted avg       0.92      0.89      0.89        18



#How to consider over fitting and under fitting? what are the margins

#Reagrding Parameters