# Classify Raisins with Hyperparameter Tuning Project


### 1. Explore the Dataset

In [1]:
# 1. Setup
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

raisins = pd.read_csv('Raisin_Dataset.csv')
raisins.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,0
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,0
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,0
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,0
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,0


In [2]:
# 2. Create predictor and target variables, X and y
X = raisins.drop(columns = ['Class'])
y = raisins['Class']


In [3]:
# 3. Examine the dataset
print("Number of features:", len(X.columns))
print("Number of samples:", len(y))
print("Number of samples of Class 1:",y.sum())



Number of features: 7
Number of samples: 900
Number of samples of Class 1: 450


In [4]:
# 4. Split the data set into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=19)

### 2. Grid Search with Decision Tree Classifier

In [5]:
# 5. Create a Decision Tree model
tree = DecisionTreeClassifier()


In [6]:
# 6. Dictionary of parameters for GridSearchCV
parameters = {'min_samples_split': [2,3,4], 'max_depth': [3,5,7]}


In [7]:
# 7. Create a GridSearchCV model
grid = GridSearchCV(estimator=tree, param_grid=parameters)


#Fit the GridSearchCV model to the training data
grid.fit(X_train, y_train)



In [8]:
# 8. Print the model and hyperparameters obtained by GridSearchCV
print("Best model:", grid.best_estimator_)

# Print best score
print("Best score:", grid.best_score_)

# Print the accuracy of the final model on the test data
print("Accuracy on test data:", grid.score(X_test, y_test))


Best model: DecisionTreeClassifier(max_depth=5, min_samples_split=4)
Best score: 0.8681481481481482
Accuracy on test data: 0.8133333333333334


In [9]:
# 9. Print a table summarizing the results of GridSearchCV
hyperparameter_grid = pd.DataFrame(grid.cv_results_['params'])
grid_scores = pd.DataFrame(grid.cv_results_['mean_test_score'], columns=['score'])

df = pd.concat([hyperparameter_grid, grid_scores], axis = 1)
print(df)


   max_depth  min_samples_split     score
0          3                  2  0.862222
1          3                  3  0.862222
2          3                  4  0.859259
3          5                  2  0.866667
4          5                  3  0.863704
5          5                  4  0.868148
6          7                  2  0.851852
7          7                  3  0.844444
8          7                  4  0.847407


### 2. Random Search with Logistic Regression

In [10]:
# 10. The logistic regression model
lr = LogisticRegression(solver='liblinear', max_iter=1000)

In [11]:
# 11. Define distributions to choose hyperparameters from
from scipy.stats import uniform
distributions = {'penalty': ['l1', 'l2'], 'C': uniform(loc=0, scale=100)}


In [12]:
# 12. Create a RandomizedSearchCV model
clf = RandomizedSearchCV(estimator=lr, param_distributions=distributions, n_iter=8)

# Fit the random search model
clf.fit(X_train, y_train)

In [13]:
# 13. Print best esimatore and best score
print("Best model:", clf.best_estimator_)
print("Best score:", clf.best_score_)
# Print the accuracy of the final model on the test data
print("Accuracy on test data:", clf.score(X_test, y_test))

#Print a table summarizing the results of RandomSearchCV
hyperparameter_values = pd.DataFrame(clf.cv_results_['params'])
randomsearch_scores = pd.DataFrame(clf.cv_results_['mean_test_score'], columns=['score'])

df = pd.concat([hyperparameter_values, randomsearch_scores], axis = 1)
print(df)


Best model: LogisticRegression(C=15.956652420574136, max_iter=1000, penalty='l1',
                   solver='liblinear')
Best score: 0.8755555555555556
Accuracy on test data: 0.88
           C penalty     score
0  31.544765      l2  0.874074
1  15.956652      l1  0.875556
2  26.780675      l1  0.875556
3  31.959526      l1  0.875556
4  24.863979      l2  0.875556
5  54.311992      l1  0.874074
6  64.249220      l2  0.875556
7  37.505319      l2  0.875556
