# COMP5318 Assignment 1: Rice Classification

##### Group number: A1 group-set2 159
##### Student 1 SID: 540825875
##### Student 2 SID: 530683980

In [1]:
# Import all libraries
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold,train_test_split,GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Silence FutureWarning messages emitted by the libraries used below
from warnings import simplefilter
simplefilter('ignore', FutureWarning)

In [3]:
# Read rice-final2.csv (comma-separated, first line skipped as the header)
rice_dataset = np.genfromtxt(
    'rice-final2.csv',
    delimiter=',',
    skip_header=1,
    dtype=None,
    encoding=None,
)
# Preview the first five rows
print(rice_dataset[0:5])

[['12573' '461.4660034' '192.9033508' '84.57207489' '0.898771763' '12893'
  '0.550433397' 'class2']
 ['12845' '464.1210022' '194.3322144' '85.52433777' '0.897951961' '13125'
  '0.774962306' 'class2']
 ['14055' '488.7489929' '207.7517548' '87.25032806' '0.907536149' '14484'
  '0.550076306' 'class1']
 ['14412' '490.3240051' '207.4761353' '89.68951416' '0.901735425' '14703'
  '0.598853171' 'class1']
 ['14658' '477.1170044' '189.5666351' '99.99777985' '0.849550545' '15048'
  '0.649503708' 'class2']]


In [4]:
# Pre-process dataset

# Encode the class labels as numeric strings and mark missing entries ('?').
# The array still holds strings at this point, so the missing marker is
# written as the string 'nan' rather than the float np.nan; this avoids
# relying on NumPy implicitly promoting a float into a string array.
rice_dataset = np.where(rice_dataset == 'class1', '0', rice_dataset)
rice_dataset = np.where(rice_dataset == 'class2', '1', rice_dataset)
rice_dataset = np.where(rice_dataset == '?', 'nan', rice_dataset)

# Convert to floats explicitly ('nan' parses to np.nan) instead of leaving
# the string-to-number coercion to SimpleImputer.
rice_dataset = rice_dataset.astype(float)

# Replace missing values with the column means.
# NOTE: the imputer and the scaler below are applied to every column,
# including the encoded class column in the last position.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
rice_dataset = imp_mean.fit_transform(rice_dataset)

# Min-max normalization with the scaler's default settings
scaler = MinMaxScaler()
rice_dataset = scaler.fit_transform(rice_dataset)

In [5]:
# Print first ten rows of pre-processed dataset to 4 decimal places as per assignment spec
# A function is provided to assist

def print_data(X, y, n_rows=10):
    """Print the first ``n_rows`` examples as comma-separated rows.

    Each feature is formatted to 4 decimal places and followed by a comma;
    the target value is printed last, unformatted. If fewer than ``n_rows``
    examples are available, all of them are printed.

    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples)
        n_rows: maximum number of rows to print
    """
    last_index = len(X) - 1
    # Clamp the row count so that asking for more rows than exist prints
    # everything instead of raising IndexError.
    for example_num in range(min(n_rows, len(X))):
        for feature in X[example_num]:
            print("{:.4f}".format(feature), end=",")

        # The very last example of X is printed without a trailing newline.
        if example_num == last_index:
            print(y[example_num], end="")
        else:
            print(y[example_num])

# Separate the class column (last) from the feature columns (all the others)
y = rice_dataset[:, -1]
X = rice_dataset[:, :-1]

print_data(X, y)


0.4628,0.5406,0.5113,0.4803,0.7380,0.4699,0.1196,1.0
0.4900,0.5547,0.5266,0.5018,0.7319,0.4926,0.8030,1.0
0.6109,0.6847,0.6707,0.5409,0.8032,0.6253,0.1185,0.0
0.6466,0.6930,0.6677,0.5961,0.7601,0.6467,0.2669,0.0
0.6712,0.6233,0.4755,0.8293,0.3721,0.6803,0.4211,1.0
0.2634,0.2932,0.2414,0.4127,0.5521,0.2752,0.2825,1.0
0.8175,0.9501,0.9515,0.5925,0.9245,0.8162,0.0000,0.0
0.3174,0.3588,0.3601,0.3908,0.6921,0.3261,0.8510,1.0
0.3130,0.3050,0.2150,0.5189,0.3974,0.3159,0.4570,1.0
0.5120,0.5237,0.4409,0.6235,0.5460,0.5111,0.3155,1.0


### Part 1: Cross-validation without parameter tuning

In [6]:
## Stratified 10-fold cross-validation splitter (shuffled, fixed seed)
cvKFold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

# The stratified folds from cvKFold should be provided to the classifiers

In [7]:
# Logistic Regression
def logregClassifier(X, y):
    """Return the mean cross-validation accuracy of logistic regression.

    Arguments:
        X: feature array, one row per example
        y: target array, one entry per example

    Returns:
        Mean accuracy over the folds produced by the module-level cvKFold.
    """
    clf = LogisticRegression(random_state=0)
    # cross_val_score performs the split / fit / predict / score cycle on
    # every fold, replacing the hand-written loop. error_score="raise" keeps
    # a failing fit raising instead of being recorded as a NaN score.
    scores = cross_val_score(
        clf, X, y, scoring="accuracy", cv=cvKFold, error_score="raise"
    )
    return scores.mean()

In [8]:
#Naïve Bayes
def nbClassifier(X, y):
    """Return the mean cross-validation accuracy of Gaussian Naive Bayes.

    Arguments:
        X: feature array, one row per example
        y: target array, one entry per example

    Returns:
        Mean accuracy over the folds produced by the module-level cvKFold.
    """
    clf = GaussianNB()
    # cross_val_score performs the split / fit / predict / score cycle on
    # every fold, replacing the hand-written loop. error_score="raise" keeps
    # a failing fit raising instead of being recorded as a NaN score.
    scores = cross_val_score(
        clf, X, y, scoring="accuracy", cv=cvKFold, error_score="raise"
    )
    return scores.mean()

In [9]:
# Decision Tree
def dtClassifier(X, y):
    """Return the mean cross-validation accuracy of an entropy decision tree.

    Arguments:
        X: feature array, one row per example
        y: target array, one entry per example

    Returns:
        Mean accuracy over the folds produced by the module-level cvKFold.
    """
    clf = DecisionTreeClassifier(criterion="entropy", random_state=0)
    # cross_val_score performs the split / fit / predict / score cycle on
    # every fold, replacing the hand-written loop. error_score="raise" keeps
    # a failing fit raising instead of being recorded as a NaN score.
    scores = cross_val_score(
        clf, X, y, scoring="accuracy", cv=cvKFold, error_score="raise"
    )
    return scores.mean()

In [10]:
# Ensembles: Bagging, Ada Boost and Gradient Boosting
def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    """Return the mean cross-validation accuracy of bagged decision trees.

    Arguments:
        X: feature array, one row per example
        y: target array, one entry per example
        n_estimators: passed to BaggingClassifier as n_estimators
        max_samples: passed to BaggingClassifier as max_samples
        max_depth: max_depth of the entropy decision tree used as base estimator

    Returns:
        Mean accuracy over the folds produced by the module-level cvKFold.
    """
    base_tree = DecisionTreeClassifier(
        criterion="entropy", max_depth=max_depth, random_state=0
    )
    clf = BaggingClassifier(
        estimator=base_tree,
        n_estimators=n_estimators,
        max_samples=max_samples,
        random_state=0,
    )
    # cross_val_score performs the split / fit / predict / score cycle on
    # every fold, replacing the hand-written loop. error_score="raise" keeps
    # a failing fit raising instead of being recorded as a NaN score.
    scores = cross_val_score(
        clf, X, y, scoring="accuracy", cv=cvKFold, error_score="raise"
    )
    return scores.mean()

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    """Return the mean cross-validation accuracy of AdaBoost over decision trees.

    Arguments:
        X: feature array, one row per example
        y: target array, one entry per example
        n_estimators: passed to AdaBoostClassifier as n_estimators
        learning_rate: passed to AdaBoostClassifier as learning_rate
        max_depth: max_depth of the entropy decision tree used as base estimator

    Returns:
        Mean accuracy over the folds produced by the module-level cvKFold.
    """
    base_tree = DecisionTreeClassifier(
        criterion="entropy", max_depth=max_depth, random_state=0
    )
    clf = AdaBoostClassifier(
        estimator=base_tree,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0,
    )
    # cross_val_score performs the split / fit / predict / score cycle on
    # every fold, replacing the hand-written loop. error_score="raise" keeps
    # a failing fit raising instead of being recorded as a NaN score.
    scores = cross_val_score(
        clf, X, y, scoring="accuracy", cv=cvKFold, error_score="raise"
    )
    return scores.mean()

def gbClassifier(X, y, n_estimators, learning_rate):
    """Return the mean cross-validation accuracy of gradient boosting.

    Arguments:
        X: feature array, one row per example
        y: target array, one entry per example
        n_estimators: passed to GradientBoostingClassifier as n_estimators
        learning_rate: passed to GradientBoostingClassifier as learning_rate

    Returns:
        Mean accuracy over the folds produced by the module-level cvKFold.
    """
    clf = GradientBoostingClassifier(
        n_estimators=n_estimators, learning_rate=learning_rate, random_state=0
    )
    # cross_val_score performs the split / fit / predict / score cycle on
    # every fold, replacing the hand-written loop. error_score="raise" keeps
    # a failing fit raising instead of being recorded as a NaN score.
    scores = cross_val_score(
        clf, X, y, scoring="accuracy", cv=cvKFold, error_score="raise"
    )
    return scores.mean()

### Part 1 Results

In [11]:
# Parameters for Part 1:

# Bagging
bag_n_estimators, bag_max_samples, bag_max_depth = 50, 100, 5

# AdaBoost
ada_n_estimators, ada_learning_rate, ada_bag_max_depth = 50, 0.5, 5

# GB
gb_n_estimators, gb_learning_rate = 50, 0.5

# Print results for each classifier in part 1 to 4 decimal places here:
# Each entry pairs an output label with a deferred call, so the classifiers
# are still evaluated one at a time, in the order they are reported.
part1_runs = [
    ("LogR average cross-validation accuracy: ",
     lambda: logregClassifier(X, y)),
    ("NB average cross-validation accuracy: ",
     lambda: nbClassifier(X, y)),
    ("DT average cross-validation accuracy: ",
     lambda: dtClassifier(X, y)),
    ("Bagging average cross-validation accuracy: ",
     lambda: bagDTClassifier(X, y, bag_n_estimators, bag_max_samples, bag_max_depth)),
    ("AdaBoost average cross-validation accuracy: ",
     lambda: adaDTClassifier(X, y, ada_n_estimators, ada_learning_rate, ada_bag_max_depth)),
    ("GB average cross-validation accuracy: ",
     lambda: gbClassifier(X, y, gb_n_estimators, gb_learning_rate)),
]
for label, run in part1_runs:
    print(label, f"{run():.4f}")

LogR average cross-validation accuracy:  0.9386
NB average cross-validation accuracy:  0.9264
DT average cross-validation accuracy:  0.9179
Bagging average cross-validation accuracy:  0.9414
AdaBoost average cross-validation accuracy:  0.9250
GB average cross-validation accuracy:  0.9300


### Part 2: Cross-validation with parameter tuning

In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [13]:
# KNN
# Candidate values explored by the grid search
k = [1, 3, 5, 7]
p = [1, 2]

def bestKNNClassifier(X, y):
    """Grid-search a KNN classifier and report its best settings and scores.

    The data is split once (stratified, random_state=0); the grid search runs
    on the training part only, using stratified 10-fold cross-validation.

    Returns:
        Tuple of (best n_neighbors, best p, best cross-validation accuracy,
        accuracy on the held-out test part).
    """
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

    search = GridSearchCV(
        KNeighborsClassifier(),
        {'n_neighbors': k, 'p': p},
        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=0),
        scoring='accuracy',
    )
    # Every parameter combination is evaluated on the training part
    search.fit(X_tr, y_tr)

    chosen = search.best_params_
    return chosen['n_neighbors'], chosen['p'], search.best_score_, search.score(X_te, y_te)


In [14]:
# SVM
# You should use SVC from sklearn.svm with kernel set to 'rbf'
# Candidate values explored by the grid search
C = [0.01, 0.1, 1, 5]
gamma = [0.01, 0.1, 1, 10]

def bestSVMClassifier(X, y):
    """Grid-search an RBF-kernel SVM and report its best settings and scores.

    The data is split once (stratified, random_state=0); the grid search runs
    on the training part only, using stratified 10-fold cross-validation.

    Returns:
        Tuple of (best C, best gamma, best cross-validation accuracy,
        accuracy on the held-out test part).
    """
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

    search = GridSearchCV(
        SVC(kernel='rbf', random_state=0),
        {'C': C, 'gamma': gamma},
        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=0),
        scoring='accuracy',
    )
    # Every parameter combination is evaluated on the training part
    search.fit(X_tr, y_tr)

    chosen = search.best_params_
    return chosen['C'], chosen['gamma'], search.best_score_, search.score(X_te, y_te)

In [15]:
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to ‘sqrt’.
# Candidate values explored by the grid search
n_estimators = [10, 30, 60, 100]
max_leaf_nodes = [6, 12]

def bestRFClassifier(X, y):
    """Grid-search a random forest and report its best settings and scores.

    The data is split once (stratified, random_state=0); the grid search runs
    on the training part only, using stratified 10-fold cross-validation.

    Returns:
        Tuple of (best n_estimators, best max_leaf_nodes, best
        cross-validation accuracy, test accuracy, test macro-average F1,
        test weighted-average F1).
    """
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

    forest = RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=0)
    search = GridSearchCV(
        forest,
        {'n_estimators': n_estimators, 'max_leaf_nodes': max_leaf_nodes},
        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=0),
        scoring='accuracy',
    )
    search.fit(X_tr, y_tr)

    chosen = search.best_params_
    cv_accuracy = search.best_score_
    test_accuracy = search.score(X_te, y_te)

    # F1 scores are computed from the test-part predictions
    y_hat = search.predict(X_te)
    f1_by_average = [f1_score(y_te, y_hat, average=mode) for mode in ('macro', 'weighted')]

    return (
        chosen['n_estimators'],
        chosen['max_leaf_nodes'],
        cv_accuracy,
        test_accuracy,
        f1_by_average[0],
        f1_by_average[1],
    )


### Part 2: Results

In [16]:
# Perform Grid Search with 10-fold stratified cross-validation (GridSearchCV in sklearn). 
# The stratified folds from cvKFold should be provided to GridSearchCV

# This should include using train_test_split from sklearn.model_selection with stratification and random_state=0
# Print results for each classifier here. All results should be printed to 4 decimal places except for
# "k", "p", n_estimators" and "max_leaf_nodes" which should be printed as integers.


# Run the three tuned classifiers
best_k, best_p, knn_cv, knn_test = bestKNNClassifier(X, y)
best_C, best_gamma, svm_cv, svm_test = bestSVMClassifier(X, y)
best_n_est, best_max_leaf, rf_cv, rf_test, rf_f1_macro, rf_f1_weighted = bestRFClassifier(X, y)

# One group of (template, value) pairs per classifier; groups are separated
# by a blank line in the output.
part2_report = [
    [
        ("KNN best k: {}", best_k),
        ("KNN best p: {}", best_p),
        ("KNN cross-validation accuracy: {:.4f}", knn_cv),
        ("KNN test set accuracy: {:.4f}", knn_test),
    ],
    [
        ("SVM best C: {:.4f}", best_C),
        ("SVM best gamma: {:.4f}", best_gamma),
        ("SVM cross-validation accuracy: {:.4f}", svm_cv),
        ("SVM test set accuracy: {:.4f}", svm_test),
    ],
    [
        ("RF best n_estimators: {}", best_n_est),
        ("RF best max_leaf_nodes: {}", best_max_leaf),
        ("RF cross-validation accuracy: {:.4f}", rf_cv),
        ("RF test set accuracy: {:.4f}", rf_test),
        ("RF test set macro average F1: {:.4f}", rf_f1_macro),
        ("RF test set weighted average F1: {:.4f}", rf_f1_weighted),
    ],
]
for group_num, group in enumerate(part2_report):
    if group_num:
        print()
    for template, value in group:
        print(template.format(value))

KNN best k: 5
KNN best p: 1
KNN cross-validation accuracy: 0.9371
KNN test set accuracy: 0.9257

SVM best C: 5.0000
SVM best gamma: 1.0000
SVM cross-validation accuracy: 0.9457
SVM test set accuracy: 0.9343

RF best n_estimators: 30
RF best max_leaf_nodes: 12
RF cross-validation accuracy: 0.9390
RF test set accuracy: 0.9371
RF test set macro average F1: 0.9355
RF test set weighted average F1: 0.9370


### Part 3: Reflection

##### Write one paragraph describing the most important thing that you have learned throughout this assignment.

##### Student 1: In this assignment, I learned that when using classification algorithms provided as library classifiers, the most complicated computational details are already taken care of. Instead, the essential tasks for the user are to preprocess the data (handling missing values, normalizing, and encoding the class) and to choose and adjust the parameters at the core of each algorithm. The routine to follow is: import the classifier, split the data into training and test sets, train the classifier, make predictions, and finally score the accuracy of the model. In addition, using `def` to encapsulate this routine in a function is very useful, as it provides a clear structure that is easy to reuse. Replacing the class values first also avoids interfering with the normalization process and the handling of missing values. Finally, it is better to slice the array relative to its shape rather than hard-coding specific columns and rows, so that the code can accommodate different datasets.

##### Student 2: In this assignment, the most significant thing I learned is how hyperparameter tuning helps to improve model performance. I learned how to use GridSearchCV and StratifiedKFold. Stratified cross-validation helps to address class imbalance, while GridSearchCV automates the comparison of multiple parameter combinations and selects the best one. Comparing the results of the three models, Random Forest achieved the best test set accuracy (0.9371), although SVM had the highest cross-validation accuracy (0.9457). I think the reason may be that all the attributes of the dataset are numerical, and Random Forest, being an ensemble of decision trees, is less sensitive to the size of the data than the other two methods, so it performs best. Additionally, while writing the code, I found that encapsulating each model in its own function, all following the same structure, improves the logic and tidiness of the code. I should develop the good habit of encapsulating code in functions.

