# COMP5318 Assignment 1: Rice Classification

##### Group number: ...
##### Student 1 SID: ...
##### Student 2 SID: ...  
##### Student 3 SID: ... 
##### Student 4 SID: ... 

In [41]:
# Import all libraries
from sklearn.model_selection import StratifiedKFold

In [42]:
# Ignore future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [43]:
# Load the rice dataset: rice-final2.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from scipy import signal
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer

pd.set_option('display.max_columns', None)
df = pd.read_csv('rice-final2.csv')


In [44]:
# Pre-process dataset
imputer =SimpleImputer(missing_values=np.nan, strategy='mean')
df = df.apply(pd.to_numeric, errors='coerce')
df = df.dropna(axis=1, how='all')
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df1 = pd.read_csv('rice-final2.csv', usecols=['class'])
df1['class'] = df1['class'].replace({"class1": "0", "class2": "1"})
df = pd.concat([df, df1], axis=1)
df.loc[:, df.columns != 'class'] = df.loc[:, df.columns != 'class'].round(4)
df.to_csv("processed_rice.csv", index=False)

df.head()

Unnamed: 0,Area,Perimiter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,class
0,0.4628,0.5406,0.5113,0.4803,0.738,0.4699,0.1196,1
1,0.49,0.5547,0.5266,0.5018,0.7319,0.4926,0.803,1
2,0.6109,0.6847,0.6707,0.5409,0.8032,0.6253,0.1185,0
3,0.6466,0.693,0.6677,0.5961,0.7601,0.6467,0.2669,0
4,0.6712,0.6233,0.4755,0.8293,0.3721,0.6803,0.4211,1


In [45]:
# Print first ten rows of pre-processed dataset to 4 decimal places as per assignment spec
# A function is provided to assist

def print_data(X, y, n_rows=10):
    """Takes a numpy data array and target and prints the first ten rows.
    
    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples)
        n_rows: numpy of rows to print
    """
    for example_num in range(n_rows):
        for feature in X[example_num]:
            print("{:.4f}".format(feature), end=",")

        if example_num == len(X)-1:
            print(y[example_num],end="")
        else:
            print(y[example_num])
            


In [46]:
X = df.drop(columns=['class']).values
y = df['class'].values
print_data(X,y,10)

0.4628,0.5406,0.5113,0.4803,0.7380,0.4699,0.1196,1
0.4900,0.5547,0.5266,0.5018,0.7319,0.4926,0.8030,1
0.6109,0.6847,0.6707,0.5409,0.8032,0.6253,0.1185,0
0.6466,0.6930,0.6677,0.5961,0.7601,0.6467,0.2669,0
0.6712,0.6233,0.4755,0.8293,0.3721,0.6803,0.4211,1
0.2634,0.2932,0.2414,0.4127,0.5521,0.2752,0.2825,1
0.8175,0.9501,0.9515,0.5925,0.9245,0.8162,0.0000,0
0.3174,0.3588,0.3601,0.3908,0.6921,0.3261,0.8510,1
0.3130,0.3050,0.2150,0.5189,0.3974,0.3159,0.4570,1
0.5120,0.5237,0.4409,0.6235,0.5460,0.5111,0.3155,1


### Part 1: Cross-validation without parameter tuning

In [47]:
## Setting the 10 fold stratified cross-validation
cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# The stratified folds from cvKFold should be provided to the classifiers

In [48]:
# Logistic Regression
def logregClassifier(X, y):

    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score, StratifiedKFold

    clf = LogisticRegression(random_state=0, max_iter=1000)
    scores = cross_val_score(clf, X, y, cv=cvKFold)
   
    return scores.mean()

In [49]:
#Naïve Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, cross_val_score


def nbClassifier(X, y):
    model = GaussianNB()
    scores = cross_val_score(model, X, y, cv=cvKFold)
    
    return scores.mean()

In [50]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
def dtClassifier(X, y):
    clf = DecisionTreeClassifier(criterion="entropy", random_state=0)
    scores = cross_val_score(clf, X, y, cv=cvKFold)
    
    return scores.mean()

In [51]:
# Ensembles: Bagging, Ada Boost and Gradient Boosting
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    base_dt = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, random_state=0)
    bagging = BaggingClassifier(
        estimator=base_dt,
        n_estimators=n_estimators,
        max_samples=max_samples,
        random_state=0
    )
    scores = cross_val_score(bagging, X, y, cv=cvKFold)
    return scores.mean()

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    base_dt = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, random_state=0)
    ada = AdaBoostClassifier(
        estimator=base_dt,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0
    )
    scores = cross_val_score(ada, X, y, cv=cvKFold)
    return scores.mean()

def gbClassifier(X, y, n_estimators, learning_rate):
    gb = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0
    )
    scores = cross_val_score(gb, X, y, cv=cvKFold)
    return scores.mean()

### Part 1 Results

In [52]:
# Parameters for Part 1:

#NB
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
naive_score = nbClassifier(X, y)

#Bagging
bag_n_estimators = 50
bag_max_samples = 100
bag_max_depth = 5

bag_score = bagDTClassifier(X, y, bag_n_estimators, bag_max_samples, bag_max_depth)

#AdaBoost
ada_n_estimators = 50
ada_learning_rate = 0.5
ada_bag_max_depth = 5

ada_score = adaDTClassifier(X, y, ada_n_estimators, ada_learning_rate, ada_bag_max_depth)

#GB
gb_n_estimators = 50
gb_learning_rate = 0.5

gb_score = gbClassifier(X, y, gb_n_estimators, gb_learning_rate)

#logR
lr_mean = logregClassifier(X, y)

#DT
dt_mean = dtClassifier(X, y)

# Print results for each classifier in part 1 to 4 decimal places here:
print("LogR average cross-validation accuracy: ",lr_mean)
print("NB average cross-validation accuracy: ",naive_score)
print("DT average cross-validation accuracy: ",dt_mean)
print("Bagging average cross-validation accuracy: ",bag_score)
print("AdaBoost average cross-validation accuracy: ",ada_score)
print("GB average cross-validation accuracy: ",gb_score)

LogR average cross-validation accuracy:  0.9385714285714284
NB average cross-validation accuracy:  0.9264285714285714
DT average cross-validation accuracy:  0.9178571428571429
Bagging average cross-validation accuracy:  0.9400000000000001
AdaBoost average cross-validation accuracy:  0.9328571428571429
GB average cross-validation accuracy:  0.9299999999999999


### Part 2: Cross-validation with parameter tuning

In [55]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score

k = [1, 3, 5, 7]
p = [1, 2]

def bestKNNClassifier(X, y):
    # use global/public k and p defined above
    param_grid = {'n_neighbors': k, 'p': p}

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0, stratify=y
    )

    gs = GridSearchCV(KNeighborsClassifier(), param_grid,
                      cv=10, scoring='accuracy')
    gs.fit(X_train, y_train)

    best_k = gs.best_params_['n_neighbors']
    best_p = gs.best_params_['p']
    cv_acc = gs.best_score_

    test_acc = accuracy_score(y_test, gs.best_estimator_.predict(X_test))

    return best_k, best_p, cv_acc, test_acc

In [56]:
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to ‘sqrt’.
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
n_estimators = [10, 30, 60, 100]
max_leaf_nodes = [6, 12]

def bestRFClassifier(X, y):
# ===== Split data into train/test with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=0
    )

    # ====== Stratified 10-fold cross-validation
    cvKFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

    # ===== Base Random Forest model 
    rf = RandomForestClassifier(
        criterion="entropy", # use information gain
        max_features="sqrt", # feature selection strategy
        random_state=0,
    )

    # ====== Parameter grid 
    param_grid = {
        "n_estimators": n_estimators,  
        "max_leaf_nodes": max_leaf_nodes,   
    }

    # ===== Grid search with cross-validation 
    gscv = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        scoring="accuracy",
        cv=cvKFold,
        n_jobs=1,      
        refit=True,
        return_train_score=False,
    )
    gscv.fit(X_train, y_train)

    # ===== Get best model and CV score 
    best_model = gscv.best_estimator_
    best_params = gscv.best_params_
    best_cv_acc = float(gscv.best_score_)

    # ====== Evaluate on the test set 
    y_pred = best_model.predict(X_test)
    test_acc = float(accuracy_score(y_test, y_pred))
    macro_f1 = float(f1_score(y_test, y_pred, average="macro"))
    weighted_f1 = float(f1_score(y_test, y_pred, average="weighted"))

    return (
        int(best_params["n_estimators"]),
        int(best_params["max_leaf_nodes"]),
        best_cv_acc,
        test_acc,
        macro_f1,
        weighted_f1
    )


### Part 2: Results

In [58]:
# Perform Grid Search with 10-fold stratified cross-validation (GridSearchCV in sklearn). 
# The stratified folds from cvKFold should be provided to GridSearchV

# This should include using train_test_split from sklearn.model_selection with stratification and random_state=0
# Print results for each classifier here. All results should be printed to 4 decimal places except for
# "k", "p", n_estimators" and "max_leaf_nodes" which should be printed as integers.

best_k, best_p, cv_score, test_score = bestKNNClassifier(X, y)

print("KNN best k:", best_k)
print("KNN best p:", best_p)
print("KNN cross-validation accuracy:", cv_score)
print("KNN test set accuracy:", test_score)

print()

best_n, best_leaf, cv_acc, tst_acc, macro_f1, weighted_f1 = bestRFClassifier(X, y)


print("RF best n_estimators:", best_n)
print("RF best max_leaf_nodes:", best_leaf)
print("RF cross-validation accuracy:", round(cv_acc, 4))
print("RF test set accuracy:", round(tst_acc, 4))
print("RF test set macro average F1:", round(macro_f1, 4))
print("RF test set weighted average F1:", round(weighted_f1, 4))

KNN best k: 5
KNN best p: 1
KNN cross-validation accuracy: 0.9366071428571429
KNN test set accuracy: 0.9214285714285714

RF best n_estimators: 30
RF best max_leaf_nodes: 6
RF cross-validation accuracy: 0.9411
RF test set accuracy: 0.9429
RF test set macro average F1: 0.9414
RF test set weighted average F1: 0.9427
