# Drought Multiclassification SVM

#### Author: Thomas Kelly

I used a multiclass SVM (Support Vector Machine) to predict the drought score (the target, y), using T2M_RANGE (temperature range at 2 meters) and PS (surface pressure) as the input features (X).

In [None]:
# Importing the necessary packages and libraries
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import svm, datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import utils
import pickle
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
# Load the soil dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="soil_dataset.csv")

# Optional: uncomment the next line to work on a random 40,000-row subset for faster runs
#df = df.sample(n = 40000)

In [None]:
# Making score into a multiclass classification with scores from 0 - 5

# Dropping rows that have no score.
# The rows must be removed from the frame itself: assigning
# `df["score"].dropna()` back to the column re-aligns on the index, which
# silently restores the NaN entries and keeps the column as float.
df = df.dropna(subset=["score"]).copy()

# With no NaN left, the rounded score can really be stored as an integer class label
df["score"] = df["score"].round().astype('int')

# T2M_RANGE is kept for every remaining row: cast to float, then let
# np.nan_to_num replace NaN with 0.0 (and +/-inf with large finite values)
df["T2M_RANGE"] = np.nan_to_num(df["T2M_RANGE"].astype('float64'))

In [None]:
# Feature matrix X (PS and T2M_RANGE columns) and target vector y (score column)
feature_columns = ["PS", "T2M_RANGE"]
X = df[feature_columns]
y = df["score"]

In [None]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
)

In [None]:
# Comparing the four SVM kernels across several test-set sizes
## If you already know the most accurate kernel and test size, you can comment this cell out

# Fraction of rows held out for testing on each pass (5%, 10%, 15%, 20%).
# NOTE(review): the first entry used to be 0.5, which broke the 5-point
# progression and looks like a typo for 0.05 - confirm against the intended experiment.
test_sizes = [0.05, 0.10, 0.15, 0.20]

# Short kernel tags used in the exported model file names
kernels = ["lin","poly","rbf","sig"]


def _drop_incomplete_rows(features, labels):
    """Return (features, labels) keeping only rows where neither has a NaN.

    One shared mask is applied to both objects so they keep the same rows;
    calling ``dropna()`` on each separately can leave them with different
    lengths, which makes ``fit``/``score`` fail.
    """
    complete = features.notna().all(axis=1) & labels.notna()
    return features[complete].astype('float64'), labels[complete].astype('int')


for size_val in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = size_val, random_state = 0)

    # Clean both splits: the SVMs cannot fit, predict or score on NaN input
    X_train, y_train = _drop_incomplete_rows(X_train, y_train)
    X_test, y_test = _drop_incomplete_rows(X_test, y_test)

    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)

    # One one-vs-one SVC per kernel, all with C=1
    linear = svm.SVC(kernel="linear", C=1, decision_function_shape="ovo").fit(X_train, y_train)
    rbf = svm.SVC(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo').fit(X_train, y_train)
    poly = svm.SVC(kernel='poly', degree=3, C=1, decision_function_shape='ovo').fit(X_train, y_train)
    sig = svm.SVC(kernel='sigmoid', C=1, decision_function_shape='ovo').fit(X_train, y_train)

    linear_pred = linear.predict(X_test)
    poly_pred = poly.predict(X_test)
    rbf_pred = rbf.predict(X_test)
    sig_pred = sig.predict(X_test)

    accuracy_lin = linear.score(X_test, y_test)
    accuracy_poly = poly.score(X_test, y_test)
    accuracy_rbf = rbf.score(X_test, y_test)
    accuracy_sig = sig.score(X_test, y_test)

    # size_val is a fraction, so format it as a percentage (0.15 -> "15%")
    print(f"Test Size = {size_val:.0%}")
    print("Accuracy Linear Kernel:", accuracy_lin)
    print("Accuracy Polynomial Kernel:", accuracy_poly)
    print("Accuracy Radial Basis Kernel:", accuracy_rbf)
    print("Accuracy Sigmoid Kernel:", accuracy_sig)
    print("\n\n")

    # Plotting confusion matrices for each model
    cm_lin = confusion_matrix(y_test, linear_pred)
    cm_poly = confusion_matrix(y_test, poly_pred)
    cm_rbf = confusion_matrix(y_test, rbf_pred)
    cm_sig = confusion_matrix(y_test, sig_pred)

    # Give every heatmap its own axes; calling sns.heatmap repeatedly without
    # `ax` draws them all on top of each other on the current axes
    matrices = {
        "Linear": cm_lin,
        "Polynomial": cm_poly,
        "Radial Basis": cm_rbf,
        "Sigmoid": cm_sig,
    }
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    fig.suptitle(f"Confusion matrices (test size = {size_val:.0%})")
    for ax, (title, matrix) in zip(axes.flat, matrices.items()):
        sns.heatmap(matrix, annot = True, fmt = "d", ax = ax)
        ax.set_title(title)
    plt.show()

    # Exporting each model so we dont lose them
    models = {"lin": linear, "poly": poly, "rbf": rbf, "sig": sig}
    for kernel in kernels:
        with open(f"soil_svm_v2_test_{size_val}_{kernel}", "wb") as f:
            pickle.dump(models[kernel], f)

In [None]:
# Re-create the 85% / 15% train/test split (seed 0) used for the final model
split_parts = train_test_split(X, y, test_size=0.15, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Dropping any straggling NaN values from the splits (this was needed for the model to run).
# A single shared mask is applied to the features and the labels so both keep
# exactly the same rows; calling dropna() on each one separately can leave
# X_train and y_train with different lengths, which makes fit() fail.
train_complete = X_train.notna().all(axis=1) & y_train.notna()
X_train = X_train[train_complete].astype('float64')
y_train = y_train[train_complete].astype('int')

# Same clean-up for the test split, since predict()/score() also reject NaN input
test_complete = X_test.notna().all(axis=1) & y_test.notna()
X_test = X_test[test_complete].astype('float64')
y_test = y_test[test_complete].astype('int')

In [None]:
# Encode the score labels as consecutive integer class ids.
# NOTE(review): `encoded` is not used by any later cell in this notebook;
# the model below is trained on y_train directly - confirm whether this cell is still needed.
lab_enc = preprocessing.LabelEncoder()
encoded = lab_enc.fit(y).transform(y)

In [None]:
# Final model: the RBF kernel gave the best accuracy in the comparison runs
rbf_settings = dict(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo')
rbf = svm.SVC(**rbf_settings).fit(X_train, y_train)

In [None]:
# Predict a drought score class for every row of the held-out test features
rbf_pred = rbf.predict(X=X_test)

In [None]:
# Viewing accuracy of the model: score() returns the mean accuracy on the test split
accuracy_rbf = rbf.score(X_test, y_test)
# The model being scored is the RBF one, so label it as such
# (same wording as the kernel-comparison cell)
print("Accuracy Radial Basis Kernel:", accuracy_rbf)

In [None]:
# Confusion matrix of true vs. predicted scores, drawn as an annotated heatmap
cm_rbf = confusion_matrix(y_true=y_test, y_pred=rbf_pred)
sns.heatmap(data=cm_rbf, annot=True)

In [None]:
# Getting precision, recall, accuracy and f1 scores from y_test and rbf_pred.
# The score is a multiclass label (0 - 5), so an averaging strategy must be
# given explicitly: the default average='binary' raises a ValueError for
# multiclass targets. "macro" is the unweighted mean of the per-class scores;
# switch to "weighted" to weight each class by its number of test samples.
averaging = "macro"
print('Precision: %.3f' % precision_score(y_test, rbf_pred, average=averaging))
print('Recall: %.3f' % recall_score(y_test, rbf_pred, average=averaging))
print('Accuracy: %.3f' % accuracy_score(y_test, rbf_pred))
print('F1 Score: %.3f' % f1_score(y_test, rbf_pred, average=averaging))

In [None]:
# Persist the fitted RBF model to disk so it can be reloaded without retraining
with open("soil_svm_rbf_final", "wb") as model_file:
    pickle.dump(rbf, model_file)