In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Import usual libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import uniform, randint

In [4]:
# Loading data
X = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/features_1000sample_400min_600cutoff.csv")
y = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/target_1000sample_400min_600cutoff.csv")

In [5]:
# Convert one-hot encoded target array back to class labels
y = y.drop(columns="Unnamed: 0")
y_not_encoded = y.columns.get_indexer(y.idxmax(1))+1
np.unique(y_not_encoded), y_not_encoded


(array([1, 2, 3, 4, 5, 6, 7]), array([1, 1, 1, ..., 7, 7, 7]))

In [6]:
# Splitting the Data
X_train, X_test, y_train, y_test = train_test_split(X, y_not_encoded, test_size=0.2, random_state=42)

In [7]:
# Define the hyperparameter grid
param_grid = {
    'C': uniform(0.1, 10),  # Penalty parameter C (regularization parameter)
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel type
    'gamma': ['scale', 'auto'],  # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    'degree': randint(1, 10)  # Degree of the polynomial kernel function ('poly')
}

# Create an SVM classifier
svm_classifier = SVC()

# Perform random search cross-validation
random_search = RandomizedSearchCV(svm_classifier, param_distributions=param_grid, n_iter=10, scoring='accuracy', cv=5, n_jobs=-1)
random_search.fit(X, y_not_encoded)

# Best hyperparameters found
print("Best Hyperparameters:", random_search.best_params_)

# Best cross-validation score
print("Best Cross-validation Score:", random_search.best_score_)

# Evaluate the best model on test data
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Data:", accuracy)

Best Hyperparameters: {'C': 4.168032584162171, 'degree': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best Cross-validation Score: 0.8285714285714285
Accuracy on Test Data: 1.0


In [8]:
y.columns

Index(['party_Con', 'party_DUP', 'party_Lab', 'party_LibDem',
       'party_PlaidCymru', 'party_SNP', 'party_UUP'],
      dtype='object')

In [9]:
model = SVC(kernel="linear", gamma="scale", C=4.2)
model.fit(X_train, y_train)

In [10]:
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 1.0
