In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [12]:
# Load the dataset.
# The workbook is read with header=None (no row is treated as a header), so
# the column labels are attached by hand right after loading.
battle_columns = [
    "Battle",
    "Year",
    "Portuguese ships",
    "Dutch ships",
    "English ships",
    "The ratio of Portuguese to Dutch/British ships",
    "Spanish Involvement",
    "Portuguese outcome",
]

df = pd.read_excel("Portuguese_sea_battles_data.xlsx", header=None)
df.columns = battle_columns

# Preview the first rows as the cell's displayed output.
df.head()

Unnamed: 0,Battle,Year,Portuguese ships,Dutch ships,English ships,The ratio of Portuguese to Dutch/British ships,Spanish Involvement,Portuguese outcome
0,Bantam,1601,6,3,0,2.0,0,0
1,Malacca Strait,1606,14,11,0,1.273,0,0
2,Ilha das Naus,1606,6,9,0,0.667,0,-1
3,Pulo Butum,1606,7,9,0,0.778,0,1
4,Surrat,1615,6,0,4,1.5,0,0


In [3]:
# Prepare the data: X holds the predictor columns, y the battle outcome label.
feature_columns = [
    'Portuguese ships',
    'Dutch ships',
    'English ships',
    'The ratio of Portuguese to Dutch/British ships',
    'Spanish Involvement',
]
X = df[feature_columns]
y = df['Portuguese outcome']

# Split the data into training and testing sets BEFORE fitting any
# preprocessing, so the held-out rows cannot influence it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features. The scaler is fitted on the training rows only and
# then applied to the test rows; fitting it on all of X would leak test-set
# means/variances into training.
# X_train / X_test are deliberately left unscaled so the model cells that
# consume them keep their current behaviour; the standardized copies are
# exposed under separate names (as DataFrames, so .iloc and column labels
# keep working).
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=feature_columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=feature_columns, index=X_test.index
)

# Original variable name retained: all rows of X passed through the
# train-fitted scaler.
features_standardized = scaler.transform(X)

#### Use an SVM-based model to predict the Portuguese outcome of the battle from the number of ships involved on all sides and Spanish involvement

In [13]:
# Linear Kernel:
# Build the linear-kernel SVC and fit it on the training split in one step
# (fit() hands back the estimator it was called on).
lsvm_model = SVC(kernel='linear').fit(X_train, y_train)

# Score the held-out split: predicted labels first, then the three metrics.
lsvm_predictions = lsvm_model.predict(X_test)
lsvm_accuracy = accuracy_score(y_test, lsvm_predictions)
lsvm_mse = mean_squared_error(y_test, lsvm_predictions)
lsvm_classification_report = classification_report(
    y_test,
    lsvm_predictions,
    zero_division=1,
)

# Print results for model, one print call per row
for lsvm_row in (
    ("Linear SVM Results:",),
    ("Accuracy:", lsvm_accuracy),
    ("Classification Report:\n", lsvm_classification_report),
    ("Mean Squared Error:", lsvm_mse),
):
    print(*lsvm_row)

Linear SVM Results:
Accuracy: 0.3333333333333333
Classification Report:
               precision    recall  f1-score   support

          -1       0.33      0.50      0.40         2
           0       0.33      0.33      0.33         3
           1       1.00      0.00      0.00         1

    accuracy                           0.33         6
   macro avg       0.56      0.28      0.24         6
weighted avg       0.44      0.33      0.30         6

Mean Squared Error: 0.6666666666666666


In [5]:
# Train Random Forest classifier on the full training split
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate Random Forest classifier
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
# zero_division=1 matches the SVM and logistic-regression cells, so the
# per-class tables are computed the same way and no undefined-metric
# warning is emitted for classes that are never predicted.
rf_classification_report = classification_report(y_test, rf_predictions, zero_division=1)
print("Random Forest Model Accuracy:", rf_accuracy)

# Classification Report (reuses the report computed above instead of
# calculating it a second time)
print("\nRandom Forest Classification Report:\n", rf_classification_report)

# Perform random selection on the training data.
# A seeded generator makes the sub-sample reproducible across runs, and the
# size is capped at the number of training rows because sampling without
# replacement cannot draw more rows than exist.
min_samples = 10  # arbitrary sub-sample size
subsample_size = min(min_samples, X_train.shape[0])
subsample_rng = np.random.default_rng(42)
random_indices = subsample_rng.choice(X_train.shape[0], size=subsample_size, replace=False)
random_X_train = X_train.iloc[random_indices]
random_y_train = y_train.iloc[random_indices]  # Assuming y_train is a Series object

# Train a SEPARATE model on the randomly selected data so that rf_model
# (whose metrics were printed above) is not silently overwritten.
rf_model_random = RandomForestClassifier(random_state=42)
rf_model_random.fit(random_X_train, random_y_train)
# Make predictions on the testing data
y_pred_random = rf_model_random.predict(X_test)

# Evaluate the sub-sampled model using mean squared error
mse_random = mean_squared_error(y_test, y_pred_random)
print(f'Mean Squared Error (random selection): {mse_random:.2f}')

Random Forest Model Accuracy: 0.3333333333333333

Random Forest Classification Report:
               precision    recall  f1-score   support

          -1       0.50      0.50      0.50         2
           0       0.33      0.33      0.33         3
           1       0.00      0.00      0.00         1

    accuracy                           0.33         6
   macro avg       0.28      0.28      0.28         6
weighted avg       0.33      0.33      0.33         6

Mean Squared Error (random selection): 0.50


In [6]:
# Logistic Regression classifier; the iteration cap is raised to 1000.
# fit() returns the estimator, so construction and training are chained.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out split and compute every metric before reporting.
lr_predictions = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_classification_report = classification_report(
    y_test,
    lr_predictions,
    zero_division=1,
)
mse = mean_squared_error(y_test, lr_predictions)

# Report accuracy, the per-class table, and the mean squared error.
print("\n Logistic Regression Model Accuracy:", lr_accuracy)
print("\n Logistic Regression Classification Report:\n", lr_classification_report)
print(f'Mean Squared Error: {mse:.2f}')


 Logistic Regression Model Accuracy: 0.3333333333333333

 Logistic Regression Classification Report:
               precision    recall  f1-score   support

          -1       0.33      0.50      0.40         2
           0       0.33      0.33      0.33         3
           1       1.00      0.00      0.00         1

    accuracy                           0.33         6
   macro avg       0.56      0.28      0.24         6
weighted avg       0.44      0.33      0.30         6

Mean Squared Error: 0.67


In [14]:
# Polynomial Kernel:
# Degree-3 polynomial SVC, constructed and fitted on the training split in
# one step (fit() hands back the estimator it was called on).
poly_svm_model = SVC(kernel='poly', degree=3).fit(X_train, y_train)

# Score the held-out split: predicted labels first, then the three metrics.
poly_svm_predictions = poly_svm_model.predict(X_test)
poly_svm_accuracy = accuracy_score(y_test, poly_svm_predictions)
poly_svm_mse = mean_squared_error(y_test, poly_svm_predictions)
poly_svm_classification_report = classification_report(
    y_test,
    poly_svm_predictions,
    zero_division=1,
)

# Print results for model, one print call per row
for poly_svm_row in (
    ("Polynomial SVM Results:",),
    ("Accuracy:", poly_svm_accuracy),
    ("Classification Report:\n", poly_svm_classification_report),
    ("Mean Squared Error:", poly_svm_mse),
):
    print(*poly_svm_row)

Polynomial SVM Results:
Accuracy: 0.5
Classification Report:
               precision    recall  f1-score   support

          -1       1.00      0.00      0.00         2
           0       0.50      1.00      0.67         3
           1       1.00      0.00      0.00         1

    accuracy                           0.50         6
   macro avg       0.83      0.33      0.22         6
weighted avg       0.75      0.50      0.33         6

Mean Squared Error: 0.5


Report and compare their results with those from SVM.

### Model summary
| Model | Accuracy | F1-score (Class -1) | F1-score (Class 0) | F1-score (Class 1) |
|---|---|---|---|---|
| Logistic Regression | 33.33% | 0.40 | 0.33 | 0.00 |
| Random Forest | 50.00% | 0.50 | 0.57 | 0.00 |
| SVM (Linear) | 33.33% | 0.40 | 0.33 | 0.00 |
| SVM (Polynomial) | 50.00% | 0.00 | 0.67 | 0.00 |
| SVM (RBF) | 50.00% | 0.00 | 0.67 | 0.00 |

The linear SVM had an accuracy of 33.33%, meaning it correctly classified 2 out of 6 instances. The model struggled to distinguish between classes, especially class 1.
The polynomial SVM had an accuracy of 50%, meaning it correctly classified half of the instances. It achieved better accuracy than the linear SVM, but it still struggled to classify instances from classes -1 and 1.
The RBF SVM also had an accuracy of 50%, correctly classifying half of the instances, the same as the polynomial SVM. Like the polynomial SVM, it struggled to classify instances from classes -1 and 1.
Overall, the SVM models had difficulty distinguishing between classes, especially class 1. This suggests that the dataset may be challenging to classify using SVM algorithms.

The logistic regression model performed similarly to the linear SVM, with an accuracy of 33.33%. It struggled to classify instances from class 1, resulting in a low precision and F1-score for that class. The random forest model achieved an accuracy of 50%, performing slightly better than logistic regression and the linear SVM, but it still struggled to classify instances from class 1, resulting in a low F1-score and precision for that class.
Comparing with SVM: both logistic regression and random forest performed similarly to the SVM models in terms of accuracy, with logistic regression having the lowest accuracy and random forest the highest. Precision and F1-score for class 1 were consistently low across all models, indicating difficulty in correctly classifying instances from this class. Random forest showed slightly better F1-score and precision for classes -1 and 0 than logistic regression and SVM.