In [1]:
import numpy as np
import pandas as pd

# Data preprocessing
def preprocess_data(data):
    """Return a cleaned, numerically encoded copy of a Titanic-style frame.

    Steps, in order:
      1. Drop the 'PassengerId', 'Name', 'Ticket' and 'Cabin' columns.
      2. Fill missing 'Age' and 'Fare' with the column median and missing
         'Embarked' with the most frequent value.
      3. Encode 'Sex' (female=0, male=1) and 'Embarked' (S=0, C=1, Q=2).

    The frame passed in is not modified; ``drop`` produces a new frame and
    all later assignments target that new frame.

    Args:
        data: DataFrame containing at least the columns named above.

    Returns:
        The new DataFrame with the dropped columns removed and the
        remaining columns filled/encoded. Category values outside the
        encoding tables map to NaN.
    """
    # Drop irrelevant features
    data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

    # Fill missing values. Assign the result back instead of calling
    # fillna(inplace=True) on a column selection: that chained in-place form
    # is deprecated and does not update the frame under copy-on-write.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())
    embarked_mode = data['Embarked'].mode()
    if not embarked_mode.empty:
        # mode() is empty when the column has no non-null values; in that
        # case there is nothing sensible to fill with.
        data['Embarked'] = data['Embarked'].fillna(embarked_mode.iloc[0])

    # Convert categorical variables to numerical
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})
    data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    return data

# Splitting the data
def train_test_split(inputs, target, test_size, random_state=None):
    """Randomly split features and labels into train and test partitions.

    Args:
        inputs: DataFrame of features; rows are selected positionally.
        target: Labels aligned row-for-row with ``inputs``. May be a pandas
            Series (any index) or any array-like.
        test_size: Fraction of rows assigned to the test partition. The
            train partition gets ``int(len(inputs) * (1 - test_size))`` rows.
        random_state: Optional seed. When None (the default) the global
            NumPy random state is used, exactly as before; otherwise a
            private ``RandomState`` seeded with this value does the shuffle.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The X parts are NumPy arrays
        (``.values`` of the selected rows). The y parts are the selected
        labels, kept as a Series when ``target`` is one and returned as a
        NumPy array otherwise.

    Raises:
        ValueError: if ``inputs`` and ``target`` have different lengths.
    """
    n_samples = len(inputs)
    if len(target) != n_samples:
        raise ValueError(
            f"inputs and target must have the same length "
            f"({n_samples} != {len(target)})"
        )

    indices = np.arange(n_samples)
    if random_state is None:
        np.random.shuffle(indices)
    else:
        np.random.RandomState(random_state).shuffle(indices)

    split_index = int(n_samples * (1 - test_size))
    train_indices = indices[:split_index]
    test_indices = indices[split_index:]

    X_train = inputs.iloc[train_indices].values
    X_test = inputs.iloc[test_indices].values

    # The shuffled indices are row positions, so labels must be selected
    # positionally too. Plain ``target[...]`` on a Series is label-based and
    # only lines up when the index happens to be 0..n-1.
    if hasattr(target, 'iloc'):
        y_train = target.iloc[train_indices]
        y_test = target.iloc[test_indices]
    else:
        target = np.asarray(target)
        y_train = target[train_indices]
        y_test = target[test_indices]
    return X_train, X_test, y_train, y_test

# Calculate prior probabilities
def calculate_prior_probabilities(y_train):
    """Estimate each class's prior as its relative frequency in ``y_train``.

    Args:
        y_train: Array-like of training labels.

    Returns:
        Dict mapping each distinct label to ``count / len(y_train)``.
    """
    labels, counts = np.unique(y_train, return_counts=True)
    n_total = len(y_train)
    return {label: n / n_total for label, n in zip(labels, counts)}

# Calculate likelihood probabilities
def calculate_likelihood_probabilities(X_train, y_train):
    """Build per-class, per-feature relative-frequency tables.

    Every distinct value in a feature column is treated as its own category;
    no smoothing is applied.

    Args:
        X_train: 2-D NumPy array, one row per sample and one column per
            feature.
        y_train: NumPy array of labels, one per row of ``X_train``.

    Returns:
        Nested dict ``{class: {feature_index: {value: probability}}}`` where
        ``probability`` is the share of that class's rows holding ``value``
        in that feature column.
    """
    feature_count = X_train.shape[1]
    tables = {}
    for label in np.unique(y_train):
        rows = X_train[y_train == label]
        n_rows = rows.shape[0]
        tables[label] = {
            col: {
                val: freq / n_rows
                for val, freq in zip(*np.unique(rows[:, col], return_counts=True))
            }
            for col in range(feature_count)
        }
    return tables

# Naive Bayes classifier
def naive_bayes_classifier(X_train, y_train, X_test):
    """Predict a class for each row of ``X_test`` with categorical naive Bayes.

    Priors and per-feature likelihood tables are estimated from the training
    data. Each class is scored as the log prior plus the sum of the log
    likelihoods of the sample's feature values, and the highest-scoring class
    is chosen.

    Args:
        X_train: 2-D array of training features.
        y_train: Array of training labels aligned with ``X_train``.
        X_test: 2-D array of samples to classify.

    Returns:
        List with one predicted class label per row of ``X_test``.
    """
    priors = calculate_prior_probabilities(y_train)
    likelihoods = calculate_likelihood_probabilities(X_train, y_train)

    predictions = []
    for row in X_test:
        scores = {}
        for label, prior in priors.items():
            per_feature = likelihoods[label]
            score = np.log(prior)
            for col, observed in enumerate(row):
                # A value never seen for this class during training falls
                # back to a small fixed probability instead of zero.
                score += np.log(per_feature[col].get(observed, 1e-9))
            scores[label] = score
        predictions.append(max(scores, key=scores.get))
    return predictions

# Read both CSV files, then run each through the same preprocessing
train_data, test_data = pd.read_csv('train.csv'), pd.read_csv('test.csv')
train_data, test_data = preprocess_data(train_data), preprocess_data(test_data)

# Separate the 'Survived' label column from the feature columns
target = train_data['Survived']
inputs = train_data.drop(columns='Survived')

# Hold out 20% of the labelled rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.2
)

# The classifier masks X_train with y_train, so hand it a plain NumPy array
y_train = y_train.values

# Fit on the training split and predict the held-out rows
predictions = naive_bayes_classifier(X_train, y_train, X_test)
print("Predictions:", predictions)

Predictions: [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [2]:
# Calculate accuracy
def calculate_accuracy(predictions, y_test):
    """Return the fraction of predictions that equal the true labels.

    Both arguments are converted to NumPy arrays before comparison, so the
    comparison is always element-wise and positional. Comparing two plain
    lists with ``==`` would yield a single bool, and a pandas Series would
    otherwise bring its index into play.

    Args:
        predictions: Array-like of predicted labels.
        y_test: Array-like of true labels, same length and order.

    Returns:
        Accuracy as a float in [0, 1]; NaN when both inputs are empty.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    predicted = np.asarray(predictions)
    actual = np.asarray(y_test)
    if predicted.shape != actual.shape:
        raise ValueError(
            f"predictions and y_test must have the same shape "
            f"({predicted.shape} != {actual.shape})"
        )
    total_predictions = predicted.size
    if total_predictions == 0:
        # Accuracy over zero samples is undefined; avoid a 0/0 warning.
        return float('nan')
    correct_predictions = int(np.count_nonzero(predicted == actual))
    return correct_predictions / total_predictions

# Score the predictions made for X_test against the held-out labels y_test
# and print the resulting fraction of correct predictions.
accuracy = calculate_accuracy(predictions, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.8156424581005587
