In [1]:
import numpy as np
import pandas as pd

# Data preprocessing
def preprocess_data(data):
    """Return a cleaned, fully numeric copy of a Titanic-style passenger table.

    Steps performed:
      1. Drop the 'PassengerId', 'Name', 'Ticket' and 'Cabin' columns.
      2. Fill missing 'Age' and 'Fare' values with the column median and
         missing 'Embarked' values with the most frequent value.
      3. Encode 'Sex' (female=0, male=1) and 'Embarked' (S=0, C=1, Q=2).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns named above plus 'Sex',
        'Age', 'Fare' and 'Embarked'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.
    """
    # `drop` returns a new frame, so everything below works on a copy.
    data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

    # Assign the filled column back instead of calling
    # `data[col].fillna(..., inplace=True)`: the in-place form operates on a
    # temporary Series and does not update the frame under pandas
    # Copy-on-Write semantics.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

    # Convert categorical variables to numerical codes. Values missing from
    # a mapping become NaN.
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})
    data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    return data

# Splitting the data
def train_test_split(inputs, target, test_size, random_state=None):
    """Randomly split features and labels into train and test portions.

    Parameters
    ----------
    inputs : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    target : pandas.Series or array-like
        One label per row of ``inputs``.
    test_size : float
        Fraction of rows, between 0 and 1 inclusive, placed in the test set.
    random_state : int, optional
        Seed for a dedicated random generator. When omitted, numpy's global
        random state is used, so ``np.random.seed`` still controls the split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The X parts are numpy arrays.
        The y parts are Series when ``target`` is a Series, otherwise numpy
        arrays.

    Raises
    ------
    ValueError
        If ``test_size`` is outside [0, 1] or the inputs differ in length.
    """
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1")
    n_samples = len(inputs)
    if len(target) != n_samples:
        raise ValueError("inputs and target must have the same length")

    indices = np.arange(n_samples)
    if random_state is None:
        np.random.shuffle(indices)
    else:
        np.random.default_rng(random_state).shuffle(indices)

    split_index = int(n_samples * (1 - test_size))
    train_indices = indices[:split_index]
    test_indices = indices[split_index:]

    X_train = inputs.iloc[train_indices].values
    X_test = inputs.iloc[test_indices].values

    # The shuffled indices are positions, not index labels. Plain
    # `target[indices]` on a Series is a label lookup, which fails or
    # misaligns when the index is not 0..n-1, so select with `.iloc`.
    if hasattr(target, 'iloc'):
        y_train = target.iloc[train_indices]
        y_test = target.iloc[test_indices]
    else:
        labels = np.asarray(target)
        y_train = labels[train_indices]
        y_test = labels[test_indices]
    return X_train, X_test, y_train, y_test

# Euclidean distance
def euclidean_distance(x1, x2):
    """Return the straight-line (L2) distance between points x1 and x2."""
    delta = x1 - x2
    squared_sum = np.sum(np.square(delta))
    return np.sqrt(squared_sum)

# KNN classifier
def knn_classifier(X_train, y_train, X_test, k):
    """Classify each test sample by majority vote of its k nearest neighbours.

    Distances are Euclidean. Training samples at equal distance keep their
    original order, and a tied vote goes to the label that appears first
    among the nearest neighbours.

    Parameters
    ----------
    X_train : array-like
        Numeric training samples, one per row.
    y_train : sequence
        One label per training sample.
    X_test : array-like
        Numeric samples to classify, one per row.
    k : int
        Number of neighbours that vote. If k exceeds the number of training
        samples, every training sample votes.

    Returns
    -------
    list
        One predicted label per test sample.

    Raises
    ------
    ValueError
        If k is below 1, the training data is empty, or X_train and y_train
        differ in length.
    """
    # A non-positive k would otherwise slice an empty list (k=0) or nearly
    # every neighbour (k<0) instead of failing clearly.
    if k < 1:
        raise ValueError("k must be a positive integer")

    train_points = np.asarray(X_train, dtype=float)
    train_labels = list(y_train)
    if len(train_points) == 0 or len(train_labels) == 0:
        raise ValueError("training data must not be empty")
    if len(train_points) != len(train_labels):
        raise ValueError("X_train and y_train must have the same length")
    # Treat a 1-D input as one feature per sample.
    train_points = train_points.reshape(len(train_points), -1)

    test_points = np.asarray(X_test, dtype=float)
    if test_points.size == 0:
        return []
    test_points = test_points.reshape(len(test_points), -1)

    predictions = []
    for test_sample in test_points:
        # Distance from this test sample to every training sample at once.
        distances = np.sqrt(np.sum((train_points - test_sample) ** 2, axis=1))
        # A stable sort keeps equally distant samples in training order.
        nearest = np.argsort(distances, kind='stable')[:k]
        labels = [train_labels[i] for i in nearest]
        predictions.append(max(labels, key=labels.count))
    return predictions

# Load and preprocess the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

train_data = preprocess_data(train_data)
# test_data is preprocessed here but not used by the rest of this cell.
test_data = preprocess_data(test_data)

# Split the data into inputs and target
inputs = train_data.drop('Survived', axis=1)
target = train_data['Survived']

# Seed numpy's global random state so the shuffled split, and therefore the
# predictions and accuracy, are the same on every Restart & Run All.
np.random.seed(42)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2)

# Convert target values to numpy array
y_train = y_train.values

# Set the number of nearest neighbors (k)
k = 5

# Run the KNN classifier
predictions = knn_classifier(X_train, y_train, X_test, k)
print("Predictions:", predictions)


Predictions: [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]


In [2]:
# Calculate accuracy
def calculate_accuracy(predictions, y_test):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    predictions : array-like
        Predicted labels.
    y_test : array-like
        True labels, in the same order as ``predictions``.

    Returns
    -------
    float
        Share of positions where prediction and label agree, from 0.0 to 1.0.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in shape.
    """
    # Coerce both sides to arrays so the comparison is always elementwise
    # and positional. Comparing two plain lists with `==` would yield a
    # single bool instead of one result per sample.
    predicted = np.asarray(predictions)
    actual = np.asarray(y_test)
    if predicted.shape != actual.shape:
        raise ValueError("predictions and y_test must have the same shape")
    if predicted.size == 0:
        raise ValueError("cannot compute accuracy of empty predictions")
    return float(np.mean(predicted == actual))

# Score the held-out predictions against their true labels and report it
accuracy = calculate_accuracy(predictions=predictions, y_test=y_test)
print("Accuracy:", accuracy)

Accuracy: 0.6815642458100558
