In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset
try:
    train_tfidf = pd.read_csv('/kaggle/input/50-007-machine-learning-summer-2024/train_tfidf_features.csv')
    test_tfidf = pd.read_csv('/kaggle/input/50-007-machine-learning-summer-2024/test_tfidf_features.csv')
    train_labels = pd.read_csv('/kaggle/input/50-007-machine-learning-summer-2024/train.csv')['label']
    test_ids = pd.read_csv('/kaggle/input/50-007-machine-learning-summer-2024/test.csv')['id']
    print("Files loaded successfully.")
except FileNotFoundError as e:
    print(f"File not found: {e}")
    raise
except KeyError as e:
    print(f"Column not found: {e}")
    raise

# Check the shape of the loaded data
print(f"Shape of train_tfidf: {train_tfidf.shape}")
print(f"Shape of test_tfidf: {test_tfidf.shape}")
print(f"Shape of train_labels: {train_labels.shape}")

# Prepare the data
X_train = train_tfidf.drop(columns=['label'], errors='ignore')
y_train = train_labels
X_test = test_tfidf

# Ensure the number of features matches
print(f"Shape of X_train after dropping 'label': {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

# Components to try
components = [2000, 1000, 500, 100]
results = {}

for n_components in components:
    try:
        # Apply PCA
        pca = PCA(n_components=n_components)
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)

        # Verify PCA transformation
        print(f"Explained variance for {n_components} components: {pca.explained_variance_ratio_.sum()}")
        print(f"Shape of X_train_pca: {X_train_pca.shape}")
        print(f"Shape of X_test_pca: {X_test_pca.shape}")

        # Train KNN classifier
        knn = KNeighborsClassifier(n_neighbors=2)
        knn.fit(X_train_pca, y_train)

        # Predict on the test set
        y_pred = knn.predict(X_test_pca)

        # Save predictions
        output = pd.DataFrame({'id': test_ids, 'label': y_pred})
        output.to_csv(f'knn_predictions_{n_components}_components.csv', index=False)

        print(f'Predictions saved for {n_components} components.')
    except Exception as e:
        print(f"An error occurred for {n_components} components: {e}")
        raise

print("Task completed.")

# Load the predictions
pred_2000 = pd.read_csv('knn_predictions_2000_components.csv')
pred_1000 = pd.read_csv('knn_predictions_1000_components.csv')
pred_500 = pd.read_csv('knn_predictions_500_components.csv')
pred_100 = pd.read_csv('knn_predictions_100_components.csv')

# Compare the predictions
print("2000 vs 1000: ", (pred_2000['label'] == pred_1000['label']).all())
print("2000 vs 500: ", (pred_2000['label'] == pred_500['label']).all())
print("2000 vs 100: ", (pred_2000['label'] == pred_100['label']).all())

# Count the number of differing predictions
print("2000 vs 1000 differences: ", (pred_2000['label'] != pred_1000['label']).sum())
print("2000 vs 500 differences: ", (pred_2000['label'] != pred_500['label']).sum())
print("2000 vs 100 differences: ", (pred_2000['label'] != pred_100['label']).sum())

Files loaded successfully.
Shape of train_tfidf: (17184, 5002)
Shape of test_tfidf: (4296, 5001)
Shape of train_labels: (17184,)
Shape of X_train after dropping 'label': (17184, 5001)
Shape of X_test: (4296, 5001)
Explained variance for 2000 components: 0.9999999929508931
Shape of X_train_pca: (17184, 2000)
Shape of X_test_pca: (4296, 2000)
Predictions saved for 2000 components.
Explained variance for 1000 components: 0.9999999858404957
Shape of X_train_pca: (17184, 1000)
Shape of X_test_pca: (4296, 1000)
Predictions saved for 1000 components.
Explained variance for 500 components: 0.999999979348994
Shape of X_train_pca: (17184, 500)
Shape of X_test_pca: (4296, 500)
Predictions saved for 500 components.
Explained variance for 100 components: 0.9999999681647121
Shape of X_train_pca: (17184, 100)
Shape of X_test_pca: (4296, 100)
Predictions saved for 100 components.
Task completed.
2000 vs 1000:  True
2000 vs 500:  True
2000 vs 100:  True
2000 vs 1000 differences:  0
2000 vs 500 differen