In [1]:
import pandas as pd

# Read the labelled training sheet and the unlabelled test sheet from the
# working directory.
train_df = pd.read_excel("train.xlsx")
test_df = pd.read_excel("test.xlsx")

# Show the column index of each frame so the two schemas can be compared by eye.
print("Columns in train_df:", train_df.columns, sep="\n")
print("\nColumns in test_df:", test_df.columns, sep="\n")

Columns in train_df:
Index(['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11',
       'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'target'],
      dtype='object')

Columns in test_df:
Index(['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11',
       'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18'],
      dtype='object')


In [2]:
# Separate features (X) and target variable (y)
X_train = train_df.drop(columns=["target"])
y_train = train_df["target"]

# Select exactly the training feature columns, in training order, instead of
# aliasing test_df. This keeps X_test a separate frame, so columns added to
# test_df later (e.g. the prediction columns written in the export cell) can
# never leak into the features when this cell is re-run, and a missing feature
# column fails loudly here with a KeyError.
X_test = test_df[X_train.columns]

In [3]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature to zero mean / unit variance.
# The scaler is fitted on the training data only and then re-used for the
# test data, so no test-set statistics influence the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# StandardScaler (default with_mean=True) already subtracts the training mean,
# so a second manual centering pass would only subtract floating-point noise.
# The *_centered names are kept because the modelling cells refer to them.
X_train_centered = X_train_scaled
X_test_centered = X_test_scaled

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Build a 5-nearest-neighbours classifier and fit it on the preprocessed
# training features (fit() returns the estimator itself).
clf_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_centered, y_train)

# Label the test rows
y_pred_knn = clf_knn.predict(X_test_centered)

# Accuracy on the same rows the model was fitted on -- this is a training
# score, not an estimate of performance on unseen data.
y_train_pred_knn = clf_knn.predict(X_train_centered)
train_accuracy_knn = accuracy_score(y_train, y_train_pred_knn)

# Report the test predictions and the training accuracy
print("K-Nearest Neighbors (KNN) Predicted Targets:", y_pred_knn, sep="\n")
print(f"Train Accuracy (KNN): {train_accuracy_knn}")

K-Nearest Neighbors (KNN) Predicted Targets:
['B74' 'A10' 'B65' ... 'B69' 'A38' 'A80']
Train Accuracy (KNN): 0.989878101872007


In [5]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with a fixed random_state, fitted on the preprocessed
# training features.
clf_dt = DecisionTreeClassifier(random_state=42)
clf_dt.fit(X_train_centered, y_train)

# Label the test rows
y_pred_dt = clf_dt.predict(X_test_centered)

# Training-set accuracy (measured on the rows used for fitting).
# accuracy_score is the function imported in the KNN cell.
train_accuracy_dt = accuracy_score(
    y_true=y_train,
    y_pred=clf_dt.predict(X_train_centered),
)

# Report the test predictions and the training accuracy
print("Decision Tree Predicted Targets:", y_pred_dt, sep="\n")
print(f"Train Accuracy (Decision Tree): {train_accuracy_dt}")

Decision Tree Predicted Targets:
['B74' 'A10' 'B65' ... 'B69' 'A38' 'A80']
Train Accuracy (Decision Tree): 0.9994286025250326


In [6]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 100 trees with a fixed random_state, fitted on the preprocessed
# training features.
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
clf_rf.fit(X_train_centered, y_train)

# Label the test rows
y_pred_rf = clf_rf.predict(X_test_centered)

# Training-set accuracy: for classifiers, score() is the mean accuracy of
# predict(X) against y, i.e. the same number accuracy_score would give.
train_accuracy_rf = clf_rf.score(X_train_centered, y_train)

# Report the test predictions and the training accuracy
print("Random Forest Predicted Targets:", y_pred_rf, sep="\n")
print(f"Train Accuracy (Random Forest): {train_accuracy_rf}")

Random Forest Predicted Targets:
['B74' 'A10' 'B65' ... 'B69' 'A38' 'A80']
Train Accuracy (Random Forest): 0.9994286025250326


In [7]:
# Output predictions
print("\nTrain Accuracy and Predictions Summary:")
print(f"Train Accuracy (KNN): {train_accuracy_knn}")
print(f"Train Accuracy (Decision Tree): {train_accuracy_dt}")
print(f"Train Accuracy (Random Forest): {train_accuracy_rf}")

# Build the submission table as a NEW frame instead of adding columns to
# test_df in place. test_df is the raw input that the feature cells read;
# mutating it would make this cell non-idempotent and would push the
# prediction columns into the feature matrix if earlier cells were re-run.
predictions_df = test_df.assign(
    Prediction_KNN=y_pred_knn,
    Prediction_DT=y_pred_dt,
    Prediction_RF=y_pred_rf,
)

# Save the original test columns plus one prediction column per model
predictions_df.to_excel("predictions.xlsx", index=False)

# Reason for choosing algorithms
print("\nReasons for Choosing Algorithms:")
print("- KNN: Simple, non-parametric method suitable for multi-class classification.")
print("- Decision Tree: Interpretable model with non-linear decision boundaries.")
print(
    "- Random Forest: Ensemble method that improves performance and reduces overfitting."
)


Train Accuracy and Predictions Summary:
Train Accuracy (KNN): 0.989878101872007
Train Accuracy (Decision Tree): 0.9994286025250326
Train Accuracy (Random Forest): 0.9994286025250326

Reasons for Choosing Algorithms:
- KNN: Simple, non-parametric method suitable for multi-class classification.
- Decision Tree: Interpretable model with non-linear decision boundaries.
- Random Forest: Ensemble method that improves performance and reduces overfitting.
