In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

# Load the labelled training set and the unlabelled test set.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Separate the target variable; the "ID" column is dropped from the features.
y_train = train_df["category"]
X_train = train_df.drop(["ID", "category"], axis=1)
X_test = test_df.drop(["ID"], axis=1)

# Standardize the training features exactly once. A second fit_transform on
# already-standardized data changes nothing numerically, but it would leave
# `scaler` holding mean~0/std~1 instead of the raw training statistics, which
# makes the fitted scaler useless for transforming any other raw data.
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
# From here on X_train is the standardized NumPy array, not the DataFrame.
X_train = X_train_standardized

In [None]:
# Define the hyperparameters to tune
param_grid = {
    'n_neighbors': [5, 6, 7],
    'algorithm': ['ball_tree'],
    'metric': ['euclidean'],
    'contamination': [0.01]
}


def lof_inlier_score(estimator, X, y=None):
    """Label-free scorer for GridSearchCV: mean ``score_samples`` on the held-out fold.

    ``score_samples`` returns the negated local outlier factor, so larger
    values mean the held-out rows look less abnormal to the fitted model.

    Parameters
    ----------
    estimator : fitted LocalOutlierFactor created with ``novelty=True``.
    X : held-out feature rows for the current CV fold.
    y : ignored; accepted only to satisfy the scorer call signature.

    Returns
    -------
    float
        Mean of ``estimator.score_samples(X)``.
    """
    return estimator.score_samples(X).mean()


# Initialize the LOF algorithm.
# novelty=True is required so the fitted model can score rows it was not
# fitted on; without it LOF exposes neither predict nor score_samples, and a
# regression metric such as 'neg_mean_squared_error' cannot be evaluated
# (it would also compare +/-1 predictions against category labels).
lof = LocalOutlierFactor(novelty=True)

# Perform grid search. Outlier detection is unsupervised, so no target is passed.
# NOTE(review): the mean held-out score is a heuristic selection criterion --
# confirm it matches what the tuning is meant to optimize.
grid_search = GridSearchCV(lof, param_grid=param_grid, cv=5, scoring=lof_inlier_score)
grid_search.fit(X_train)

# Print the best hyperparameters
print("Best hyperparameters: ", grid_search.best_params_)

In [None]:
# Flag outliers in X_train with a default-configured Local Outlier Factor
# model; rows labelled 1 are the ones kept below.
lof = LocalOutlierFactor()
outliers = lof.fit_predict(X_train)
inlier_mask = outliers == 1

# Report the row count before and after filtering.
print(len(X_train))
print(len(X_train[inlier_mask]))

# Apply the same mask to features and targets so they stay aligned.
X_train, y_train = X_train[inlier_mask], y_train[inlier_mask]

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Number of principal components to keep. PCA's first argument is
# `n_components`; the LOF grid-search parameter dict is not a valid value for
# it, so the component count is set explicitly here.
# None keeps every component. Use an int for a fixed count, or a float in
# (0, 1) to keep that fraction of the variance.
# NOTE(review): the originally intended value is unknown -- the LDA cell that
# follows requests 19 components, so at least 19 must survive this step.
PCA_N_COMPONENTS = None

pca = PCA(n_components=PCA_N_COMPONENTS)

# Fit on the outlier-filtered training features and project them.
X_train_pca = pca.fit_transform(X_train)
print(X_train_pca.shape)

# Percentage of the total variance captured by the retained components.
exp_var = sum(pca.explained_variance_ratio_ * 100)
print('Variance explained:', exp_var)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Supervised projection of the PCA output onto 19 discriminant axes.
lda = LinearDiscriminantAnalysis(n_components=19)
lda.fit(X_train_pca, y_train)
# The name X_train_pca is reused: from here on it holds the LDA-projected data.
X_train_pca = lda.transform(X_train_pca)

print(X_train_pca.shape)
# Percentage of variance explained by the retained discriminant components.
exp_var = sum(100 * lda.explained_variance_ratio_)
print('Variance explained:', exp_var)

In [None]:
from sklearn.cluster import KMeans

n_clusters = 8

# Cluster the projected training features; the seed is fixed at 42.
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X_train_pca)
train_df_cluster_labels = kmeans.labels_

# Append each row's cluster id as an extra "cluster_label" column next to the
# projected features.
projected_features = pd.DataFrame(X_train_pca)
cluster_column = pd.DataFrame(train_df_cluster_labels, columns=["cluster_label"])
X_train_clustered = pd.concat([projected_features, cluster_column], axis=1)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label mapping from the training targets, then encode them.
# `le` is kept so predictions can be decoded later with inverse_transform.
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.ensemble import VotingClassifier

# Hold out 20% of the clustered training features for a final accuracy check.
# Note that this rebinds X_train / X_test / y_train to the split subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_clustered,
    y_train_encoded,
    test_size=0.2,
    random_state=42,
)

# Three logistic regressions that differ only in their C value.
log_reg1, log_reg2, log_reg3 = (
    LogisticRegression(C=c_value, max_iter=1000) for c_value in (0.1, 0.01, 0.001)
)

# Soft-voting ensemble; the first member is weighted twice as much as the others.
ensemble_members = [('lr1', log_reg1), ('lr2', log_reg2), ('lr3', log_reg3)]
voting_clf = VotingClassifier(
    estimators=ensemble_members,
    voting='soft',
    weights=[2, 1, 1],
)

# Convert every column name to str (the frames mix integer names with
# "cluster_label"; presumably done so the estimators accept the feature names).
for split_frame in (X_train, X_test):
    split_frame.columns = split_frame.columns.astype(str)

# 15-fold cross-validation on the training split.
scores = cross_val_score(voting_clf, X_train, y_train, cv=15)

print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())

# Fit on the training split and evaluate on the held-out 20%.
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the ensemble model:", accuracy)

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the test data
test_df = pd.read_csv("test.csv")

X_test = test_df.drop(["ID"], axis=1)

# Scale the test features with statistics learned from the raw *training*
# features, not from the test set itself. Fitting a scaler on the test data
# maps it with a different mean/variance than the data the PCA, LDA, KMeans
# and classifier were fitted on, so the features would not be comparable.
# The scaler is refitted from train_df here so this cell does not depend on
# which scaler object is currently bound to `scaler`.
scaler = StandardScaler().fit(train_df.drop(["ID", "category"], axis=1))
X_test_standardized = scaler.transform(X_test)
X_test = X_test_standardized

# Apply the projections fitted on the training data, in the same order.
X_test_pca = pca.transform(X_test)
X_test_pca = lda.transform(X_test_pca)

# Add the cluster id feature, matching the training feature layout.
test_df_cluster_labels = kmeans.predict(X_test_pca)
X_test_clustered = pd.concat([pd.DataFrame(X_test_pca), pd.DataFrame(test_df_cluster_labels, columns=["cluster_label"])], axis=1)

# Column names are converted to str, as was done for the training frame.
X_test_clustered.columns = X_test_clustered.columns.astype(str)

y_pred = voting_clf.predict(X_test_clustered)

# Map the encoded predictions back to the original category labels.
y_pred_decoded = le.inverse_transform(y_pred)

submission_df = pd.DataFrame({'ID': test_df['ID'], 'category': y_pred_decoded})

submission_df.to_csv('submission.csv', index=False)