In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import LocalOutlierFactor
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

# Read the labelled training set and the unlabelled test set from the working directory
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Split the training frame into the label vector ("category") and the feature
# matrix (every column except "ID" and "category"); the test frame only drops "ID".
y_train = train_df["category"].to_numpy()
X_train = train_df.drop(columns=["ID", "category"]).to_numpy()
X_test = test_df.drop(columns=["ID"]).to_numpy()

# # Apply feature engineering
# for i in range(X_train.shape[1]):
#     col = X_train[:, i]
#     X_train = np.column_stack((X_train, col + col ** 3 + col ** 2))

# # Standardize the data using z-score normalization
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)

# # Standardize the data using z-score normalization
# scaler = StandardScaler()
# X_train_standardized = scaler.fit_transform(X_train)
# X_train = X_train_standardized


In [None]:
# Score every training row with the Local Outlier Factor (5 nearest neighbours,
# Euclidean distance, ball-tree search) and get one label per row.
lof = LocalOutlierFactor(n_neighbors=5, algorithm='ball_tree', metric='euclidean')
outliers = lof.fit_predict(X_train)

# Keep only the rows labelled 1 (the inliers), filtering features and labels
# with the same mask so they stay aligned.
inlier_mask = outliers == 1
X_train, y_train = X_train[inlier_mask], y_train[inlier_mask]

# Number of training rows that survived the filter
print(len(X_train))

In [None]:
# Rank features by how strongly they separate the target classes.
#
# X_train holds feature columns only ("category" was dropped when the data was
# loaded), so a feature-by-feature correlation matrix of X_train has no target
# column to read from: its last column is just the last feature. The labels in
# y_train are categorical, so each feature is instead scored against y_train
# with a one-way ANOVA F-test.
from sklearn.feature_selection import f_classif

f_scores, _ = f_classif(X_train, y_train)

# Features that are constant produce NaN scores; give them the lowest rank
# instead of letting NaN ordering decide where they land.
relevance = np.nan_to_num(f_scores, nan=0.0)

# Feature indices ordered from most to least relevant (stable for ties)
sorted_corrs = np.argsort(-relevance, kind="stable")

# Get the top n features (slicing simply keeps all of them if there are fewer than n)
n = 2000
# Column names in the same order as the columns of X_train
feature_names = train_df.drop(["ID", "category"], axis=1).columns
top_n_features = feature_names[sorted_corrs[:n]]

print("Top", n, "Features:")
print(top_n_features)

# Update X_train and X_test with the top n features.
# This assumes X_test has the same feature columns in the same order as X_train.
X_train = X_train[:, sorted_corrs[:n]]
X_test = X_test[:, sorted_corrs[:n]]

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Reduce dimensionality with PCA. Passing a fraction asks for as many components
# as are needed to reach that share of the variance (an earlier, fixed
# alternative was n_components = 300).
pca = PCA(n_components=0.99)

# Learn the projection from the training features only, then apply the same
# projection to the test features.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
print(X_train_pca.shape)

# Percentage of the total variance captured by the retained components
exp_var = sum(100 * pca.explained_variance_ratio_)
print('Variance explained:', exp_var)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Project the PCA features onto the class-discriminant axes.
#
# LDA supports at most min(n_classes - 1, n_features) components and raises a
# ValueError when asked for more, so the requested 19 is capped by what the
# data actually supports after outlier removal and PCA.
max_lda_components = min(len(np.unique(y_train)) - 1, X_train_pca.shape[1])
lda = LinearDiscriminantAnalysis(n_components=min(19, max_lda_components))

# NOTE: X_train_pca / X_test_pca are overwritten with the LDA projection here,
# so re-running this cell without re-running the PCA cell would feed LDA its
# own output.
X_train_pca = lda.fit_transform(X_train_pca, y_train)
X_test_pca = lda.transform(X_test_pca)

print(X_train_pca.shape)
# Percentage of the between-class variance captured by the kept components
exp_var = sum(lda.explained_variance_ratio_ * 100)
print('Variance explained:', exp_var)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# X_train, X_test, y_train, y_test = train_test_split(X_train_clustered, y_train_encoded, test_size=0.2, random_state=42)

from sklearn.ensemble import VotingClassifier

def _make_log_reg(C):
    """Build one ensemble member; the members differ only in their C value."""
    return LogisticRegression(
        C=C,
        max_iter=1000,
        solver='lbfgs',
        multi_class='multinomial',
        penalty='l2',
    )


# Three logistic regressions with increasing C
log_reg1 = _make_log_reg(1)
log_reg2 = _make_log_reg(1.25)
log_reg3 = _make_log_reg(1.5)

# Soft voting over the members, with the first member weighted twice as
# heavily as each of the other two.
member_models = [('lr1', log_reg1), ('lr2', log_reg2), ('lr3', log_reg3)]
voting_clf = VotingClassifier(estimators=member_models, voting='soft', weights=[2, 1, 1])

# X_train.columns = X_train.columns.astype(str)
# X_test.columns = X_test.columns.astype(str)

# scores = cross_val_score(voting_clf, X_train, y_train, cv=15)

# print("Cross-validation scores:", scores)
# print("Mean accuracy:", scores.mean())

# Train the ensemble on the reduced training features, then label the reduced
# test features (fit returns the fitted estimator, so the calls can be chained).
y_pred = voting_clf.fit(X_train_pca, y_train).predict(X_test_pca)

# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy of the ensemble model:", accuracy)

In [None]:
import pandas as pd

# Assemble the submission table in its final column order: the test IDs first,
# the predicted category second.
submission_df = pd.DataFrame({'ID': test_df['ID'], 'category': y_pred})

# NOTE(review): the first prediction is unconditionally replaced with the
# hard-coded label 'Leeche_Raw', whatever the model predicted for that row.
# Confirm this override is intentional before relying on the file.
submission_df.loc[0, 'category'] = 'Leeche_Raw'

# Write the table to disk without the pandas row index
submission_df.to_csv('submission.csv', index=False)


In [None]:
import pandas as pd
# Compare two saved submission files row by row and report where the value in
# the second column differs (presumably the predicted category — the files are
# assumed to share the ID, category layout; verify against the actual files).
c1 = pd.read_csv("submission (2) (4).csv").to_numpy()
c2 = pd.read_csv("submission (6) (1).csv").to_numpy()

# Rows are matched by position, so the comparison is only meaningful when both
# files have the same number of rows. Fail fast instead of hitting an
# IndexError part-way through or silently ignoring trailing rows.
if c1.shape[0] != c2.shape[0]:
    raise ValueError(
        f"Cannot compare submissions with different row counts: "
        f"{c1.shape[0]} vs {c2.shape[0]}"
    )

# Print every pair of rows whose second-column values disagree and count them
cnt = 0
for row_a, row_b in zip(c1, c2):
    if row_a[1] != row_b[1]:
        print(row_a, row_b)
        cnt += 1
print(cnt)
