In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import RFE
from scipy.signal import savgol_filter
from scipy.stats import zscore

# Read the competition's training and test tables from the Kaggle input folder
train = pd.read_csv("/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv")
test = pd.read_csv("/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv")

# Every training column whose name contains "_N" is treated as an NDVI reading
ndvi_cols = [name for name in train.columns if "_N" in name]

# Fill missing NDVI readings with a 5-nearest-neighbour imputer.
# The imputer is fitted on the training rows only and then reused, unchanged,
# on the test rows; both frames are overwritten in place with the filled values.
imputer = KNNImputer(n_neighbors=5)
train[ndvi_cols] = imputer.fit_transform(train[ndvi_cols])
test[ndvi_cols] = imputer.transform(test[ndvi_cols])

# Feature Engineering
def extract_ndvi_features(df, cols=None, outlier_ref=None):
    """Summarise each row's NDVI time series into a small set of features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the NDVI columns. Missing values are expected to
        have been imputed already.
    cols : list of str, optional
        NDVI column names, in temporal order. Defaults to the module-level
        ``ndvi_cols`` list, which is what the original implementation used.
        At least 5 columns are needed for the Savitzky-Golay window.
    outlier_ref : tuple (mean, std), optional
        Reference statistics used to standardise the per-row NDVI mean for
        the ``ndvi_outlier`` column. When omitted, the statistics are taken
        from ``df`` itself (population std, ``ddof=0``), which matches the
        original behaviour. Pass the training-set statistics when building
        test features so both sets share one scale.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with the columns ``ndvi_mean``, ``ndvi_std``,
        ``ndvi_range``, ``ndvi_iqr``, ``slope_early``, ``slope_late``,
        ``ndre_proxy``, ``ndvi_outlier`` and ``smooth_avg`` (in that order).
    """
    if cols is None:
        cols = ndvi_cols
    ndvi = df[cols]
    values = ndvi.to_numpy(dtype=float)
    features = pd.DataFrame(index=df.index)

    # Row-wise extremes are reused by several features; compute them once.
    row_mean = ndvi.mean(axis=1)
    row_max = ndvi.max(axis=1)
    row_min = ndvi.min(axis=1)
    row_range = row_max - row_min

    # Core NDVI statistics
    features["ndvi_mean"] = row_mean
    features["ndvi_std"] = ndvi.std(axis=1)
    features["ndvi_range"] = row_range
    features["ndvi_iqr"] = ndvi.quantile(0.75, axis=1) - ndvi.quantile(0.25, axis=1)

    # Temporal trends: linear slope over the first and second half of the series
    mid = values.shape[1] // 2
    features["slope_early"] = _row_slopes(values[:, :mid])
    features["slope_late"] = _row_slopes(values[:, mid:])

    # Normalised range: (max - min) / (max + min + 1). The column keeps the
    # name "ndre_proxy" for compatibility, but it is derived from NDVI only.
    # NOTE: the denominator is zero when max + min == -1; that case is not
    # guarded here, exactly as in the original.
    features["ndre_proxy"] = row_range / (row_max + row_min + 1)

    # Z-score of the per-row mean. This only flags how unusual a row is; no
    # rows are removed. A zero (or undefined) spread would make the division
    # produce NaN/inf, so such inputs get a neutral score of 0 instead.
    if outlier_ref is None:
        ref_mean, ref_std = row_mean.mean(), row_mean.std(ddof=0)
    else:
        ref_mean, ref_std = outlier_ref
    if ref_std > 0:
        features["ndvi_outlier"] = (row_mean - ref_mean) / ref_std
    else:
        features["ndvi_outlier"] = 0.0

    # Mean of the Savitzky-Golay smoothed series (window 5, polynomial order 2),
    # filtered along each row in a single vectorised call.
    features["smooth_avg"] = savgol_filter(values, 5, 2, axis=1).mean(axis=1)

    return features


def _row_slopes(block):
    """Return the least-squares slope of every row of a 2-D array vs. 0..k-1."""
    steps = np.arange(block.shape[1], dtype=float)
    # np.polyfit fits each column of a 2-D y independently, so rows are passed
    # as columns; element [0] of the result holds the degree-1 coefficients.
    return np.polyfit(steps, block.T, 1)[0]

# Tunable settings for the modelling stage
N_SPLITS = 5
SEED = 42
N_FEATURES_TO_SELECT = 10
# The recorded run hit the lbfgs iteration limit at max_iter=1000
# (ConvergenceWarning), so the limit is raised. If warnings persist, raise it
# further or lower C in the fold models.
MAX_ITER = 5000

# Build the engineered feature matrices and the target vector
X_train = extract_ndvi_features(train)
X_test = extract_ndvi_features(test)
y_train = train["class"]

# "ndvi_outlier" is a z-score of "ndvi_mean" computed within whichever frame
# was passed in, so the test column would be standardised with test statistics.
# Recompute it from the training mean/std (population std, ddof=0) so that the
# feature means the same thing in both matrices.
train_mean_mu = X_train["ndvi_mean"].mean()
train_mean_sigma = X_train["ndvi_mean"].std(ddof=0)
if train_mean_sigma > 0:
    X_test["ndvi_outlier"] = (X_test["ndvi_mean"] - train_mean_mu) / train_mean_sigma

# Label Encoding
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# Compute Class Weights ("balanced" weights from the full training labels)
classes = np.unique(y_train_encoded)
class_weights = compute_class_weight("balanced", classes=classes, y=y_train_encoded)
weight_dict = dict(zip(classes, class_weights))

# Feature Scaling & Selection (scaler fitted on train only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Never ask RFE for more features than exist: the requested count is capped at
# the number of engineered columns (requesting more keeps every feature anyway).
n_select = min(N_FEATURES_TO_SELECT, X_train_scaled.shape[1])
rfe = RFE(LogisticRegression(max_iter=MAX_ITER), n_features_to_select=n_select)
X_train_selected = rfe.fit_transform(X_train_scaled, y_train_encoded)
X_test_selected = rfe.transform(X_test_scaled)

# Stratified Cross-Validation & Model Training
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
preds = np.zeros((X_test_selected.shape[0], len(le.classes_)))
# Out-of-fold predictions, so the held-out part of every split is actually scored
oof_pred = np.empty(len(y_train_encoded), dtype=y_train_encoded.dtype)

for train_idx, val_idx in skf.split(X_train_selected, y_train_encoded):
    X_tr, y_tr = X_train_selected[train_idx], y_train_encoded[train_idx]
    X_val = X_train_selected[val_idx]

    # Two weakly regularised logistic regressions: one with the precomputed
    # class weights, one with per-fold "balanced" weights.
    model_1 = LogisticRegression(C=10000, class_weight=weight_dict, max_iter=MAX_ITER)
    model_2 = LogisticRegression(C=5000, class_weight="balanced", max_iter=MAX_ITER)

    model_1.fit(X_tr, y_tr)
    model_2.fit(X_tr, y_tr)

    # Score the held-out fold with the same two-model average used for the test set
    val_proba = (model_1.predict_proba(X_val) + model_2.predict_proba(X_val)) / 2
    oof_pred[val_idx] = model_1.classes_[val_proba.argmax(axis=1)]

    preds += (model_1.predict_proba(X_test_selected) + model_2.predict_proba(X_test_selected)) / 2

# Turn the per-fold sum into an average so `preds` holds real probabilities
# (the argmax below is unaffected by this scaling).
preds /= skf.get_n_splits()
print(f"Out-of-fold accuracy: {(oof_pred == y_train_encoded).mean():.4f}")

final_pred = le.inverse_transform(preds.argmax(axis=1))

# Submission File
submission = pd.DataFrame({"ID": test["ID"], "class": final_pred})
submission.to_csv("/kaggle/working/submission.csv", index=False)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt