In [1]:
# importing libraries
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from scipy.fft import fft
from scipy.integrate import trapz
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Loading the datasets
try:
    train_df = pd.read_csv('C:\\Users\\viraj\\Downloads\\hacktrain.csv')
    test_df = pd.read_csv('C:\\Users\\viraj\\Downloads\\hacktest.csv')
except FileNotFoundError:
    print("Error: file not found.")
    # `exit()` is an interactive helper injected by the `site` module and is
    # not guaranteed to be defined; raising SystemExit always works, and the
    # non-zero status signals that the script stopped because of an error.
    raise SystemExit(1)


# NDVI time-series columns are the training columns whose name contains '_N'.
ndvi_cols = [col for col in train_df.columns if '_N' in col]

def preprocess_ndvi_data(df, ndvi_cols, window_length=7, polyorder=2):
    """Clean and smooth the NDVI columns of ``df``, one time series per row.

    Steps, applied to the ``ndvi_cols`` columns only:

    1. Replace +/-inf with NaN.
    2. Fill gaps by linear interpolation along each row (in both directions).
    3. Set any value that is still missing to 0.
    4. Smooth every row with a Savitzky-Golay filter.

    Args:
        df: DataFrame holding the NDVI columns. It is modified in place, so
            pass a copy if the original must stay untouched.
        ndvi_cols: Names of the NDVI columns, in time order.
        window_length: Savitzky-Golay window size, in samples.
        polyorder: Order of the polynomial fitted within each window.

    Returns:
        The same ``df`` object, with its NDVI columns cleaned and smoothed.
        If the filter rejects its inputs (for example when there are fewer
        NDVI columns than ``window_length``), the cleaned but unsmoothed
        values are kept.
    """
    ndvi_cols = list(ndvi_cols)
    if df.empty or not ndvi_cols:
        # Nothing to clean or smooth.
        return df

    cleaned = (
        df[ndvi_cols]
        .replace([np.inf, -np.inf], np.nan)
        .interpolate(method='linear', axis=1, limit_direction='both')
        .fillna(0)
    )
    values = cleaned.to_numpy(dtype=float)

    try:
        # Filter all rows in one call along the time axis instead of looping
        # over rows in Python.
        values = savgol_filter(values, window_length, polyorder, axis=1)
    except ValueError:
        # Best effort: keep the unsmoothed series when the filter parameters
        # do not fit the data (e.g. series shorter than the window).
        pass

    df[ndvi_cols] = pd.DataFrame(values, index=df.index, columns=ndvi_cols)
    return df

# Clean and smooth the NDVI series of both splits; each call works on a copy
# so the raw frames stay as loaded.
print("Applying preprocessing")
train_df_processed, test_df_processed = (
    preprocess_ndvi_data(raw_frame.copy(), ndvi_cols)
    for raw_frame in (train_df, test_df)
)
print("Preprocessing complete.")

def _phenology_features(values):
    """Return per-row peak/trough timing and growth/senescence slopes.

    ``values`` is a 2-D float array with one NDVI series per row.
    """
    n_rows, n_steps = values.shape
    row_ids = np.arange(n_rows)

    # Ignore NaNs when locating the extremes.
    missing = np.isnan(values)
    peak_idx = np.where(missing, -np.inf, values).argmax(axis=1)
    trough_idx = np.where(missing, np.inf, values).argmin(axis=1)

    # Timing is the position of the extreme as a fraction of the series length.
    peak_timing = peak_idx / n_steps
    trough_timing = trough_idx / n_steps

    peak_val = values[row_ids, peak_idx]
    trough_val = values[row_ids, trough_idx]
    end_val = values[:, -1]

    # Growth slope is only defined when the peak comes after the trough.
    growth_slope = np.zeros(n_rows)
    rising = peak_timing > trough_timing
    growth_slope[rising] = (
        (peak_val[rising] - trough_val[rising])
        / (peak_timing[rising] - trough_timing[rising])
    )

    # peak_timing is always < 1 (largest index is n_steps - 1), so the
    # denominator is never zero.
    senescence_slope = (peak_val - end_val) / (1.0 - peak_timing)

    return peak_timing, trough_timing, growth_slope, senescence_slope


def _fft_magnitudes(values, num_components):
    """Return magnitudes of the first non-DC FFT components of each row.

    The result always has ``num_components`` columns; components that do not
    exist because the series is too short are reported as 0.
    """
    if values.shape[0] == 0:
        return np.zeros((0, num_components))

    magnitudes = np.abs(fft(values, axis=1))[:, 1:num_components + 1]
    shortfall = num_components - magnitudes.shape[1]
    if shortfall > 0:
        magnitudes = np.pad(magnitudes, ((0, 0), (0, shortfall)))
    return magnitudes


def create_enhanced_features(df, ndvi_cols, num_fft_components=5):
    """Build a per-row feature table from the NDVI time-series columns.

    Features produced, in this column order:

    * ``ID`` copied from ``df``.
    * Summary statistics: ``ndvi_mean``, ``ndvi_median``, ``ndvi_min``,
      ``ndvi_max``, ``ndvi_std``, ``ndvi_range``.
    * ``ndvi_auc``: trapezoidal area under the series with unit spacing.
    * Phenology: ``peak_timing``, ``trough_timing``, ``growth_slope``,
      ``senescence_slope``.
    * ``fft_mag_0`` ... ``fft_mag_{k-1}``: magnitudes of the first ``k``
      non-DC FFT components.
    * Interactions: ``mean_x_std``, ``range_x_peak_timing``.

    Infinite and missing feature values are replaced by 0.

    Args:
        df: DataFrame with an ``ID`` column and the NDVI columns. Not modified.
        ndvi_cols: Names of the NDVI columns, in time order.
        num_fft_components: Number of FFT magnitude features to emit.

    Returns:
        A new DataFrame indexed like ``df`` holding the features.

    Raises:
        ValueError: If ``ndvi_cols`` is empty.
    """
    ndvi_cols = list(ndvi_cols)
    if not ndvi_cols:
        raise ValueError("ndvi_cols must contain at least one column")

    ndvi = df[ndvi_cols]
    values = ndvi.to_numpy(dtype=float)

    # Explicit copy so later clean-up cannot write through to the caller's df.
    features_df = df[['ID']].copy()

    features_df['ndvi_mean'] = ndvi.mean(axis=1)
    features_df['ndvi_median'] = ndvi.median(axis=1)
    features_df['ndvi_min'] = ndvi.min(axis=1)
    features_df['ndvi_max'] = ndvi.max(axis=1)
    features_df['ndvi_std'] = ndvi.std(axis=1)
    features_df['ndvi_range'] = features_df['ndvi_max'] - features_df['ndvi_min']

    # Trapezoidal rule with unit spacing, computed directly so the function
    # does not rely on `scipy.integrate.trapz` (removed in recent SciPy).
    features_df['ndvi_auc'] = ((values[:, 1:] + values[:, :-1]) / 2.0).sum(axis=1)

    peak_timing, trough_timing, growth_slope, senescence_slope = (
        _phenology_features(values)
    )
    features_df['peak_timing'] = peak_timing
    features_df['trough_timing'] = trough_timing
    features_df['growth_slope'] = growth_slope
    features_df['senescence_slope'] = senescence_slope

    # One FFT over the whole matrix instead of per-row, per-cell assignment.
    magnitudes = _fft_magnitudes(values, num_fft_components)
    for i in range(num_fft_components):
        features_df[f'fft_mag_{i}'] = magnitudes[:, i]

    features_df['mean_x_std'] = features_df['ndvi_mean'] * features_df['ndvi_std']
    features_df['range_x_peak_timing'] = (
        features_df['ndvi_range'] * features_df['peak_timing']
    )

    features_df = features_df.replace([np.inf, -np.inf], np.nan).fillna(0)

    return features_df

# Build the feature tables for both splits.
X_train_features = create_enhanced_features(train_df_processed, ndvi_cols)
X_test_features = create_enhanced_features(test_df_processed, ndvi_cols)

# Keep only the features present in both splits, in the training table's
# column order. Intersecting two sets of strings yields an arbitrary order that
# can differ between interpreter runs, which makes the model input (and thus
# the fit) non-reproducible.
test_feature_names = set(X_test_features.columns)
common_cols = [col for col in X_train_features.columns if col in test_feature_names]
feature_cols_to_use = [col for col in common_cols if col != 'ID']

X_train_final = X_train_features[feature_cols_to_use]
X_test_final = X_test_features[feature_cols_to_use]
y_train = train_df_processed['class']

# Fit the scaler on the training features only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_test_scaled = scaler.transform(X_test_final)

# Wrap the scaled arrays back into DataFrames so column names and row indices
# are preserved.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_cols_to_use, index=X_train_final.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_cols_to_use, index=X_test_final.index)


print("Training Logistic Regression model...")

logistic_model = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=5000, random_state=42)


# Hyper-parameters explored by the grid search: penalty type and inverse
# regularization strength C.
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100]
}

# Shuffled 5-fold stratified cross-validation with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(logistic_model, param_grid, cv=skf, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled_df, y_train)

best_model = grid_search.best_estimator_
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

print("Making predictions on the test set")
predictions = best_model.predict(X_test_scaled_df)

# Write the predicted class for each test ID.
submission_df = pd.DataFrame({'ID': test_df['ID'], 'class': predictions})
submission_df.to_csv('submission.csv', index=False)

print("'submission.csv' created successfully!")

Applying preprocessing
Preprocessing complete.
Training Logistic Regression model...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found: {'C': 100, 'penalty': 'l1'}
Best cross-validation accuracy: 0.8454
Making predictions on the test set
'submission.csv' created successfully!
