In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from scipy.ndimage import median_filter

# 1. Load Data
# Read the labelled training split and the unlabelled test split with
# pandas' default CSV settings.
train_df = pd.read_csv(filepath_or_buffer='/content/hacktrain.csv')
test_df = pd.read_csv(filepath_or_buffer='/content/hacktest.csv')

In [9]:
# Preview the first five training rows as a quick sanity check of the load.
train_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,water,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,...,,-1043.16,-1942.49,267.138,,,211.328,-2203.02,-1180.19,433.906
1,1,2,water,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,3,4,water,58.0174,-1599.16,,-1052.63,,-1564.63,,...,-1025.88,368.622,,-1227.8,304.621,,369.214,-2202.12,,-1343.55
3,4,5,water,72.518,,380.436,-1256.93,515.805,-1413.18,-802.942,...,-1813.95,155.624,,-924.073,432.15,282.833,298.32,-2197.36,,-826.727
4,7,8,water,1136.44,,,1647.83,1935.8,,2158.98,...,1535.0,1959.43,-279.317,-384.915,-113.406,1020.72,1660.65,-116.801,-568.05,-1357.14


In [10]:
# 2. Identify Columns
# Bookkeeping columns that are carried through untouched: the row identifier
# and the target label.
non_numeric_cols = ['ID', 'class']

# NDVI time-series columns (numeric): every header that contains the '_N'
# marker, kept in the order it appears in the training frame.
ndvi_cols = list(filter(lambda header: '_N' in header, train_df.columns))

# Everything else is neither a feature nor the target and is slated for removal.
cols_to_drop = [
    header for header in train_df.columns
    if not (header in ndvi_cols or header in non_numeric_cols)
]


# 4. Denoising
def denoise_ndvi(df, cols=None, size=3):
    """Smooth each row's NDVI series with a sliding median, tolerating gaps.

    ``scipy.ndimage.median_filter`` is not NaN-aware, so a missing reading
    inside a window makes the filtered value unreliable. To keep NaN out of
    the windows, gaps are first bridged by linear interpolation along the row
    (leading/trailing gaps take the nearest observed value). After filtering,
    the cells that were missing on input are set back to NaN, so this function
    only denoises and leaves imputation to the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the series columns. It is modified in place.
    cols : sequence of str, optional
        Columns to smooth; adjacent entries are treated as neighbouring
        time steps. Defaults to the module-level ``ndvi_cols``.
    size : int, default 3
        Length of the median window along the time axis.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``cols`` replaced by smoothed values.
    """
    if cols is None:
        cols = ndvi_cols
    cols = list(cols)
    if not cols or len(df) == 0:
        # Nothing to filter; avoid handing an empty array to scipy.
        return df

    raw = df[cols].to_numpy(dtype=float)
    missing = np.isnan(raw)

    # Interpolate down axis 0 of the transposed array, i.e. along each row's
    # time axis. Rows with no observations at all simply stay NaN.
    bridged = (
        pd.DataFrame(raw.T)
        .interpolate(method='linear', axis=0, limit_direction='both')
        .to_numpy()
        .T
    )

    # A window of (1, size) never mixes values from different rows.
    smoothed = median_filter(bridged, size=(1, size), mode='reflect')

    # Hand the gaps back untouched so downstream imputation still sees them.
    df[cols] = np.where(missing, np.nan, smoothed)
    return df

# Smooth the NDVI series of both splits, training first and then test.
train_df = denoise_ndvi(df=train_df)
test_df = denoise_ndvi(df=test_df)

def create_features(df, cols=None, summer_months=('05', '06', '07', '08', '09')):
    """Append per-row summary features computed from the NDVI time series.

    Adds ``max_ndvi``, ``min_ndvi``, ``mean_ndvi``, ``summer_mean`` and
    ``ndvi_amplitude`` (max minus min) to ``df``. pandas skips NaN in these
    reductions, so a row's statistics use only the readings it has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the series columns. It is modified in place.
    cols : sequence of str, optional
        Series columns whose names start with a ``YYYYMMDD`` stamp.
        Defaults to the module-level ``ndvi_cols``.
    summer_months : sequence of str, optional
        Two-digit month codes that make up the growing season
        (May-September by default).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns attached.
    """
    if cols is None:
        cols = ndvi_cols
    cols = list(cols)

    def _month_of(col):
        # Read only the MM field of a leading YYYYMMDD stamp. A substring
        # search over the whole name would also hit the day or year digits
        # (e.g. '07' in '20140407_N', which is 7 April).
        stamp = str(col)[:8]
        if len(stamp) == 8 and stamp.isdigit():
            return stamp[4:6]
        return None

    series = df[cols]

    # Annual statistics
    df['max_ndvi'] = series.max(axis=1)
    df['min_ndvi'] = series.min(axis=1)
    df['mean_ndvi'] = series.mean(axis=1)

    # Growing season features (May-Sept)
    summer_cols = [col for col in cols if _month_of(col) in summer_months]
    df['summer_mean'] = df[summer_cols].mean(axis=1)

    # Phenological metrics
    df['ndvi_amplitude'] = df['max_ndvi'] - df['min_ndvi']
    return df

# Attach the engineered summary features to both splits, training first.
train_df = create_features(df=train_df)
test_df = create_features(df=test_df)



# 6. Handle Missing Values (numeric only)
# Numeric columns present after feature engineering, excluding the ID/target
# bookkeeping columns.
new_numeric_cols = [col for col in train_df.columns
                   if pd.api.types.is_numeric_dtype(train_df[col])
                   and col not in non_numeric_cols]

# Median imputation only for numeric features.
# The medians are learned from the training split and reused for the test
# split, so both splits are filled with the same values and no statistic is
# derived from the test data (the scaler below is likewise fitted on train only).
# They must be computed before the training frame is filled.
train_medians = train_df[new_numeric_cols].median()
train_df[new_numeric_cols] = train_df[new_numeric_cols].fillna(train_medians)
test_df[new_numeric_cols] = test_df[new_numeric_cols].fillna(train_medians)

# 7. Prepare Data
# cols_to_drop lists every column that is neither an NDVI reading nor
# ID/class. Such columns (e.g. leftover CSV row-index columns) can be numeric
# and therefore appear in new_numeric_cols, so they are filtered out here;
# dropping them from the frames further down happens too late to keep them
# out of the feature matrices.
feature_cols = [col for col in new_numeric_cols if col not in cols_to_drop]
X_train = train_df[feature_cols]
X_test = test_df[feature_cols]

# Encode the string class labels as integers for the classifier.
le = LabelEncoder()
y_train = le.fit_transform(train_df['class'])

# Drop unnecessary columns
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop)

# 8. Scale Features
# Standardisation statistics are estimated on the training matrix only and
# then applied unchanged to both matrices.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 9. Train Model
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5 (passing it emits a FutureWarning and it is scheduled for
# removal). With the lbfgs solver and three or more classes the default
# already fits a multinomial model.
# NOTE(review): assumes the target has >= 3 classes - confirm against the data.
model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',  # reweight classes inversely to their frequency
    random_state=42
)
model.fit(X_train_scaled, y_train)

# 10. Predict
# Predict integer class codes for the test matrix and map them back to the
# original class names.
test_preds = le.inverse_transform(
    model.predict(X_test_scaled)
)

# Pair each test ID with its predicted label and write the table to disk
# without the pandas index.
results = pd.DataFrame(dict(ID=test_df['ID'], predicted_class=test_preds))
results.to_csv(path_or_buf='predictions.csv', index=False)

