In [6]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [7]:
path = "/Users/vineethrayadurgam/Desktop/Machine Learning 245/FINAL_LA_FIRE_ML_DATA_MERGED.csv"
df = pd.read_csv(path)

In [8]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')

In [9]:
cols_to_drop = ['DAPR', 'MDPR', 'PGTM'] + [f'WT{str(i).zfill(2)}' for i in range(1, 12)]
cols_to_drop += ['TOBS', 'WDF2', 'WESD', 'WESF', 'WSF2']
existing_cols_to_drop = [col for col in cols_to_drop if col in df.columns]
df = df.drop(columns=existing_cols_to_drop)

In [10]:
df = df.drop(columns=['NAME']) #Same as station column representing the name of the station, redundant column

# Encode STATION using category codes
df['STATION'] = df['STATION'].astype(str)
df['STATION_ENC'] = df['STATION'].astype('category').cat.codes
df = df.drop(columns=['STATION'])

In [12]:
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import StandardScaler
import numpy as np

# Start with a fresh copy of the raw dataset
df_winsor = df.copy()
df_transform = df_winsor.copy()  # We'll work off this for transformations

### Winsorize heavy-tailed columns
winsor_cols = ['PRCP', 'PRCP_prev', 'PRCP_7D', 'AWND', 'AWND_prev', 'AWND_7D', 'fire_count']
for col in winsor_cols:
    df_transform[col + '_win'] = winsorize(df_transform[col], limits=[0.01, 0.01])

### Two-step transformation for precipitation variables
precip_cols = ['PRCP', 'PRCP_prev', 'PRCP_7D']
epsilon = 1e-8
for col in precip_cols:
    win_col = col + '_win'
    df_transform[col + '_nonzero'] = (df_transform[win_col] > 0).astype(int)
    df_transform[col + '_log'] = np.where(
        df_transform[win_col] > 0,
        np.log(df_transform[win_col] + epsilon),
        0
    )

### Clip temperature values (physical plausibility bounds)
temp_cols = ['TMAX', 'TMIN', 'TAVG']
for col in temp_cols:
    df_transform[col + '_clipped'] = np.clip(df_transform[col], -30, 130)

### Standardize the clipped temperatures
scaler = StandardScaler()
df_transform[[col + '_scaled' for col in temp_cols]] = scaler.fit_transform(
    df_transform[[col + '_clipped' for col in temp_cols]]
)

### Drop the raw original columns
cols_to_drop = winsor_cols + temp_cols
df_transform.drop(columns=cols_to_drop, inplace=True)


In [13]:
df_transform = df_transform.drop(columns= ["fire_count_win"])

In [14]:
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Step 1: Drop target
X_pca_input = df_transform.drop(columns=['Fire_Occurred'], errors='ignore').select_dtypes(include=['float64', 'int64'])

# Step 2: Impute + Scale
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()

X_imputed = imputer.fit_transform(X_pca_input)
X_scaled = scaler.fit_transform(X_imputed)

# Step 3: Fit PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Step 4: Wrap into a DataFrame
pca_cols = [f'PC{i+1}' for i in range(X_pca.shape[1])]
df_pca = pd.DataFrame(X_pca, columns=pca_cols)

# Optional: Check variance explained
explained_variance = pca.explained_variance_ratio_
print("Explained Variance per PC:")
for i, v in enumerate(explained_variance[:10]):
    print(f"PC{i+1}: {v:.4f}")

Explained Variance per PC:
PC1: 0.2739
PC2: 0.1911
PC3: 0.1159
PC4: 0.1006
PC5: 0.0921
PC6: 0.0570
PC7: 0.0396
PC8: 0.0269
PC9: 0.0258
PC10: 0.0216


In [15]:
df_transform['fire_risk_index'] = (
    (df_transform['TMAX_scaled'] ** 1.5 + df_transform['AWND_win']) /
    (df_transform['PRCP_7D_win'] + 0.1)
)

In [16]:
df_transform['dryness_score'] = np.log1p(
    np.clip(df_transform['dry_streak'] * df_transform['LST_Day_C'], 0, 20000)
) / (df_transform['PRCP_7D_win'] + 0.1)

In [17]:
df_transform['spread_score'] = np.log1p(
    10 * df_transform['AWND_7D_win'] * df_transform['LST_Day_C'] * df_transform['is_dry']
)

In [18]:
df_transform['wind_temp_combo'] = np.clip((df_transform['AWND_7D_win'] * df_transform['TMAX_scaled']) ** 2, 0, 100)

In [19]:
df_final = pd.concat([
    df_pca.reset_index(drop=True),
    df_transform[[
        'fire_risk_index', 'dryness_score', 'spread_score',
        'wind_temp_combo'
    ]].reset_index(drop=True)
], axis=1)

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import pandas as pd

# Load df_pca and df_transform already processed externally
# Assume df_pca and df_transform are available in memory

# Combine PCA + engineered features for modeling
df_model = pd.concat([
    df_pca[['PC1', 'PC2', 'PC4', 'PC5', 'PC6', 'PC7', 'PC3']],
    df_transform[['dryness_score', 'spread_score', 'Fire_Occurred', 'wind_temp_combo', 'fire_risk_index']]
], axis=1)

# Define features and target
final_features = ['PC1', 'PC2', 'PC4', 'PC5', 'PC6', 'PC7', 'PC3',
                  'dryness_score', 'spread_score', 'wind_temp_combo', 'fire_risk_index']
X = df_model[final_features].copy()
y = df_model['Fire_Occurred'].copy()


In [22]:
# Sample 30k rows for faster experimentation
X_sampled = X.sample(n=30000, random_state=42)
y_sampled = y.loc[X_sampled.index]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_sampled, y_sampled, test_size=0.2, stratify=y_sampled, random_state=42
)

# Loop over different k values for KNNImputer and test F1
f1_results_sampled = []


In [23]:
for k in [2, 3, 5, 7, 10]:
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('imputer', KNNImputer(n_neighbors=k))
    ])

    # Process the data
    X_train_processed = pipeline.fit_transform(X_train)
    X_test_processed = pipeline.transform(X_test)

    # Train SVM
    svm_model = SVC(kernel='rbf', C=10, gamma='scale', class_weight={0: 1, 1: 5}, probability=True, random_state=42)
    svm_model.fit(X_train_processed, y_train)

    # Evaluate
    y_pred = svm_model.predict(X_test_processed)
    f1 = f1_score(y_test, y_pred, pos_label=1)
    f1_results_sampled.append((k, f1))

In [25]:
# Display results
f1_df_sampled = pd.DataFrame(f1_results_sampled, columns=["k (neighbors)", "F1 Score (Fire)"])

In [26]:
print(f1_df_sampled)

   k (neighbors)  F1 Score (Fire)
0              2         0.047009
1              3         0.042463
2              5         0.042105
3              7         0.041929
4             10         0.042105


### highest F1 score gives optimum K to use in KNN for KNN Imputing before training svm