In [2]:
import os
import sys
import pandas as pd
import geopandas as gpd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    precision_recall_curve,
    ConfusionMatrixDisplay,
    roc_curve,
    RocCurveDisplay,
    auc,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import RandomizedSearchCV, train_test_split

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

from scipy.stats import randint
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
import matplotlib.pyplot as plt

from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

sys.path.append("../utils")

In [3]:
data = pd.read_csv(
    "/capstone/wildfire_prep/data/PUZZLE_PIECES/assembled_puzzle.csv"
).drop(columns="basemap_id")

In [8]:
# Remove Mosaiks from dataset for testing

mosaiks86 = data[
    [
        "inspection_id",
        "maj_landcover_code_y",
        "status",
        "structure_code",
        "current_month_rain",
        "previous_month_rain",
        "two_months_prior_rain"
    ]
]

data = mosaiks86
data.columns

Index(['inspection_id', 'maj_landcover_code_y', 'status', 'structure_code',
       'current_month_rain', 'previous_month_rain', 'two_months_prior_rain'],
      dtype='object')

In [None]:
X = data.drop("status", axis=1)
y = data["status"]

upsample_data_ids = X.inspection_id

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

train = pd.concat([X_train, y_train], axis=1)

majority = train[train.status == 0]
minority = train[train.status == 1]

n_majority_desired = len(minority) * 3
majority_downsampled = majority.sample(
    n=n_majority_desired, replace=False, random_state=42
)

train_balanced = (
    pd.concat([majority_downsampled, minority])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

X_train = train_balanced.drop("status", axis=1).drop(columns="inspection_id")
y_train = train_balanced["status"]

X_test = X_test.drop(columns="inspection_id")

print(f"Shape of compliant:", majority.shape)
print(f"Shape of non-compliant:", minority.shape)

print(f"N_minority desired:", int(len(majority) * 3))

print("Shapes:")
print(" Downsampled X_train:", X_train.shape)
print(" Downsampled y_train:", y_train.shape)
print(" Downsampled X_test: ", X_test.shape)
print(" Downsampled y_test: ", y_test.shape)

print("\nTrain distribution after up-sampling:")
print(y_train.value_counts(normalize=True))

X_train


Shapes:
  X_train: (1337, 4008)
  y_train: (1337,)
  X_test:  (335, 4008)
  y_test:  (335,)

Train distribution after downsampling:
status
0    0.750187
1    0.249813
Name: proportion, dtype: float64


Unnamed: 0,X_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,...,X_3998,X_3999,maj_landcover_code_x,maj_landcover_code_y,mean_ndvi,structure_code,current_month_rain,previous_month_rain,two_months_prior_rain,total_rain_three_months_prior
263,0.905372,0.0,0.463061,0.000000,1.886351,0.218102,0.0,0.846962,0.314425,0.072376,...,8.630460,4.673191,582.0,583,0.296054,100,0.01,,,
1174,0.684678,0.0,0.411536,0.000000,1.702152,0.190052,0.0,0.884239,0.345773,0.061749,...,8.364141,4.439219,303.0,557,0.311666,102,,0.03,0.12,
852,0.565098,0.0,0.244072,0.000000,1.397524,0.053152,0.0,0.507633,0.047321,0.004118,...,7.250310,3.956442,303.0,582,0.318141,102,0.23,0.05,7.20,7.48
1377,0.551114,0.0,0.380738,0.008085,1.375016,0.255696,0.0,0.667960,0.307508,0.101963,...,7.452064,4.018713,303.0,581,-0.032437,101,0.02,0.02,,
1000,0.577466,0.0,0.383065,0.000237,1.404185,0.225870,0.0,0.861246,0.377799,0.100622,...,7.692123,3.993739,582.0,581,0.196850,102,0.01,0.51,0.92,1.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1560,0.761893,0.0,0.361810,0.011677,1.540986,0.198266,0.0,0.647898,0.297218,0.108509,...,7.620576,4.148877,557.0,582,0.255376,102,1.39,,,
275,0.567275,0.0,0.386197,0.000000,1.351602,0.208301,0.0,0.792596,0.257522,0.025489,...,7.412181,3.893932,39.0,581,0.344665,101,0.10,,0.15,
701,0.459184,0.0,0.220497,0.000000,1.271045,0.104797,0.0,0.578228,0.136149,0.011004,...,7.113483,3.831665,583.0,581,0.190989,102,1.01,0.03,0.01,1.05
425,0.605536,0.0,0.298521,0.003151,1.470883,0.163972,0.0,0.562979,0.151078,0.071432,...,7.478241,4.158309,583.0,556,0.128669,100,,,0.01,


In [None]:
X = data.drop("status", axis=1)
y = data["status"]

# Save inspection_ids from the data, in case we need them later
upsample_data_ids = X.inspection_id

# split data, as usual. 20% test data split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Recombine just the train fold
train = pd.concat([X_train, y_train], axis=1)

# Up-sample the minority inside train
majority = train[train.status == 0]
minority = train[train.status == 1]
n_min = int(len(majority) / 3)

minority_upsampled = minority.sample(n=n_min, replace=True, random_state=42)

train_balanced = (
    pd.concat([majority, minority_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

X_train = train_balanced.drop("status", axis=1).drop(columns="inspection_id")
y_train = train_balanced["status"]
X_test = X_test.drop(columns="inspection_id")

# Check majority/minority sizes
print(f"Shape of compliant:", majority.shape)
print(f"Shape of non-compliant:", minority.shape)

print(f"N_minority desired:", int(len(majority) / 3))

print("Shapes:")
print(" Upsampled X_train:", X_train.shape)
print(" Upsampled y_train:", y_train.shape)
print(" Upsampled X_test: ", X_test.shape)
print(" Upsampled y_test: ", y_test.shape)

print("\nTrain distribution after up-sampling:")
print(y_train.value_counts(normalize=True))

X_train

Shape of compliant: (67162, 4010)
Shape of non-compliant: (418, 4010)
N_minority desired: 22387
Shape of non-compliant upsampled: (22387, 4010)
Shape of upsampled training data: (89549, 4010)
Shapes:
 Upsampled X_train: (71639, 4008)
 Upsampled y_train: (71639,)
 Upsampled X_test:  (17910, 4008)
 Upsampled y_test:  (17910,)

Train distribution after up-sampling:
status
0    0.749997
1    0.250003
Name: proportion, dtype: float64


Unnamed: 0,X_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,...,X_3998,X_3999,maj_landcover_code_x,maj_landcover_code_y,mean_ndvi,structure_code,current_month_rain,previous_month_rain,two_months_prior_rain,total_rain_three_months_prior
12408,0.234782,0.0,0.199884,0.000000,1.001444,0.084475,0.0,0.578815,0.129434,0.002561,...,6.630246,3.508047,304.0,582,0.300603,101,,2.56,0.01,
20205,0.366751,0.0,0.281059,0.002908,0.918135,0.174326,0.0,0.580405,0.332229,0.070729,...,6.283739,3.311795,581.0,582,0.580423,102,0.03,2.33,0.01,2.37
703,0.457277,0.0,0.369603,0.005987,1.308423,0.265479,0.0,0.784860,0.353869,0.091935,...,7.416942,3.905155,581.0,582,0.268931,102,0.01,,,
82084,0.153808,0.0,0.138017,0.000350,0.790564,0.077879,0.0,0.474470,0.096333,0.001487,...,6.039326,3.230875,582.0,581,0.221757,102,0.14,0.30,0.02,0.46
37554,0.699233,0.0,0.421262,0.000017,1.419048,0.202724,0.0,0.738152,0.221872,0.060353,...,7.264755,3.847027,,581,0.126517,102,0.07,0.01,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33033,1.071834,0.0,0.543664,0.006707,1.946876,0.272266,0.0,0.883787,0.285899,0.107478,...,8.510991,4.543437,304.0,582,0.214708,102,0.39,,0.40,
59193,0.033401,0.0,0.042321,0.000000,0.307716,0.038443,0.0,0.176424,0.059110,0.003028,...,4.485913,2.573127,582.0,582,0.242261,102,,,,
59285,0.147926,0.0,0.064140,0.000000,0.295015,0.021248,0.0,0.203973,0.107248,0.001626,...,4.166584,2.308218,581.0,581,0.502475,102,,,0.04,
88866,0.429087,0.0,0.311793,0.000000,1.507914,0.255528,0.0,0.947954,0.354570,0.027585,...,8.126602,4.264431,583.0,556,0.141627,102,0.01,0.51,0.92,1.44


In [18]:
X = data.drop("status", axis=1)
y = data["status"]

upsample_data_ids = X.inspection_id

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

train = pd.concat([X_train, y_train], axis=1)

majority = train[train.status == 0]
minority = train[train.status == 1]

n_majority_desired = int(len(majority) * 0.9)
majority_downsampled = majority.sample(
    n=n_majority_desired, replace=False, random_state=42
)

n_min = int(len(majority_downsampled) * 0.05)

minority_upsampled = minority.sample(n=n_min, replace=True, random_state=42)

train_balanced = (
    pd.concat([majority_downsampled, minority_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

X_train = train_balanced.drop("status", axis=1).drop(columns="inspection_id")
y_train = train_balanced["status"]
X_test = X_test.drop(columns="inspection_id")

# Check majority/minority sizes
print(f"Shape of compliant:", majority.shape)
print(f"Shape of non-compliant:", minority.shape)

print(f"N_minority desired:", int(len(majority_downsampled) / 3))

print(f"Shape of compliant:", majority_downsampled.shape)
print(f"Shape of non-compliant:", minority_upsampled.shape)

print("Shapes:")
print(" Upsampled X_train:", X_train.shape)
print(" Upsampled y_train:", y_train.shape)
print(" Upsampled X_test: ", X_test.shape)
print(" Upsampled y_test: ", y_test.shape)

print("\nTrain distribution after up-sampling:")
print(y_train.value_counts(normalize=True))

X_train

Shape of compliant: (53730, 7)
Shape of non-compliant: (334, 7)
N_minority desired: 16119
Shape of compliant: (48357, 7)
Shape of non-compliant: (2417, 7)
Shapes:
 Upsampled X_train: (50774, 5)
 Upsampled y_train: (50774,)
 Upsampled X_test:  (13516, 5)
 Upsampled y_test:  (13516,)

Train distribution after up-sampling:
status
0    0.952397
1    0.047603
Name: proportion, dtype: float64


Unnamed: 0,maj_landcover_code_y,structure_code,current_month_rain,previous_month_rain,two_months_prior_rain
0,304,100,,,0.28
1,582,101,,,
2,582,102,0.03,0.01,0.02
3,582,102,,0.01,
4,304,101,,0.24,2.48
...,...,...,...,...,...
50769,557,102,1.09,,
50770,303,102,,,
50771,556,102,0.07,0.01,
50772,303,102,0.06,1.88,0.16


In [6]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Length of y_pred:", len(y_pred))
print("Accuracy:", accuracy)


KeyboardInterrupt: 