In [1]:
import numpy as np
import pandas as pd

# More tools here
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingRegressor, BaggingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import StratifiedKFold, RepeatedKFold, LeaveOneOut, KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, mean_absolute_error, root_mean_squared_error
from sklearn.preprocessing import OneHotEncoder

## Question 13

In [2]:
# Read the wildfire observations from the CSV in the working directory.
WILDFIRE_CSV = 'wildfires.csv'
wildfire_data = pd.read_csv(WILDFIRE_CSV)

# Preview the first five rows (the cell's last expression is what gets displayed).
wildfire_data.head(n=5)

Unnamed: 0,x,y,temp,humidity,windspd,winddir,rain,days,vulnerable,other,ranger,pre1950,heli,resources,traffic,burned,wlf
0,7.834467,8.306801,99.506964,65.940704,7.614523,W,3.7e-05,127,1157.377161,0,0,1,0,117.067076,med,791.620319,0
1,2.694922,3.551933,69.887657,31.895045,6.534184,E,4e-05,115,1134.429689,0,1,0,1,127.598019,hi,451.951898,0
2,6.498186,4.106111,91.15293,57.606073,11.580965,SE,4.1e-05,119,1209.603068,0,0,0,1,132.273679,hi,584.451361,1
3,8.750841,8.887995,54.360593,46.16672,15.383351,E,4e-05,112,1118.691631,0,0,0,0,116.482609,hi,589.681584,1
4,9.20021,9.810147,77.442791,25.490945,7.096639,NW,4.5e-05,146,1319.237687,0,0,1,0,136.52175,lo,1010.567058,0


In [3]:
# Split the frame into model inputs and the regression target ("burned").
# "Unnamed: 0" is the row index that was saved into the CSV (it reads 0, 1, 2, ...
# in the preview). It identifies a row rather than describing it, so it is removed
# together with the target instead of being fed to the model as a feature.
# errors="ignore" keeps the cell working if the file is re-exported without that
# column; a missing "burned" column still fails loudly on the next line.
predictors = wildfire_data.drop(columns=["burned", "Unnamed: 0"], errors="ignore")
target = wildfire_data["burned"]

# separate numerical and non-numerical predictors for encoding
numerical_vars = predictors.select_dtypes(exclude=["object"])
non_numerical_vars = predictors.select_dtypes(include=["object"])
print(non_numerical_vars.columns)

Index(['winddir', 'traffic'], dtype='object')


In [4]:
# One-hot encode the non-numerical predictors. drop='first' omits one level per
# variable so the dummy columns of a variable are not perfectly collinear.
encoder = OneHotEncoder(drop='first', sparse_output=False)
non_num_vars_encoded = encoder.fit_transform(non_numerical_vars)

# Carry over the source frame's index. pd.concat(axis=1) aligns rows by index
# label, not by position, so giving the encoded frame a fresh 0..n-1 index would
# only line up when the source index happens to be 0..n-1 as well; otherwise the
# result would contain extra, NaN-padded rows.
encoded_df = pd.DataFrame(
    non_num_vars_encoded,
    columns=encoder.get_feature_names_out(non_numerical_vars.columns),
    index=non_numerical_vars.index,
)
predictors = pd.concat([numerical_vars, encoded_df], axis=1)

In [5]:
# Hold out 10% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.1, random_state=3
)

# Fit the scaler on the training rows only, then apply that same transformation
# to both partitions. The transformed arrays are wrapped back into DataFrames so
# the column names are kept.
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [None]:
model = KNeighborsRegressor()

# Stage 1 (given coarse grid): n_neighbors = np.arange(1, 101, 5), both weightings
#   -> best CV MAE 112.3885 at n_neighbors = 11, weights = "distance".
# Stage 2 (fine grid): the coarse grid moves in steps of 5, so a winner at 11 only
# narrows the best value down to the range between its neighbouring grid points,
# 6 and 16. Search every integer in that bracket; a grid that stops at 10 would
# leave out 11 itself and never try 12-15.
grid = {
    "n_neighbors": np.arange(6, 17),
    "weights": ["uniform", "distance"],
}

# 5-fold CV repeated twice (10 fits per candidate); seeded so the folds are repeatable.
cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=3)

# The scorer is the negated MAE (GridSearchCV maximises), hence the sign flip
# when the best score is printed below.
gscv = GridSearchCV(model, grid, cv=cv, scoring="neg_mean_absolute_error", n_jobs=-1)

gscv.fit(X_train_scaled, y_train)

print(gscv.best_params_)
print(-gscv.best_score_)

{'n_neighbors': np.int64(8), 'weights': 'distance'}
111.81358946933385


In [22]:
# Take the refitted best model from the grid search and score it once on the
# held-out test rows (mean absolute error).
tuned_knn_model = gscv.best_estimator_
y_pred = tuned_knn_model.predict(X_test_scaled)
test_mae = mean_absolute_error(y_test, y_pred)
print(test_mae)

104.15105904683168


## Question 19

In [30]:
# Read the training and test CSVs for the heart data (already split into two files).
heart_train, heart_test = (
    pd.read_csv(csv_path) for csv_path in ('train_heart.csv', 'test_heart.csv')
)

# Preview the first five training rows.
heart_train.head(n=5)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,64,2,0,0,0,0,1,0,254,196.0,119.0,35.22,100,79,0
1,0,35,2,0,0,0,0,0,0,248,107.0,73.0,20.64,90,80,0
2,1,38,2,0,0,0,1,0,0,224,127.5,81.0,20.39,80,90,0
3,1,60,1,0,0,0,0,1,0,226,155.0,92.5,30.85,80,87,0
4,1,48,4,0,0,0,0,0,0,233,138.0,88.5,23.62,86,68,0


In [31]:
# Columns that must not be used as features:
#   - "TenYearCHD" is the label being predicted;
#   - "Unnamed: 0" is the row index saved into the CSV (0, 1, 2, ... in the
#     preview). It identifies a row rather than describing it, and a model that
#     is allowed to see it can split on row position.
# errors="ignore" tolerates a file that was exported without the index column;
# a missing label column still fails loudly when y_* is selected.
NON_FEATURE_COLUMNS = ["TenYearCHD", "Unnamed: 0"]

X_train = heart_train.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")
y_train = heart_train["TenYearCHD"]

X_test = heart_test.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")
y_test = heart_test["TenYearCHD"]

In [41]:
# Joint search over tree depth and decision threshold: among all combinations
# whose cross-validated recall reaches MIN_RECALL, pick the one with the highest
# cross-validated accuracy.
max_depth_range = range(2, 11)
thresholds = np.arange(0, 1.01, 0.01)
MIN_RECALL = 0.3  # a combination is only eligible if its recall is at least this

# Eligible combinations are collected as plain rows and turned into a DataFrame
# once, after the loops. Appending to a DataFrame one row at a time re-allocates
# the frame on every insert and starts from untyped (object) columns.
eligible_rows = []

for max_depth in max_depth_range:
    model = DecisionTreeClassifier(max_depth=max_depth, random_state=6)
    # Out-of-fold predicted probabilities (5-fold CV); column 1 is kept.
    # They depend only on the depth, so they are computed once per depth and
    # reused for every threshold.
    y_pred_proba = cross_val_predict(model, X_train, y_train, cv=5, method="predict_proba")[:, 1]

    for threshold in thresholds:
        # Predict class 1 only when the probability is strictly above the threshold.
        y_pred = (y_pred_proba > threshold).astype(int)
        recall = recall_score(y_train, y_pred)
        accuracy = accuracy_score(y_train, y_pred)

        if recall >= MIN_RECALL:
            eligible_rows.append([max_depth, threshold, recall, accuracy])

cv_results = pd.DataFrame(eligible_rows, columns=['Max Depth', 'Threshold', 'Recall', 'Accuracy'])

# Fail with an explicit message instead of an opaque idxmax error on an empty frame.
if cv_results.empty:
    raise ValueError(f"No (max_depth, threshold) combination reached recall >= {MIN_RECALL}")

# idxmax returns the first row with the maximum accuracy, so ties go to the
# smallest depth and, within a depth, the smallest threshold.
best_index = cv_results["Accuracy"].idxmax()
print(cv_results.loc[best_index])


Max Depth    4.000000
Threshold    0.240000
Recall       0.326829
Accuracy     0.798322
Name: 73, dtype: float64


In [43]:
# Look up the selected row once and read both tuned values from it.
best_row = cv_results.loc[best_index]
best_threshold = best_row["Threshold"]
best_max_depth = int(best_row["Max Depth"])  # stored as a float in the row; the tree needs an int

# Fit a tree of the selected depth on the full training set.
tuned_tree_model = DecisionTreeClassifier(max_depth=best_max_depth, random_state=6).fit(X_train, y_train)

# Apply the tuned threshold (strictly greater than) to the column-1 probabilities.
y_test_pred_proba = tuned_tree_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_pred_proba > best_threshold).astype(int)

# Report test accuracy first, then test recall.
for metric in (accuracy_score, recall_score):
    print(metric(y_test, y_test_pred))

0.7735229759299781
0.35374149659863946
