In [1]:
import pandas as pd
import numpy as np

### Columns which I think are not useful or hard to utilize: 

- year: Storm year
- county: Impacted county name
- abrState: State postal abreviation
- name: Storm name
- numInMonth: Storm sequence in storm's month 1 (all 1's)
- numInSeason: Storm sequence in storm's season (all 1's)
- recoverCount: Storm sequence in previous storm's recovery period (all 1's)
- takeupTotal: County take up rate in month 0

### One-hot encoding variables: 

- month: Storm month
- CAT: Storm category over county

In [39]:
raw = pd.read_csv('sampleStruc.csv')

# Drop columns mentioned above
raw.drop(['year','county','abrState','name','numInMonth', 'numInSeason','recoverCount','takeupTotal'], axis=1, inplace=True)

# One-hot encoding
dummy_month = pd.get_dummies(raw.month)
dummy_CAT = pd.get_dummies(raw.CAT)
data = pd.concat([raw, dummy_month, dummy_CAT], axis=1, sort=False)
data.drop(['month','CAT'], axis=1, inplace=True)

In [40]:
data.columns

Index([           'recovery',            'prevProd',          'Production',
                      'vmax',                'mslp',                'time',
               'ratePoverty',    'houseMedianValue',       'houseOccupied',
                'houseTotal', 'sumBuildingCoverage',         'policyCount',
                           5,                     6,                     7,
                           8,                     9,                    10,
                        'EX',                  'H1',                  'H2',
                        'H3',                  'H4',                  'H5',
                        'LO',                  'SS',                  'TD',
                        'TS'],
      dtype='object')

### Models 

2-layer model framework: <br>
  First layer: classification for whether obs is an outlier (set threshold = 12, minority class ratio = 10%) <br>
  Second layer: regression

In [41]:
#Train valid test split
from sklearn.model_selection import train_test_split

def outlier_label(df, i):
    if df.iloc[i].recovery >= 12:
        return 1
    else:
        return 0

data['outlier'] = [outlier_label(data, i) for i in range(len(data))]

In [53]:
X, y = data.drop(['recovery'], axis=1), data[['recovery', 'outlier']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 1st layer: classification random forest

Output model name: **layer1_rf**

In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_curve, auc
from sklearn.metrics import classification_report

X_train_1 = X_train.drop(['outlier'], axis=1)
y_train_1 = X_train['outlier']
X_test_1 = X_test.drop(['outlier'], axis=1)
y_test_1 = X_test['outlier']

parameters = {'n_estimators':[50,100], 'max_depth':[None, 5, 10]}
layer1_rf = GridSearchCV(RandomForestClassifier(random_state=1, oob_score=True, class_weight='balanced_subsample'), parameters, scoring='f1')
layer1_rf.fit(X_train_1, y_train_1)

y_pred_rf = layer1_rf.predict(X_test_1)
y_pred_proba_rf = layer1_rf.predict_proba(X_test_1)[::,1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test_1, y_pred_proba_rf)
auc_rf = auc(fpr_rf, tpr_rf)

print(classification_report(y_test_1, y_pred_rf))
print('F1 score:', f1_score(y_test_1,y_pred_rf))
print('AUC:', auc_rf)
print('Accuracy:', accuracy_score(y_test_1,y_pred_rf))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94        74
           1       0.67      0.20      0.31        10

    accuracy                           0.89        84
   macro avg       0.78      0.59      0.62        84
weighted avg       0.87      0.89      0.87        84

F1 score: 0.30769230769230765
AUC: 0.7878378378378378
Accuracy: 0.8928571428571429


### Second layer (a): non-outliers

Decision Tree Regression, Random Forest Regression, Gradient Boosting Regression <br>
All conducted on 5 folds cross-validation

In [61]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

X_train_2a = X_train[X_train['outlier']==0]
y_train_2a = y_train[X_train['outlier']==0]['recovery']
X_train_2a.drop(['outlier'], axis=1, inplace=True)

# DT
layer2a_dt = DecisionTreeRegressor(max_depth=5, random_state=0)
print('Decision Tree R2:', cross_val_score(layer2a_dt, X_train_2a, y_train_2a, scoring='r2').mean())

# RF
layer2a_rf = RandomForestRegressor(max_depth=5, random_state=0)
print('Random Forest R2:', cross_val_score(layer2a_rf, X_train_2a, y_train_2a, scoring='r2').mean())

# GB
layer2a_gb = GradientBoostingRegressor(random_state=0)
print('Gradient Boosting R2:', cross_val_score(layer2a_gb, X_train_2a, y_train_2a, scoring='r2').mean())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Decision Tree R2: -0.2478770501790999
Random Forest R2: 0.08899012183454477
Gradient Boosting R2: -0.08162386340013952


### Second layer (b): outliers

In [62]:
X_train_2b = X_train[X_train['outlier']==1]
y_train_2b = y_train[X_train['outlier']==1]['recovery']
X_train_2b.drop(['outlier'], axis=1, inplace=True)

# DT
layer2b_dt = DecisionTreeRegressor(max_depth=5, random_state=0)
print('Decision Tree R2:', cross_val_score(layer2b_dt, X_train_2b, y_train_2b, scoring='r2').mean())

# RF
layer2b_rf = RandomForestRegressor(max_depth=5, random_state=0)
print('Random Forest R2:', cross_val_score(layer2b_rf, X_train_2b, y_train_2b, scoring='r2').mean())

# GB
layer2b_gb = GradientBoostingRegressor(random_state=0)
print('Gradient Boosting R2:', cross_val_score(layer2b_gb, X_train_2b, y_train_2b, scoring='r2').mean())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Decision Tree R2: -3.0664889703850564
Random Forest R2: -0.6931668173336873
Gradient Boosting R2: -1.2580561468099267


### Test Algorithm

In [67]:
X_test.drop(['outlier'], axis=1, inplace=True)
y_test.drop(['outlier'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [80]:
def prediction(X_test, outlier_ind_lst):
    y_pred = []
    
    layer2a_rf.fit(X_train_2a, y_train_2a)
    y_pred_a = layer2a_rf.predict(X_test)
    
    layer2b_rf.fit(X_train_2b, y_train_2b)
    y_pred_b = layer2b_rf.predict(X_test)
    
    for i in range(len(X_test)):
        if outlier_ind_lst[i] == 0:
            y_pred.append(y_pred_a[i])
        elif outlier_ind_lst[i] == 1:
            y_pred.append(y_pred_b[i])
    return y_pred

In [81]:
from sklearn.metrics import mean_squared_error, r2_score

outlier_pred = layer1_rf.predict(X_test)
y_pred = prediction(X_test, outlier_pred)

print('Test R2 score:', r2_score(y_test,y_pred))
print('Test MSE:', mean_squared_error(y_test, y_pred))

Test R2 score: -0.16831842684318699
Test MSE: 131.3000490530202


#### Comparison with single layer regression

In [93]:
X_train_single = X_train.drop(['outlier'], axis=1)
y_train_single = y_train.drop(['outlier'], axis=1)

single_rf = RandomForestRegressor(random_state=1, max_depth=5)
single_rf.fit(X_train_single, y_train_single)
y_pred_single = single_rf.predict(X_test)

print('Single Layer R2 score:', r2_score(y_test,y_pred_single))
print('Single Layer MSE:', mean_squared_error(y_test, y_pred_single))

Single Layer R2 score: -0.25234114674084673
Single Layer MSE: 140.74284049639408


  """
