In [1]:
import pandas as pd
import numpy as np

In [10]:
# Read the three lab CSVs (categorical features, numerical features, targets).
# The paths are relative to the notebook's working directory.
categorical, numerical, targets = map(
    pd.read_csv,
    (
        './files_for_lab/categorical.csv',
        './files_for_lab/numerical.csv',
        './files_for_lab/target.csv',
    ),
)

In [6]:
# Total number of missing values over every column of the categorical frame.
categorical.isna().sum().sum()

0

In [8]:
# Total number of missing values over every column of the numerical frame.
numerical.isna().sum().sum()

0

In [12]:
# Total number of missing values over every column of the targets frame.
targets.isna().sum().sum()

0

In [13]:
# Features and targets side by side in a single frame, used for inspection.
data = pd.concat((numerical, categorical, targets), axis='columns')
# Class balance of the binary target.
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [26]:
# Feature matrix: numerical and categorical columns side by side.
X = pd.concat((numerical, categorical), axis='columns')
# Label vector: the binary TARGET_B column.
y = targets.loc[:, 'TARGET_B']

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Numeric and string (object) columns are treated differently, so separate them.
numericalX_train   = X_train.select_dtypes(include=np.number)
numericalX_test    = X_test.select_dtypes(include=np.number)
categoricalX_train = X_train.select_dtypes(include=object)
categoricalX_test  = X_test.select_dtypes(include=object)

# One-hot encode the categoricals so the same dataset can also feed a
# regression later in the lab (a DecisionTree / RandomForest does not need it).
# The encoder is fitted on the training rows only and applied to both splits.
encoder = OneHotEncoder(drop='first')
encoder.fit(categoricalX_train)
# The dense arrays are wrapped in DataFrames carrying the encoder's feature
# names (the original author notes this is needed to avoid an error).
encoded_categorical_train = pd.DataFrame(
    encoder.transform(categoricalX_train).toarray(),
    columns=encoder.get_feature_names_out(),
)
encoded_categorical_test = pd.DataFrame(
    encoder.transform(categoricalX_test).toarray(),
    columns=encoder.get_feature_names_out(),
)

# Min-max scale the numericals for the same reason (again not required by
# tree-based models). The scaler is fitted on the training rows only.
transformer = MinMaxScaler()
transformer.fit(numericalX_train)
scaled_numerical_train = pd.DataFrame(
    transformer.transform(numericalX_train),
    columns=numericalX_train.columns,
)
scaled_numerical_test = pd.DataFrame(
    transformer.transform(numericalX_test),
    columns=numericalX_train.columns,
)

# Re-assemble the treated features; both frames now have a fresh 0..n-1 index.
X_train_treated = pd.concat((scaled_numerical_train, encoded_categorical_train), axis='columns')
X_test_treated  = pd.concat((scaled_numerical_test, encoded_categorical_test), axis='columns')

In [28]:
# Peek at the training labels; they still carry the row labels of the original
# (pre-split) data rather than a 0..n-1 index.
y_train.head()

79401    0
86429    0
76729    1
38838    0
83012    0
Name: TARGET_B, dtype: int64

In [29]:
# Attach the labels to the treated training features. The label index is reset
# so that it lines up row by row with the treated frame's 0..n-1 index.
X_train_aggragrated = pd.concat(
    (X_train_treated, y_train.reset_index(drop=True)),
    axis='columns',
)

In [30]:
# (rows, columns) of the training frame, label column included.
X_train_aggragrated.shape

(76329, 355)

In [33]:
# First rows of the treated training frame, with TARGET_B as the last column.
X_train_aggragrated.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,TARGET_B
0,1.7e-05,0.762887,0.5,0.666667,0.008299,0.0,0.313131,0.10101,0.686869,0.060606,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.536082,0.666667,1.0,0.0,0.0,0.292929,0.242424,0.383838,0.070707,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
2,1.7e-05,0.608247,0.666667,0.111111,0.020747,0.0,0.424242,0.161616,0.626263,0.10101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,1.7e-05,0.783505,0.833333,0.666667,0.037344,0.010101,0.40404,0.232323,0.414141,0.080808,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,5.2e-05,0.556701,0.666667,0.222222,0.087137,0.333333,0.272727,0.292929,0.181818,0.121212,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [38]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows of the training set so that both
# classes end up with the same number of rows. The frame passed in still
# contains the TARGET_B column, so that column is carried through resampling
# and has to be dropped from X_train_resampled before a model is fitted.
#
# The inputs are passed directly instead of being bound to `X` / `y`:
# rebinding those names would overwrite the full feature matrix and label
# vector that the train/test split cell reads, so re-running that cell would
# silently split the wrong data.
ros = RandomOverSampler(sampling_strategy='minority', random_state=0)

# Fit and apply the oversampling to the data
X_train_resampled, y_train_resampled = ros.fit_resample(
    X_train_aggragrated,
    X_train_aggragrated['TARGET_B'],
)

In [39]:
# Class counts after oversampling, read from the TARGET_B column that was
# carried along inside the resampled feature frame.
X_train_resampled['TARGET_B'].value_counts()

0    72486
1    72486
Name: TARGET_B, dtype: int64

In [43]:
# Remove the label column from the resampled features so it cannot be used as
# a predictor. Rebinding the name (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run finds the
# column already gone and does nothing instead of raising KeyError.
X_train_resampled = X_train_resampled.drop(columns='TARGET_B', errors='ignore')

In [44]:
# Class counts of the resampled label vector.
y_train_resampled.value_counts()

0    72486
1    72486
Name: TARGET_B, dtype: int64

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Shallow random forest trained on the oversampled training set.
clf = RandomForestClassifier(
    max_depth=5,           # each tree may ask at most 5 questions in a row
    min_samples_split=20,  # a node needs at least 20 rows to be split further
    min_samples_leaf=20,   # every leaf must hold at least 20 rows
    max_samples=0.8,       # fraction of the training rows drawn for each tree
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Mean accuracy: first on the resampled training data, then on the test set.
for features, labels in (
    (X_train_resampled, y_train_resampled),
    (X_test_treated, y_test),
):
    print(clf.score(features, labels))

y_pred = clf.predict(X_test_treated)
# Test-set class counts, followed by the confusion matrix
# (rows: true class, columns: predicted class).
display(y_test.value_counts(), confusion_matrix(y_test, y_pred))

0.6253897304307039
0.6042550961588848


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[10988,  7095],
       [  457,   543]])

FEATURE SELECTION

In [46]:
# Min-max scale every numerical column so that variances are comparable across
# columns before the variance threshold is applied.
# NOTE(review): the scaler is fitted on the full dataset here, not only on the
# training split — confirm this is acceptable for feature selection.
scaler = MinMaxScaler().fit(numerical)
numerical_scaled = scaler.transform(numerical)

In [47]:
from sklearn.feature_selection import VarianceThreshold

# Columns whose variance (after min-max scaling) does not exceed this value
# are filtered out.
var_threshold = 0.02
sel = VarianceThreshold(threshold=var_threshold).fit(numerical_scaled)

# Shape before and after the filter, to see how many columns survive.
temp = pd.DataFrame(sel.transform(numerical_scaled))
print(numerical_scaled.shape, temp.shape, sep='\n')

(95412, 315)
(95412, 79)


In [48]:
# Boolean mask with one entry per scaled numerical column: True where the
# variance filter keeps the column. The two bare expressions that preceded
# this assignment were evaluated and thrown away, so they are removed.
# .tolist() yields plain Python bools, which display as True/False
# regardless of the installed NumPy version.
var_list = sel.get_support().tolist()
var_list

[False,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 True,

In [49]:
# Pair each numerical column name with its keep/drop flag from the variance filter.
list(zip(numerical.columns, var_list))

[('TCODE', False),
 ('AGE', True),
 ('INCOME', True),
 ('WEALTH1', True),
 ('HIT', False),
 ('MALEMILI', False),
 ('MALEVET', False),
 ('VIETVETS', True),
 ('WWIIVETS', True),
 ('LOCALGOV', False),
 ('STATEGOV', False),
 ('FEDGOV', False),
 ('WEALTH2', True),
 ('POP901', False),
 ('POP902', False),
 ('POP903', False),
 ('POP90C1', True),
 ('POP90C2', True),
 ('POP90C3', True),
 ('POP90C4', False),
 ('POP90C5', False),
 ('ETH1', True),
 ('ETH2', True),
 ('ETH3', False),
 ('ETH4', False),
 ('ETH5', False),
 ('ETH6', False),
 ('ETH7', False),
 ('ETH8', False),
 ('ETH9', False),
 ('ETH10', False),
 ('ETH11', False),
 ('ETH12', False),
 ('ETH13', False),
 ('ETH14', False),
 ('ETH15', False),
 ('ETH16', False),
 ('AGE901', False),
 ('AGE902', False),
 ('AGE903', False),
 ('AGE904', False),
 ('AGE905', False),
 ('AGE906', False),
 ('AGE907', False),
 ('CHIL1', False),
 ('CHIL2', False),
 ('CHIL3', False),
 ('AGEC1', False),
 ('AGEC2', False),
 ('AGEC3', False),
 ('AGEC4', False),
 ('AGEC5', F

In [50]:
# Names of the columns the variance filter rejected (mask entry is False).
# zip() silently truncates when its inputs differ in length, which happens if
# this cell is re-run after columns were already dropped from `numerical`;
# the flags would then be paired with the wrong columns. Fail loudly instead.
if len(var_list) != len(numerical.columns):
    raise ValueError(
        f"variance mask has {len(var_list)} entries but numerical has "
        f"{len(numerical.columns)} columns; re-run from the data-loading cell"
    )
cols_to_remove = [name for name, keep in zip(numerical.columns, var_list) if not keep]

In [51]:
# Show the low-variance columns that are about to be dropped.
cols_to_remove

['TCODE',
 'HIT',
 'MALEMILI',
 'MALEVET',
 'LOCALGOV',
 'STATEGOV',
 'FEDGOV',
 'POP901',
 'POP902',
 'POP903',
 'POP90C4',
 'POP90C5',
 'ETH3',
 'ETH4',
 'ETH5',
 'ETH6',
 'ETH7',
 'ETH8',
 'ETH9',
 'ETH10',
 'ETH11',
 'ETH12',
 'ETH13',
 'ETH14',
 'ETH15',
 'ETH16',
 'AGE901',
 'AGE902',
 'AGE903',
 'AGE904',
 'AGE905',
 'AGE906',
 'AGE907',
 'CHIL1',
 'CHIL2',
 'CHIL3',
 'AGEC1',
 'AGEC2',
 'AGEC3',
 'AGEC4',
 'AGEC5',
 'AGEC6',
 'AGEC7',
 'CHILC1',
 'CHILC2',
 'CHILC3',
 'CHILC4',
 'CHILC5',
 'HHAGE1',
 'HHAGE2',
 'HHAGE3',
 'HHN1',
 'HHN2',
 'HHN4',
 'HHN5',
 'HHN6',
 'MARR1',
 'MARR2',
 'MARR3',
 'MARR4',
 'HHP1',
 'HHP2',
 'DW3',
 'DW7',
 'DW8',
 'DW9',
 'HU3',
 'HU4',
 'HHD1',
 'HHD4',
 'HHD6',
 'HHD7',
 'HHD8',
 'HHD9',
 'HHD10',
 'HHD11',
 'HHD12',
 'ETHC1',
 'ETHC3',
 'ETHC4',
 'ETHC5',
 'ETHC6',
 'HUR1',
 'RHP1',
 'RHP2',
 'RHP3',
 'RHP4',
 'HUPA1',
 'HUPA4',
 'HUPA5',
 'HUPA7',
 'DMA',
 'IC1',
 'IC2',
 'IC3',
 'IC4',
 'IC5',
 'IC7',
 'IC8',
 'IC9',
 'IC10',
 'IC11',
 'IC1

In [52]:
# Drop the low-variance columns. Rebinding the name (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second run
# finds the columns already gone and does nothing instead of raising KeyError.
numerical = numerical.drop(columns=cols_to_remove, errors='ignore')

In [54]:
# (rows, columns) of the numerical frame after the low-variance columns were removed.
numerical.shape

(95412, 79)

In [56]:
# Pairwise Pearson correlation between the remaining numerical columns.
correlation_matrix = numerical.corr(method='pearson')

In [57]:
# Greedy removal of highly correlated features: walk the upper triangle of the
# correlation matrix and, for every pair whose absolute correlation exceeds
# the threshold, mark the later column for removal — unless the earlier
# column of the pair has itself already been marked.
corr_threshold = 0.9
columns_to_drop = []
n_features = len(numerical.columns)

for i in range(n_features):
    column_i = numerical.columns[i]
    # Whether column_i is marked cannot change while scanning its own row
    # (only later columns are appended), so the check is done once per row.
    if column_i in columns_to_drop:
        continue
    for j in range(i + 1, n_features):
        if abs(correlation_matrix.iloc[i, j]) > corr_threshold:
            column_j = numerical.columns[j]
            # A column correlated with several kept columns would otherwise
            # be appended once per partner; record it only the first time.
            if column_j not in columns_to_drop:
                columns_to_drop.append(column_j)

In [58]:
# Show the columns flagged as highly correlated with an earlier, kept column.
columns_to_drop

['DW2',
 'DW5',
 'DW6',
 'HV2',
 'HVP1',
 'HVP6',
 'HV4',
 'HU2',
 'HHD3',
 'HHD5',
 'HVP4',
 'RP2',
 'LFC4',
 'HC8',
 'HC18']

In [59]:
# Drop the highly correlated columns. Rebinding the name (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second run finds the columns already gone and does nothing instead of
# raising KeyError.
numerical = numerical.drop(columns=columns_to_drop, errors='ignore')

RETRYING THE FOREST MODEL WITH FEATURE REDUCTION

In [60]:
# Rebuild the feature matrix from the current numerical frame plus the
# categorical columns, and take TARGET_B as the label vector again.
X = pd.concat((numerical, categorical), axis='columns')
y = targets.loc[:, 'TARGET_B']

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Same 80/20 split and seed as the first run of the model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Split each partition into its numeric and its string (object) columns.
numericalX_train   = X_train.select_dtypes(include=np.number)
numericalX_test    = X_test.select_dtypes(include=np.number)
categoricalX_train = X_train.select_dtypes(include=object)
categoricalX_test  = X_test.select_dtypes(include=object)

# One-hot encode the categoricals so the same dataset can also feed a
# regression later in the lab; a DecisionTree or RandomForest does not need it.
# Fitting uses the training rows only.
encoder = OneHotEncoder(drop='first')
encoder.fit(categoricalX_train)
# Dense arrays are wrapped in DataFrames carrying the encoder's feature names
# (the original author notes this is needed to avoid an error).
encoded_categorical_train = pd.DataFrame(
    data=encoder.transform(categoricalX_train).toarray(),
    columns=encoder.get_feature_names_out(),
)
encoded_categorical_test = pd.DataFrame(
    data=encoder.transform(categoricalX_test).toarray(),
    columns=encoder.get_feature_names_out(),
)

# Min-max scale the numericals for the same reason; tree-based models do not
# need it. Fitting uses the training rows only.
transformer = MinMaxScaler()
transformer.fit(numericalX_train)
scaled_numerical_train = pd.DataFrame(
    data=transformer.transform(numericalX_train),
    columns=numericalX_train.columns,
)
scaled_numerical_test = pd.DataFrame(
    data=transformer.transform(numericalX_test),
    columns=numericalX_train.columns,
)

# Put the scaled numericals and the encoded categoricals back together.
X_train_treated = pd.concat((scaled_numerical_train, encoded_categorical_train), axis='columns')
X_test_treated  = pd.concat((scaled_numerical_test, encoded_categorical_test), axis='columns')

In [62]:
# Attach the labels to the treated training features; the label index is
# reset so it lines up row by row with the treated frame's 0..n-1 index.
X_train_aggragrated = pd.concat(
    (X_train_treated, y_train.reset_index(drop=True)),
    axis='columns',
)

In [63]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows of the training set. The frame
# passed in still contains the TARGET_B column, so that column is carried
# through resampling and has to be dropped from X_train_resampled before a
# model is fitted.
#
# The inputs are passed directly instead of being bound to `X` / `y`:
# rebinding those names would overwrite the full feature matrix and label
# vector that the train/test split cell reads, so re-running that cell would
# silently split the wrong data.
ros = RandomOverSampler(sampling_strategy='minority', random_state=0)

# Fit and apply the oversampling to the data
X_train_resampled, y_train_resampled = ros.fit_resample(
    X_train_aggragrated,
    X_train_aggragrated['TARGET_B'],
)

In [64]:
# Remove the label column from the resampled features so it cannot be used as
# a predictor. Rebinding the name (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run finds the
# column already gone and does nothing instead of raising KeyError.
X_train_resampled = X_train_resampled.drop(columns='TARGET_B', errors='ignore')

In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Same forest configuration as the first run, now fitted on the reduced,
# oversampled training set.
clf = RandomForestClassifier(
    max_depth=5,           # each tree may ask at most 5 questions in a row
    min_samples_split=20,  # a node needs at least 20 rows to be split further
    min_samples_leaf=20,   # every leaf must hold at least 20 rows
    max_samples=0.8,       # fraction of the training rows drawn for each tree
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Mean accuracy: first on the resampled training data, then on the test set.
for features, labels in (
    (X_train_resampled, y_train_resampled),
    (X_test_treated, y_test),
):
    print(clf.score(features, labels))

y_pred = clf.predict(X_test_treated)
# Test-set class counts, followed by the confusion matrix
# (rows: true class, columns: predicted class).
display(y_test.value_counts(), confusion_matrix(y_test, y_pred))

0.6218649118450459
0.6035738615521669


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[10970,  7113],
       [  452,   548]])

Compare with the earlier run, without feature reduction:

0.6253897304307039
0.6042550961588848
0    18083
1     1000
Name: TARGET_B, dtype: int64
array([[10988,  7095],
       [  457,   543]])

My feature selection did not improve the model much; the two results are basically the same.

Discuss the output and its impact on the business scenario. Is the cost of a false positive equal to the cost of a false negative? How would you change your algorithm or data in order to maximize the return of the business?

In [66]:
# Mean of TARGET_D over the rows where TARGET_B equals 1, selected in a single
# .loc call rather than by chaining two indexing operations.
data.loc[data['TARGET_B'] == 1, 'TARGET_D'].mean()

15.624344414619037

Cost of false negatives without feature selection

In [81]:
# 457 false negatives (confusion matrix of the model without feature
# selection) times 15.62, the rounded mean TARGET_D among TARGET_B == 1 rows.
no_f_sel_cost_false_neg = 457 * 15.62

In [80]:
# 10988 true negatives (confusion matrix of the model without feature
# selection) times 0.68 per item.
# NOTE(review): 0.68 is presumably the unit shipping cost — its source is not
# shown in this notebook; confirm.
no_f_sel_shipping_cost_saved = 10988 * 0.68

In [82]:
# Net saving without feature selection: shipping cost saved on true negatives
# minus the cost attributed to false negatives.
no_f_sel_saved_money = no_f_sel_shipping_cost_saved - no_f_sel_cost_false_neg

In [74]:
# 452 false negatives (confusion matrix of the model with feature selection)
# times 15.62, the rounded mean TARGET_D among TARGET_B == 1 rows.
f_sel_cost_false_neg = 452 * 15.62

In [77]:
# 10970 true negatives (confusion matrix of the model with feature selection)
# times 0.68 per item.
# NOTE(review): 0.68 is presumably the unit shipping cost — its source is not
# shown in this notebook; confirm.
f_sel_shipping_cost_saved = 0.68 * 10970

In [78]:
# Net saving with feature selection: shipping cost saved on true negatives
# minus the cost attributed to false negatives. The right-hand side used to
# read f_sel_saved_money itself, which is undefined on a fresh kernel; it now
# mirrors the formula used for the run without feature selection.
f_sel_saved_money = f_sel_shipping_cost_saved - f_sel_cost_false_neg

In [83]:
# Net savings side by side: without feature selection first, then with it.
print(no_f_sel_saved_money, f_sel_saved_money)

333.5000000000009 399.3600000000006
