# Women in Data Science - 2018 - Kaggle/Stanford Uni

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

### Preprocessing

Clean and Transform dataset for use with sklearn and tensorflow. 

In [2]:
df = pd.read_csv("Data/train.csv", low_memory=False)
df_test = pd.read_csv("Data/test.csv", low_memory=False)
df_dict = pd.read_csv('Data/WiDS data dictionary v2.csv')

Y = df['is_female'] # Label
X = df.drop(['is_female','train_id'], axis = 1)
X = X.dropna(axis=1, how='all') # If all values are nans, drop col

In [4]:
def preprocess_df(X):


    '''

    ### Returns a dictionary object with dummified categorical variables and standardized numerical variables.

    1. Check data dictionary to see which variables are numerical/categorical

    - Get a tentative list of numerical and categorical variables:
        
        - For numerical variables, we first look at columns that have the values N/A or 99=DK. 
          Most variables that are numerical have this sort of 'value' in the data dictionary
          To make sure, we treat anything with less than 10 distinct unique values as categorical.
    
    - For categorical vars, a starting point is all variables that do not have the dtype 'np.number'
    - Final list of column names is stored as 'treat_as_num' and 'treat_as_cat'.
    
    
    '''

    # For storage of column names that are categorical / numerical
    treat_as_num = []
    treat_as_cat = []

    tentative_num = [i for i in df_dict[df_dict['Values'] == 'N/A\n99=DK']['Column Name'].values if i in X.columns.values]
    tentative_cat = X.select_dtypes(exclude=[np.number]).columns.tolist()

    treat_as_cat = treat_as_cat + tentative_cat[:]
    
    ### Columns that are in both the df and data dictionary.
    cols_in_df_and_datadict = [i for i in X.columns.values if i in df_dict['Column Name'].values]

    ### Columns that are NOT in the data dictionary but are found in the df.
    cols_not_in_dict = [i for i in X.columns.values if i not in df_dict['Column Name'].values]
    
    ####
    
    '''
    For columns in tentative_num, if column has less than ten categories, treat it as categorical
    
    '''

    for col in tentative_num:
        
        num_of_categories = len(X[col].value_counts().keys())

        if num_of_categories <= 10:
            print('Change To Categorical : ', col, num_of_categories)
            treat_as_cat.append(col)
        else:
            print('Keep As Numerical : ', col, num_of_categories)
            treat_as_num.append(col)
            
    ####
    
    '''
    For columns not found in the dictionary, use the same crude rule - if number of distinct values <= 10,
    treat as a categorical var else numerical
    
    '''
    
    for col in cols_not_in_dict:
    
        num_of_categories = len(X[col].value_counts().keys())
    
        if col in treat_as_cat:
            pass
        elif num_of_categories >= 10:
            treat_as_num.append(col)
        elif num_of_categories <= 10:
            treat_as_cat.append(col)

    ####
    
    
    '''
    Every column that is in df and data dict but not in treat_as_num or treat_as_cat yet, treat them as categorical.
    
    '''
    
    for col in cols_in_df_and_datadict:
    
        if col in treat_as_num:
            pass

        elif col in treat_as_cat:
            pass

        else:
            treat_as_cat.append(col)

    
    ####
    
    '''
    Now we dummify the categorical variables and standardize the numerical vars
    
    '''

    data_dict = {} 
    
    for i in X.columns:

        if i in treat_as_cat: # For every column, if categorical, convert to one hot encoding/dummy vars
            
            # Replace value of 99 [mostly refers to Don't Know in data] as np.nan
            X[i] = X[i].replace(to_replace=99,value=np.NaN) 

            sub_dummy = pd.get_dummies(X[i],prefix=i,dummy_na=True)

            for j in sub_dummy.columns.values: # For every column in dummified df, add to data_dict
                data_dict[j] = sub_dummy[j].values

        else: # If col is numerical, standardize, fill 
            
            X[i].fillna(X[i].median(),inplace=True)
            stdized_col = (X[i] - X[i].mean() )/ X[i].std()
            data_dict[i] = stdized_col.values
            
            
    return data_dict

In [5]:
X = pd.DataFrame.from_dict(preprocess_df(X))

Keep As Numerical :  DG1 79
Keep As Numerical :  DG8a 13
Keep As Numerical :  DG8b 13
Keep As Numerical :  DG8c 13
Keep As Numerical :  DG9a 12
Keep As Numerical :  DG9b 11
Change To Categorical :  DG9c 8
Keep As Numerical :  DL8 341
Keep As Numerical :  DL11 15
Keep As Numerical :  MT1 13
Keep As Numerical :  MT6C 28
Change To Categorical :  FF7_1 4
Change To Categorical :  FF7_2 4
Change To Categorical :  FF7_3 2
Change To Categorical :  FF7_4 5
Change To Categorical :  FF7_5 4
Change To Categorical :  FF7_6 3
Change To Categorical :  FF7_7 2
Change To Categorical :  FF7_96 2
Change To Categorical :  FF8_1 3
Change To Categorical :  FF8_2 2
Change To Categorical :  FF8_3 2
Change To Categorical :  FF8_4 3
Change To Categorical :  FF8_5 2
Change To Categorical :  FF8_6 2
Change To Categorical :  FF8_7 2
Change To Categorical :  FF8_96 1
Change To Categorical :  MM23 1
Change To Categorical :  IFI18 9
Keep As Numerical :  FB13 25
Keep As Numerical :  FB14 14
Keep As Numerical :  FB15 2

In [6]:
X.shape

(18255, 5643)

In [7]:
# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2, random_state = 0)

### Random Forest

In [8]:
clf = RandomForestClassifier(n_estimators=100,
                             max_features='auto',
                             n_jobs=-1,
                             random_state=1,
                             criterion='gini',
                             oob_score=True,
                             bootstrap=True,
                             )

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


0.8485346480416325


#### RandomForest Parameter Tuning

In [62]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 5000, num = 15)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 100, num = 10)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'max_depth': [2, 7, 12, 18, 23, 28, 34, 39, 44, 50, None], 'min_samples_split': [2, 5, 10], 'bootstrap': [True, False], 'n_estimators': [10, 366, 722, 1079, 1435, 1792, 2148, 2505, 2861, 3217, 3574, 3930, 4287, 4643, 5000], 'min_samples_leaf': [1, 2, 4], 'max_features': ['auto', 'sqrt', 'log2']}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 10, cv = 3, verbose=3, random_state=1, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

# Best model from our random search
rf_best = rf_random.best_estimator_

# For Accuracy
y_pred = rf_best.predict(X_test)
print('Accuracy :', accuracy_score(y_test, y_pred))

# Get probabilities for being female
y_pred_prob = rf_best.predict_proba(X_test)
y_pred_prob = [i[1] for i in y_pred_prob] # Only get probabilities for is_female == 1 as per Kaggle website.
print("AUC - ROC : ", roc_auc_score(y_test,y_pred_prob))

Fitting 3 folds for each of 10 candidates, totalling 30 fits


### Gradient Tree Boosting

In [510]:
clf = GradientBoostingClassifier(n_estimators=5000,
                                 learning_rate=0.5,
                                 max_depth=None, 
                                 random_state=0)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
#clf.score(X_test, y_test)

0.8761983018351137


### Binary Adaboost

In [33]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Create and fit an AdaBoosted decision tree
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                         algorithm="SAMME",
                         n_estimators=1000,
                         learning_rate=1)

bdt.fit(X_train, y_train)
y_pred = bdt.predict(X_test)
y_pred_prob = bdt.predict_proba(X_test)
y_pred_prob = [i[1] for i in y_pred_prob]

print(accuracy_score(y_test,y_pred))
print("AUC - ROC : ", roc_auc_score(y_test,y_pred_prob))

KeyboardInterrupt: 

#### Hyperparameter Tuning for AdaBoost

In [34]:
from sklearn.model_selection import RandomizedSearchCV

# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2',None]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15,30,50, 100]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 7, 10, 20, 30, 50]


# Create the random grid
random_grid = {'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               }

print(random_grid)

{'max_features': ['auto', 'sqrt', 'log2', None], 'min_samples_leaf': [1, 2, 4, 7, 10, 20, 30, 50], 'min_samples_split': [2, 5, 10, 15, 30, 50, 100], 'max_depth': [2, 12, 23, 34, 45, 56, 66, 77, 88, 99, 110, None]}


In [35]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
bdt_cv = DecisionTreeClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
bdt_random = RandomizedSearchCV(estimator = bdt_cv, 
                               param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=3, random_state=1, n_jobs = -1)

# Fit the random search model
bdt_random.fit(X_train, y_train)

print(bdt_random.best_estimator_)
print(bdt_random.best_params_)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 327.9min finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'max_features': ['auto', 'sqrt', 'log2', None], 'min_samples_leaf': [1, 2, 4, 7, 10, 20, 30, 50], 'min_samples_split': [2, 5, 10, 15, 30, 50, 100], 'max_depth': [2, 12, 23, 34, 45, 56, 66, 77, 88, 99, 110, None]},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [None]:
### Now use decision tree with tuned hyperparameters as input to AdaBoost

best_bdt = bdt_random.best_estimator_

# Create and fit an AdaBoosted decision tree
ada = AdaBoostClassifier(best_bdt,
                         algorithm="SAMME",
                         n_estimators=1079,
                         learning_rate=0.8276034482758621,
                         )

ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
print('Accuracy for AdaBoost with Decision Trees : ', accuracy_score(y_test,y_pred))

y_pred_prob = ada.predict_proba(X_test)
y_pred_prob = [i[1] for i in y_pred_prob]

print("AUC - ROC for AdaBoost with Decision Trees : ", roc_auc_score(y_test,y_pred_prob))

In [538]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 5000, num = 15)]

algorithm = ['SAMME','SAMME.R']
# Maximum number of levels in tree
learning_rate = [x for x in np.linspace(0.0001, 1, num = 30)]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'algorithm': algorithm,
               'learning_rate': learning_rate,
               }

print(random_grid)

{'learning_rate': [0.0001, 0.03457931034482759, 0.06905862068965518, 0.10353793103448276, 0.13801724137931035, 0.17249655172413794, 0.2069758620689655, 0.2414551724137931, 0.2759344827586207, 0.3104137931034483, 0.3448931034482759, 0.3793724137931035, 0.41385172413793103, 0.44833103448275863, 0.4828103448275862, 0.5172896551724139, 0.5517689655172414, 0.586248275862069, 0.6207275862068966, 0.6552068965517241, 0.6896862068965518, 0.7241655172413793, 0.758644827586207, 0.7931241379310345, 0.8276034482758621, 0.8620827586206897, 0.8965620689655173, 0.9310413793103449, 0.9655206896551725, 1.0], 'n_estimators': [10, 366, 722, 1079, 1435, 1792, 2148, 2505, 2861, 3217, 3574, 3930, 4287, 4643, 5000], 'algorithm': ['SAMME', 'SAMME.R']}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
ada_cv = AdaBoostClassifier(best_bdt)

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
ada_random = RandomizedSearchCV(estimator = ada_cv, 
                               param_distributions = random_grid, 
                               n_iter = 10, cv = 3, verbose=3, random_state=1, n_jobs = -1)

# Fit the random search model
ada_random.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
ada_random.best_params_

In [554]:
# Create and fit an AdaBoosted decision tree
ada_best = AdaBoostClassifier(best_bdt,
                         algorithm="SAMME",
                         n_estimators=1079,
                         learning_rate=0.8276034482758621)

ada_best.fit(X_train, y_train)
y_pred = ada_best.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.9156395508079979


### Prediction on Test Set

In [None]:
!jupyter nbconvert --to script config_template.ipynb