# Women in Data Science - 2018 - Kaggle/Stanford Uni

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

### Preprocessing

Clean and transform the dataset for use with scikit-learn and TensorFlow.

In [None]:
# Load the training set, the test set and the data dictionary describing the columns.
df = pd.read_csv("Data/train.csv", low_memory=False)
df_test = pd.read_csv("Data/test.csv", low_memory=False)
df_dict = pd.read_csv('Data/WiDS data dictionary v2.csv')
test_id = df_test['test_id']  # used as the index of every submission file

Y = df['is_female']  # Label
# NOTE: X still contains the 'is_female' column here; only 'train_id' is removed.
X = df.drop(['train_id'], axis=1)
X = X.dropna(axis=1, how='all')  # If all values are NaN, drop the column
# Treat the code 99 as missing (presumably the "99=DK" entries of the data
# dictionary -- confirm). `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
X = X.replace(to_replace=[99, 99.0], value=[np.nan, np.nan])

In [None]:
def preprocess(X, data_dictionary=None, label='is_female',
               max_null_count=14604, min_non_null_count=400):
    '''
    Return a dict mapping feature name -> 1-D array of model-ready values.

    Categorical columns are expanded into dummy (one-hot) columns; every other
    column is median-imputed and standardized with its own mean and sample
    standard deviation.

    A column is treated as categorical when any of the following holds:

    - its dtype is not numeric;
    - it is absent from the data dictionary and has fewer than 10 distinct
      non-null values;
    - it is listed in the data dictionary and its 'Values' entry is not
      "N/A<newline>99=DK" (the marker used for numeric columns).

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature frame. It is NOT modified.
    data_dictionary : pandas.DataFrame, optional
        Frame with 'Column Name' and 'Values' columns. Defaults to the
        module-level ``df_dict``.
    label : str
        Target column. It is removed up front so that neither the column nor
        dummies derived from it can end up among the features.
    max_null_count, min_non_null_count : int
        A column is discarded when it has more than ``max_null_count`` nulls
        AND fewer than ``min_non_null_count`` non-null values. The defaults
        are absolute row counts, not fractions of the frame.

    Returns
    -------
    dict
        ``{feature_name: numpy.ndarray}`` in column order.
    '''
    if data_dictionary is None:
        data_dictionary = df_dict

    # `drop` returns a new frame, so the caller's data is left untouched.
    X = X.drop(columns=[label], errors='ignore')

    # Discard very sparse columns in one vectorized pass.
    null_counts = X.isnull().sum()
    present_counts = X.notnull().sum()
    too_sparse = [
        name for name in X.columns
        if null_counts[name] > max_null_count
        and present_counts[name] < min_non_null_count
    ]
    X = X.drop(columns=too_sparse)

    # Sets give O(1) membership tests instead of scanning arrays per column.
    dictionary_columns = set(data_dictionary['Column Name'].values)
    numeric_marker = data_dictionary['Values'] == 'N/A\n99=DK'
    numeric_in_dictionary = set(data_dictionary.loc[numeric_marker, 'Column Name'].values)

    treat_as_cat = set(X.select_dtypes(exclude=[np.number]).columns)
    for name in X.columns:
        if name in treat_as_cat:
            continue
        if name in dictionary_columns:
            if name not in numeric_in_dictionary:
                treat_as_cat.add(name)
        elif X[name].nunique() < 10:
            # Undocumented column with few distinct values: treat as a category.
            treat_as_cat.add(name)

    data_dict = {}
    for name in X.columns:
        column = X[name]

        if name in treat_as_cat:
            # Missing values get no dummy column of their own (dummy_na=False).
            dummies = pd.get_dummies(column, prefix=name, dummy_na=False)
            for dummy_name in dummies.columns.values:
                data_dict[dummy_name] = dummies[dummy_name].values
        else:
            # Assign the filled result instead of `fillna(inplace=True)` on a
            # column selection, which is a no-op under pandas Copy-on-Write.
            filled = column.fillna(column.median())
            spread = filled.std()
            if np.isnan(spread) or spread == 0:
                # Constant (or single-row) column: avoid 0/0 -> NaN features.
                spread = 1.0
            data_dict[name] = ((filled - filled.mean()) / spread).values

    return data_dict

In [None]:
# Turn the {feature name: values} mapping returned by preprocess into the training matrix.
X = pd.DataFrame(preprocess(X))

In [None]:
# Display (rows, columns) of the preprocessed training matrix.
X.shape

In [None]:
# Apply the same cleaning steps to the test set. preprocess is called on the
# test frame on its own, so medians, means, standard deviations and dummy
# columns are derived from the test data itself.
df_test = df_test.drop(['test_id'], axis=1)
df_test = df_test.dropna(axis=1, how='all')  # If all values are NaN, drop the column
# `np.nan` instead of `np.NaN`: the capitalised alias was removed in NumPy 2.0.
df_test = df_test.replace(to_replace=[99, 99.0], value=[np.nan, np.nan])
X_test = pd.DataFrame.from_dict(preprocess(df_test))

In [None]:
# Keep only the test features that also exist in the training matrix (test column order is preserved).
X_test = X_test[[name for name in X_test.columns if name in X.columns]]

In [None]:
# Restrict the training matrix to the features shared with the test matrix, in the test matrix's column order.
X = X[[name for name in X_test.columns if name in X.columns]]

In [None]:
# Hold out 30% of the training rows as a validation set; the seed is fixed so the split is reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

### CatBoost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Baseline CatBoost model; AUC is used both as the evaluation metric and as an extra reported metric.
model = CatBoostClassifier(
    iterations=50,
    learning_rate=0.5,
    depth=10,
    custom_metric='AUC',
    eval_metric='AUC',
)

In [None]:
# Train on the NumPy arrays underlying the training split.
model.fit(x_train.values, y_train.values)

In [None]:
# Predicted class labels for the validation split
preds_class = model.predict(x_val)
# Predicted probabilities for each class on the validation split
preds_proba = model.predict_proba(x_val)
# Disabled: raw formula values instead of probabilities
#preds_raw = model.predict(test_data, prediction_type='RawFormulaVal')

In [None]:
# Validation accuracy, then validation ROC AUC computed from the second probability column.
print(accuracy_score(y_val, preds_class))
print(roc_auc_score(y_val, [row[1] for row in preds_proba]))

### Best-scoring model so far

In [None]:
# Train one CatBoost model per (learning rate, depth, iterations) combination.
# Each model reports its validation accuracy / ROC AUC and contributes its
# test-set probabilities (second column) to `results` for later averaging.
results = []

# The split uses the same arguments and a fixed seed for every combination,
# so it is computed once instead of once per model.
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.3, random_state=0)

for learning_rate in [0.3, 0.32, 0.35, 0.38]:

    for depth in [5, 7, 11]:

        for iterations in [100, 300, 500, 1000, 1500, 2000]:

            print('\n', 'Learning Rate : ', learning_rate, ' Max Depth : ', depth, ' Iterations : ', iterations)

            model = CatBoostClassifier(iterations=iterations, learning_rate=learning_rate, depth=depth,
                                       custom_metric='AUC', eval_metric='AUC', logging_level='Silent')

            model.fit(x_train.values, y_train.values)

            # Predicted classes and per-class probabilities on the validation split
            preds_class = model.predict(x_val)
            preds_proba = model.predict_proba(x_val)
            print(accuracy_score(y_val, preds_class))
            print(roc_auc_score(y_val, [row[1] for row in preds_proba]))

            # Keep this model's test-set probabilities for the average
            preds_proba_test = model.predict_proba(X_test)
            results.append([row[1] for row in preds_proba_test])

In [None]:
# One row per model, one column per test row: the column-wise mean is the
# averaged probability for each test row.
avg = pd.DataFrame(results)
avg = avg.mean(axis=0)
# Pass the raw values so test_id is attached by position. Handing the Series
# itself over together with `index=test_id` would re-align it by label and
# yield NaN for any test_id that is not one of 0..n-1.
pd.DataFrame({'is_female': avg.values}, index=test_id).to_csv('catboost_avg.csv', encoding='utf-8')

### More Models

In [None]:
### Avg of various RandomForest models
# Each (max depth, number of trees) combination is trained on the training
# split, scored on the validation split, and its test-set probabilities are
# stored in `results_rf` for later averaging.
results_rf = []

# Same arguments and fixed seed for every combination: split once.
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.3, random_state=0)

for max_depth in [2, 5, 7, 11, None]:

    for n_trees in [100, 500, 1500]:

        print(' Max Depth : ', max_depth, ' Iterations : ', n_trees)

        model = RandomForestClassifier(n_estimators=n_trees,
                                       # 'auto' was removed in scikit-learn 1.3;
                                       # for classifiers it meant 'sqrt'.
                                       max_features='sqrt',
                                       max_depth=max_depth,
                                       n_jobs=-1,
                                       random_state=1,
                                       criterion='gini',
                                       oob_score=True,
                                       bootstrap=True,
                                       )

        model.fit(x_train.values, y_train.values)

        # Predicted classes and per-class probabilities on the validation split
        preds_class = model.predict(x_val)
        preds_proba = model.predict_proba(x_val)
        print(accuracy_score(y_val, preds_class))
        print(roc_auc_score(y_val, [row[1] for row in preds_proba]))

        # Keep this model's test-set probabilities for the average
        preds_proba_test = model.predict_proba(X_test)
        results_rf.append([row[1] for row in preds_proba_test])

In [None]:
# One row per forest, one column per test row: the column-wise mean is the
# averaged probability for each test row.
avg = pd.DataFrame(results_rf)
avg = avg.mean(axis=0)
# Pass the raw values so test_id is attached by position. Handing the Series
# itself over together with `index=test_id` would re-align it by label and
# yield NaN for any test_id that is not one of 0..n-1.
pd.DataFrame({'is_female': avg.values}, index=test_id).to_csv('rf_avg.csv', encoding='utf-8')

### Voting Classifiers

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier

# Base classifiers for the first soft-voting ensemble
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = GradientBoostingClassifier(n_estimators=1500, learning_rate=0.32, loss='exponential', max_depth=11, random_state=0)
clf3 = SVC(kernel='rbf', probability=True, random_state=0, verbose=True)
clf4 = RandomForestClassifier(n_estimators=250)
clf5 = AdaBoostClassifier(n_estimators=1100, learning_rate=0.82)

# Soft voting averages the predicted class probabilities of the base models.
eclf = VotingClassifier(estimators=[('dt', clf1), ('gbc', clf2), ('svc', clf3), ('rf', clf4), ('ada', clf5)], voting='soft')

# VotingClassifier.fit trains its own clones of the base estimators, and only
# `eclf` is used for prediction afterwards, so fitting clf1..clf5 separately
# as well would train every (expensive) model twice for nothing.
eclf = eclf.fit(X, Y)

In [None]:
# Second probability column of the ensemble for every test row, written as a submission indexed by test_id.
pred_prob_test = [row[1] for row in eclf.predict_proba(X_test)]
submission = pd.DataFrame(pred_prob_test, columns=['is_female'], index=test_id)
submission.to_csv('Voting_Classifier_1.csv', encoding='utf-8')

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier

# Base classifiers for the second soft-voting ensemble (unbounded decision tree)
clf1 = DecisionTreeClassifier(max_depth=None,)
clf2 = GradientBoostingClassifier(n_estimators=1500, learning_rate=0.32, loss='exponential', max_depth=11, random_state=0)
clf3 = SVC(kernel='rbf', probability=True, random_state=0, verbose=True)
clf4 = RandomForestClassifier(n_estimators=250)
clf5 = AdaBoostClassifier(n_estimators=1100, learning_rate=0.82)

# Weighted soft voting: the gradient-boosting model counts twice.
eclf = VotingClassifier(estimators=[('dt', clf1), ('gbc', clf2), ('svc', clf3), ('rf', clf4), ('ada', clf5)],
                        voting='soft', weights=[1, 2, 1, 1, 1], n_jobs=-1)

# VotingClassifier.fit trains its own clones of the base estimators, and only
# `eclf` is used for prediction, so fitting clf1..clf5 separately as well
# would train every (expensive) model twice for nothing.
eclf = eclf.fit(X, Y)

# Second probability column for every test row, written as a submission indexed by test_id
pred_prob_test = eclf.predict_proba(X_test)
pred_prob_test = [row[1] for row in pred_prob_test]
pd.DataFrame(pred_prob_test, columns=['is_female'], index=test_id).to_csv('Voting_Classifier_2.csv', encoding='utf-8')

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier

# Base classifiers for the third soft-voting ensemble (two CatBoost models added)
clf1 = CatBoostClassifier(iterations=300, max_depth=5, learning_rate=0.38, custom_metric='AUC', eval_metric='AUC')
clf2 = GradientBoostingClassifier(n_estimators=1500, learning_rate=0.32, loss='exponential', max_depth=11, random_state=0)
clf3 = SVC(kernel='rbf', probability=True, random_state=0, verbose=True)
clf4 = RandomForestClassifier(n_estimators=250)
clf5 = AdaBoostClassifier(n_estimators=1100, learning_rate=0.82)
clf6 = CatBoostClassifier(iterations=100, max_depth=7, learning_rate=0.38, custom_metric='AUC', eval_metric='AUC')

# Weighted soft voting: both CatBoost models and gradient boosting count twice.
eclf = VotingClassifier(estimators=[('cat', clf1), ('gbc', clf2), ('svc', clf3), ('rf', clf4), ('ada', clf5), ('cat2', clf6)],
                        voting='soft', weights=[2, 2, 1, 1, 1, 2], n_jobs=-1)

# VotingClassifier.fit trains its own clones of the base estimators, and only
# `eclf` is used for prediction, so fitting clf1..clf6 separately as well
# would train every (expensive) model twice for nothing.
eclf = eclf.fit(X, Y)

# Second probability column for every test row, written as a submission indexed by test_id
pred_prob_test = eclf.predict_proba(X_test)
pred_prob_test = [row[1] for row in pred_prob_test]
pd.DataFrame(pred_prob_test, columns=['is_female'], index=test_id).to_csv('Voting_Classifier_3.csv', encoding='utf-8')

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier

# Eleven base classifiers, all seeded with random_state=1 where the estimator accepts a seed
clf1 = CatBoostClassifier(iterations=300, max_depth=5, learning_rate=0.38,
                          custom_metric='AUC', eval_metric='AUC', random_state=1)
clf2 = GradientBoostingClassifier(n_estimators=1500, learning_rate=0.32,
                                  loss='exponential', max_depth=11, random_state=1)
clf3 = SVC(kernel='rbf', probability=True, random_state=1, verbose=True)
clf4 = RandomForestClassifier(n_estimators=250, random_state=1)
clf5 = AdaBoostClassifier(n_estimators=1100, learning_rate=0.82, random_state=1)
clf6 = CatBoostClassifier(iterations=100, max_depth=7, learning_rate=0.35,
                          custom_metric='AUC', eval_metric='AUC', random_state=1)
clf7 = CatBoostClassifier(iterations=500, max_depth=5, learning_rate=0.3,
                          custom_metric='AUC', eval_metric='AUC', random_state=1)
clf8 = CatBoostClassifier(iterations=1000, max_depth=3, learning_rate=0.32,
                          custom_metric='AUC', eval_metric='AUC', random_state=1)
clf9 = GradientBoostingClassifier(n_estimators=500, learning_rate=0.3,
                                  loss='exponential', max_depth=5, random_state=1)
clf10 = GaussianNB()
clf11 = LogisticRegression(penalty='l2', tol=0.1, random_state=1)

# The weighted soft-voting ensemble is only constructed in this cell; it is not fitted here.
eclf = VotingClassifier(
    estimators=[
        ('cat', clf1), ('gbc', clf2), ('svc', clf3),
        ('rf', clf4), ('ada', clf5), ('cat2', clf6),
        ('cat3', clf7), ('cat4', clf8), ('gbc2', clf9),
        ('gnb', clf10), ('lgr', clf11),
    ],
    voting='soft',
    weights=[1.35, 1.35, 1.25, 1, 1, 1.15, 1.15, 1.15, 1.10, 1, 1.15],
    n_jobs=-1,
)

# Fit every base model on the full training data; fit() returns the estimator itself.
clf1 = clf1.fit(X, Y); clf2 = clf2.fit(X, Y); clf3 = clf3.fit(X, Y)
clf4 = clf4.fit(X, Y); clf5 = clf5.fit(X, Y); clf6 = clf6.fit(X, Y)
clf7 = clf7.fit(X, Y); clf8 = clf8.fit(X, Y); clf9 = clf9.fit(X, Y)
clf10 = clf10.fit(X, Y); clf11 = clf11.fit(X, Y)

In [None]:
# Collect, per fitted model, the second probability column for every test row.
avg_pred = []
for M in (clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clf10, clf11):
    pred_prob_test = [row[1] for row in M.predict_proba(X_test)]
    avg_pred.append(pred_prob_test)

In [None]:
# One row per model, one column per test row: the column-wise mean is the
# unweighted average probability of the eleven models for each test row.
avg_pred = pd.DataFrame(avg_pred)
avg_pred = avg_pred.mean(axis=0)
# Write `avg_pred` (this cell's result), not the unrelated `avg` left over
# from an earlier averaging cell. The raw values are passed so that test_id
# is attached by position rather than by label alignment.
pd.DataFrame({'is_female': avg_pred.values}, index=test_id).to_csv('11_avg_models.csv', encoding='utf-8')

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier

# Base classifiers for the fourth soft-voting ensemble (eleven models)
clf1 = CatBoostClassifier(iterations=300, max_depth=5, learning_rate=0.38, custom_metric='AUC', eval_metric='AUC', random_state=1)
clf2 = GradientBoostingClassifier(n_estimators=1500, learning_rate=0.32, loss='exponential', max_depth=11, random_state=1)
clf3 = SVC(kernel='rbf', probability=True, random_state=1, verbose=True)
clf4 = RandomForestClassifier(n_estimators=250, random_state=1,)
clf5 = AdaBoostClassifier(n_estimators=1100, learning_rate=0.82, random_state=1)
clf6 = CatBoostClassifier(iterations=100, max_depth=7, learning_rate=0.35, custom_metric='AUC', eval_metric='AUC', random_state=1,)
clf7 = CatBoostClassifier(iterations=500, max_depth=5, learning_rate=0.3, custom_metric='AUC', eval_metric='AUC', random_state=1,)
clf8 = CatBoostClassifier(iterations=1000, max_depth=3, learning_rate=0.32, custom_metric='AUC', eval_metric='AUC', random_state=1,)
clf9 = GradientBoostingClassifier(n_estimators=500, learning_rate=0.3, loss='exponential', max_depth=5, random_state=1)
clf10 = GaussianNB()
clf11 = LogisticRegression(penalty='l2', tol=0.1, random_state=1)

# Weighted soft voting over all eleven models
eclf = VotingClassifier(estimators=[('cat', clf1), ('gbc', clf2), ('svc', clf3),
                                    ('rf', clf4), ('ada', clf5), ('cat2', clf6),
                                    ('cat3', clf7), ('cat4', clf8), ('gbc2', clf9), ('gnb', clf10), ('lgr', clf11),],
                        voting='soft', weights=[1.35, 1.35, 1.25, 1, 1, 1.15, 1.15, 1.15, 1.10, 1, 1.15], n_jobs=-1)

# VotingClassifier.fit trains its own clones of the base estimators, and only
# `eclf` is used for prediction, so fitting clf1..clf11 separately as well
# would train every (expensive) model twice for nothing.
eclf = eclf.fit(X, Y)

# Second probability column for every test row, written as a submission indexed by test_id
pred_prob_test = eclf.predict_proba(X_test)
pred_prob_test = [row[1] for row in pred_prob_test]
pd.DataFrame(pred_prob_test, columns=['is_female'], index=test_id).to_csv('Voting_Classifier_4.csv', encoding='utf-8')

In [None]:
# Per-class probabilities of the current ensemble for the test set.
pred_prob = eclf.predict_proba(X_test)

In [None]:
# Keep only the second probability column of each row.
pred_prob = [row[1] for row in pred_prob]

In [None]:
# Write the submission indexed by test_id. NOTE: this overwrites
# 'Voting_Classifier_4.csv', the same file name the previous ensemble cell writes.
pd.DataFrame(pred_prob,columns=['is_female'],index=test_id).to_csv('Voting_Classifier_4.csv',encoding='utf-8')