# LB 02: Baseline

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import cross_val_score

In [2]:
# Raw training table; low_memory=False makes pandas parse the file in one pass
# so column dtypes are inferred from the whole column rather than per chunk.
train = pd.read_csv(filepath_or_buffer='Data/train_loans.csv', low_memory=False)

In [3]:
# Raw test table, parsed with the same options as the training table.
test = pd.read_csv(filepath_or_buffer='Data/test_loans.csv', low_memory=False)

## EDA

In [4]:
# TODO: exploratory data analysis has not been added to this notebook yet.

## Data preparation

In [5]:
# Short placeholder labels for categorical values.
empty_value, unknown_value = 'Empty', 'Unknown'

# Raw category labels (keys) and the placeholder label each one is replaced
# with (values) during data preparation.
value_replace_dict = dict.fromkeys(
    ['Information not provided by applicant in mail, Internet, or telephone application'],
    unknown_value,
)

In [6]:
# Binary encoding of the `action_taken_name` label:
# originated loans are the positive class (1), denied applications the negative (0).
target_dict_encoder = dict([
    ('Loan originated', 1),
    ('Application denied by financial institution', 0),
])

In [7]:
def convert_col_to_dummy_df(df_col, prefix_sep=': '):
    """One-hot encode a single categorical column.

    Parameters
    ----------
    df_col : pandas.Series
        Column to encode. Its ``name`` is used as the prefix of every
        generated indicator column.
    prefix_sep : str, default ': '
        Separator placed between the column name and the category value in
        the generated column names.

    Returns
    -------
    pandas.DataFrame
        One indicator column per distinct non-missing value, named
        ``'<column name><prefix_sep><value>'`` and sharing the index of
        ``df_col``. Missing values get no indicator column of their own
        (``pd.get_dummies`` is called with its default ``dummy_na=False``).
    """
    # TODO: add calculating frequencies of values here.
    # TODO: add formatting of the generated column names here.
    # NOTE(review): some LightGBM releases reject feature names containing JSON
    # special characters such as ':' -- confirm against the installed version
    # and pass a different `prefix_sep` if fitting fails on the column names.
    return pd.get_dummies(df_col, prefix=df_col.name, prefix_sep=prefix_sep)


def prepare_dfs(train, test):
    """Turn the raw train/test loan tables into model-ready feature frames.

    The two tables are stacked so that both receive the same columns, the
    unused columns are dropped, verbose category labels are normalised via
    ``value_replace_dict``, every remaining object-dtype column is one-hot
    encoded with ``convert_col_to_dummy_df``, and the result is split back
    into its train and test parts. Neither input frame is modified.

    Parameters
    ----------
    train : pandas.DataFrame
        Training table; must contain the ``action_taken_name`` label column
        and every column listed in ``deleted_cols`` below.
    test : pandas.DataFrame
        Test table with the same feature columns.

    Returns
    -------
    X_train : pandas.DataFrame
        Encoded features for the rows of ``train``.
    X_test : pandas.DataFrame
        Encoded features for the rows of ``test``. Its index continues after
        the training rows (it starts at ``len(train)``), because the split is
        a slice of the stacked frame.
    y_train : pandas.Series
        Integer target named ``'target'``, encoded with ``target_dict_encoder``.

    Raises
    ------
    KeyError
        If ``action_taken_name`` holds a value (including a missing value)
        that ``target_dict_encoder`` does not know, or if a column that has to
        be dropped is absent.

    Notes
    -----
    Prints one progress line per one-hot encoded column.
    """
    # 1. Convert and extract target variable
    raw_target = train['action_taken_name']
    y_train = raw_target.map(target_dict_encoder)
    unknown_mask = y_train.isna()
    if unknown_mask.any():
        # Fail as loudly as a plain dict lookup would, but name the culprits.
        raise KeyError(
            'Unexpected action_taken_name value(s): %r'
            % raw_target[unknown_mask].unique().tolist()
        )
    y_train = y_train.astype('int64').rename('target')

    # 2. Join dataframes (train rows first, so they can be sliced off again)
    train_size = train.shape[0]
    df = pd.concat([train, test], axis=0, ignore_index=True, sort=False)

    # 3. Drop extra columns
    # Deliberately kept as features: 'applicant_income_000s',
    # 'loan_amount_000s', 'applicant_race_name_1'.
    deleted_cols = [
        'action_taken_name', 'agency_abbr', 'agency_name',
        'applicant_ethnicity_name',
        'applicant_race_name_2', 'applicant_race_name_3', 'applicant_race_name_4',
        'applicant_race_name_5', 'applicant_sex_name', 'as_of_year',
        'census_tract_number', 'co_applicant_ethnicity_name',
        'co_applicant_race_name_1', 'co_applicant_race_name_2',
        'co_applicant_race_name_3', 'co_applicant_race_name_4',
        'co_applicant_race_name_5', 'co_applicant_sex_name', 'county_code',
        'county_name', 'hoepa_status_name', 'lien_status_name',
        'loan_purpose_name', 'loan_type_name', 'msamd', 'msamd_name',
        'owner_occupancy_name', 'preapproval_name', 'property_type_name',
        'respondent_id', 'state_code', 'state_abbr', 'state_name',
        'hud_median_family_income',
        'number_of_1_to_4_family_units', 'number_of_owner_occupied_units',
        'minority_population', 'population', 'rate_spread',
        'tract_to_msamd_income'
    ]
    df = df.drop(deleted_cols, axis=1)

    # Categorical columns are detected once; the replacement step below only
    # swaps strings for strings, so it cannot change which columns qualify.
    object_cols = [col for col in df.columns if df[col].dtype == 'object']

    # 4. Replace some values
    for col in object_cols:
        for repl_key, repl_v in value_replace_dict.items():
            df.loc[df[col] == repl_key, col] = repl_v

    # 5. Encode categorical features
    # Build every dummy frame first and concatenate once: the numeric columns
    # keep their order and the dummy groups follow in original column order,
    # without re-copying the whole frame for each encoded column.
    dummy_dfs = []
    for col_name in object_cols:
        print('Applying categorical encoding to %s...' % col_name)
        dummy_dfs.append(convert_col_to_dummy_df(df[col_name]))
    df = pd.concat([df.drop(object_cols, axis=1)] + dummy_dfs, axis=1)

    # 6. Split data into train and test
    X_train = df.iloc[:train_size]
    X_test = df.iloc[train_size:]

    return X_train, X_test, y_train

In [8]:
# Build the encoded feature frames and the binary target from the raw tables.
X_train, X_test, y_train = prepare_dfs(train=train, test=test)

Applying categorical encoding to applicant_race_name_1...


In [9]:
# Preview the first rows of the encoded training features.
X_train.head(n=5)

Unnamed: 0,applicant_income_000s,loan_amount_000s,applicant_race_name_1: American Indian or Alaska Native,applicant_race_name_1: Asian,applicant_race_name_1: Black or African American,applicant_race_name_1: Native Hawaiian or Other Pacific Islander,applicant_race_name_1: Not applicable,applicant_race_name_1: Unknown,applicant_race_name_1: White
0,297.0,556.0,0,0,0,0,0,0,1
1,49.0,236.0,0,0,0,0,0,0,1
2,107.0,200.0,0,0,0,0,0,0,1
3,48.0,150.0,0,0,0,0,0,0,1
4,85.0,204.0,0,0,0,0,0,0,1


## Creating model

In [10]:
from lightgbm import LGBMClassifier

In [11]:
# Baseline model: only the seed is set explicitly; no other argument is passed.
lgbm_clf = LGBMClassifier(random_state=2018)

In [12]:
# 10-fold cross-validated ROC-AUC of the baseline model, run in a single process.
cv_res = cross_val_score(
    estimator=lgbm_clf,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=10,
    n_jobs=1,
)

In [13]:
# Summarise the per-fold scores as mean and standard deviation.
print(
    "Cross-validation score for LightGBM, ROC-AUC: mean=%f, std=%f"
    % (np.mean(cv_res), np.std(cv_res))
)

Cross-validation score for LightGBM, ROC-AUC: mean=0.690471, std=0.003359


In [14]:
# Refit on the full training set before predicting on the test rows.
lgbm_clf.fit(X=X_train, y=y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective=None, random_state=2018,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

## Submission

In [15]:
# Load the provided baseline submission; it serves as the template whose
# `action_taken_name` column is overwritten with model predictions below.
answers = pd.read_csv(filepath_or_buffer='Data/baseline_submission.csv')
answers.head(n=5)

Unnamed: 0,action_taken_name
0,0.5
1,0.5
2,0.5
3,0.5
4,0.5


In [16]:
# Per-class probabilities for every test row (one column per class).
preds = lgbm_clf.predict_proba(X=X_test)

In [17]:
# Use the second probability column (index 1) as the submitted score.
answers = answers.assign(action_taken_name=preds[:, 1])
answers.head(n=5)

Unnamed: 0,action_taken_name
0,0.88903
1,0.792846
2,0.904819
3,0.9121
4,0.868797


In [18]:
# Save the submission without the DataFrame index column.
file_name = "submission_ВоробейВладислав.csv"
answers.to_csv(path_or_buf=file_name, index=False)