In [15]:
from __future__ import division, print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from zodbpickle import pickle
from IPython.display import display
import time
pd.set_option('display.max_columns', 100)

start_time = time.time()

# Data folder
data_folder = '../data/'
# Load data
data_train = pd.DataFrame.from_csv(data_folder + 'data_train_all.csv')
data_test = pd.DataFrame.from_csv(data_folder + 'data_test_all.csv')

# Remove plus_date == 0 && plus_date > 7
from preliminary_proc import drop_plus_date_rows

## !!! Caution vars values changed !!!
data_train = drop_plus_date_rows(data_train)
data_test = drop_plus_date_rows(data_test)

# Reset shuffled indices
data_train.reset_index(drop=True, inplace=True)
data_test.reset_index(drop=True, inplace=True)

# Get feature data/label data and preserve test set user_ids
feature_train, feature_test = data_train.drop('isinvested', axis=1), data_test.drop('isinvested', axis=1)
label_train, label_test = data_train.isinvested, data_test.isinvested
user_id_test = data_test.user_id

# Join back for data preprocessing
feature_all_df = feature_train.append(feature_test, ignore_index=True)

# Determine features to use and drop the rest
from preliminary_proc import drop_columns

feature_all_dropped = drop_columns(feature_all_df)

# Feature engineering
from preliminary_proc import speedy_process
feature_all_proc = speedy_process(feature_all_dropped, cat_colnames=['sex_id', 'client_type_id', 
                                                        'isrecharged', 'rechargestatus', 'isinvited'])

feature_train = feature_all_proc.iloc[:len(feature_train), :]
feature_test = feature_all_proc.iloc[len(feature_train):, :]

# Convert labels
label_dict = {'未投资': 0, '已投资': 1}

label_train.replace(label_dict, inplace=True)
label_test.replace(label_dict, inplace=True)

# Training / validation / test split
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(feature_train, label_train, test_size=0.30, random_state=13)

# View and save feature importances
from sklearn.tree import DecisionTreeClassifier

clf_dt = DecisionTreeClassifier(random_state=13)
clf_dt.fit(X_train, y_train)

sorted_importance = sorted(list(zip(X_train.columns, clf_dt.feature_importances_)), key=lambda x: x[1], reverse=True)

# for x in sorted_importance:
#     print(x)

feature_importance_df = pd.DataFrame(sorted_importance)
feature_importance_df.columns = ['feature', 'rating']
feature_importance_df.set_index(['feature'])

feature_importance_df.loc[:, 'rating'] = feature_importance_df.loc[:, 'rating'].apply(lambda x: str(round(100 * x, 2)) + '%')
display(feature_importance_df)
feature_importance_df.to_csv('feature_importances.csv')

# Build model
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=512, min_samples_leaf=25, max_features='sqrt', n_jobs=-1, oob_score=True, random_state=13)
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict_proba(feature_test)

# Report
from reporting import eval_ks, draw_decile_chart

print("KS: {}".format(eval_ks(label_test, y_pred[:, 1])))

from sklearn.metrics import roc_auc_score

print("ROC: {}".format(roc_auc_score(label_test, y_pred[:, 1])))

print("Training and predicting took {} secs.".format(round(time.time() - start_time, 2)))

from reporting import BinDecile

xl_df = pd.DataFrame({'user_id': user_id_test, 'y_hat': y_pred[:, 1], 'y': label_test})
# display(xl_df.head())

bin_obj = BinDecile()

xl_df_binned, bins = bin_obj.bin_describe_new(xl_df, 'y_hat', binset=10, df_new_bin_name='y_hat_bin')
# display(xl_df_binned.head())

good_rate_table = bin_obj.bin_apply(xl_df_binned, 'y', df_new_bin_name='y_hat_bin')
# print(type(good_rate_table))
# good_rate_table.loc[:, 'y', 'mean'] = good_rate_table['y']['mean'].apply(lambda x: str(round(100 * x, 2)) + '%')
# display(good_rate_table)

# # print(good_rate_table['y']['mean'])
# # good_rate_table.loc[:, 'mean'] = good_rate_table.loc[:, 'mean'].apply(lambda x: str(round(100 * x, 2)) + '%')
# good_rate_table.to_csv('good_rate_table.csv')

# pivot_table = feature_test.copy()
# pivot_table['label'] = label_test
# pivot_table['y_hat'] = y_pred[:, 1]

# pivot_table.to_csv('pivot_table.csv')

# # My old chart
# # decile_df = pd.DataFrame({'Label': label_test, 'Prediction': y_pred[:, 1]})
# # draw_decile_chart(decile_df, 'Prediction', ['Label', 'Prediction'], bins=10)


Unnamed: 0,feature,rating
0,staytime,27.98%
1,staynum,23.1%
2,user_age,17.18%
3,client_type_id_2,15.85%
4,lastday_invite,6.17%
5,lastamt_invite,4.35%
6,sex_id_2,0.99%
7,isinvited_1,0.97%
8,isrecharged_1,0.86%
9,sex_id_1,0.62%


is deprecated and will be removed in a future version
  count = data.groupby(prediction)[label].agg({'bad': np.count_nonzero, 'obs': np.size})
is deprecated and will be removed in a future version
  count = data.groupby(prediction, sort=False)[label].agg({'bad': np.count_nonzero, 'obs': np.size})


KS: (0.65053961160480167, 0.65726312614027127)
ROC: 0.9225526669674886
Training and predicting took 29.91 secs.


In [43]:
idx = pd.IndexSlice
good_rate_table_copy = good_rate_table.copy()
idx = pd.IndexSlice
good_rate_table_copy = good_rate_table.copy()

good_rate_table_copy.loc[:, idx[:, 'mean']].values.reshape(0, -1).shape
# np.array([str(round(x, 2)) + '%' + for x in good_rate_table_copy.loc[:, idx[:, 'mean']].values.reshape(1, -1)]).reshape(10, 1)
# good_rate_table_copy.loc[:, idx[:, 'mean']] = 




# good_rate_table_copy.loc[:, idx[:, 'mean']] = good_rate_table_copy.loc[:, idx[:, 'mean']].apply(lambda x:  str(round(100 * x, 2)) + '%')

ValueError: cannot reshape array of size 10 into shape (0,newaxis)

In [33]:
good_rate_table_copy

Unnamed: 0_level_0,y,y,y,y
Unnamed: 0_level_1,count,mean,min,max
y_hat_bin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,11590,0.317601,0,1
2,11589,0.068858,0,1
3,11589,0.044525,0,1
4,11589,0.025283,0,1
5,11589,0.012512,0,1
6,11589,0.001294,0,1
7,11589,0.001294,0,1
8,11589,0.0,0,0
9,11589,0.0,0,0
10,11590,0.0,0,0


In [14]:
good_rate_table

Unnamed: 0_level_0,y,y,y,y
Unnamed: 0_level_1,count,mean,min,max
y_hat_bin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,11590,0,0,1
2,11589,1,0,1
3,11589,2,0,1
4,11589,3,0,1
5,11589,4,0,1
6,11589,5,0,1
7,11589,6,0,1
8,11589,7,0,0
9,11589,8,0,0
10,11590,9,0,0
