In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

In [3]:
# Read the labelled training users and the unlabelled test users, then stack
# them (train rows first) so the feature engineering is applied to both at once.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train_users_2.csv", "../data/test_users.csv")
)
df_all = pd.concat(objs=[df_train, df_test])

In [4]:
# Keep the training labels and the test ids aside before those columns are
# removed from the feature table.
df_target = df_train["country_destination"]
df_id = df_test["id"]

# df_all was built as df_train followed by df_test, so the first n_train rows
# are the training rows. Derive the split point instead of hard-coding it.
n_train = len(df_train)

df_all = df_all.drop(columns=["country_destination", "id"])

# Parse the two date columns. No errors='ignore' here: on a parse failure it
# would hand back the unparsed column and the .dt accessors below would then
# fail with a far less helpful message. The timestamp is cast to text first so
# that the strptime-style format does not depend on the dtype read_csv chose.
df_all["date_account_created"] = pd.to_datetime(
    df_all["date_account_created"], format="%Y-%m-%d"
)
df_all["timestamp_first_active"] = pd.to_datetime(
    df_all["timestamp_first_active"].astype(str), format="%Y%m%d%H%M%S"
)

# Calendar parts of the account-creation date.
df_all["ac_year"] = df_all["date_account_created"].dt.year
df_all["ac_month"] = df_all["date_account_created"].dt.month
df_all["ac_day"] = df_all["date_account_created"].dt.day

# Calendar and clock parts of the first-activity timestamp.
df_all["fa_year"] = df_all["timestamp_first_active"].dt.year
df_all["fa_month"] = df_all["timestamp_first_active"].dt.month
df_all["fa_day"] = df_all["timestamp_first_active"].dt.day
df_all["fa_hour"] = df_all["timestamp_first_active"].dt.hour
df_all["fa_minute"] = df_all["timestamp_first_active"].dt.minute
df_all["fa_second"] = df_all["timestamp_first_active"].dt.second

# The raw date columns are replaced by the parts above; date_first_booking and
# age are left out of the feature set altogether.
df_all = df_all.drop(
    columns=[
        "date_account_created",
        "timestamp_first_active",
        "date_first_booking",
        "age",
    ]
)

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which is a chained in-place write.
df_all["first_affiliate_tracked"] = df_all["first_affiliate_tracked"].fillna("untracked")

# One-hot encode the remaining non-numeric columns.
df_all = pd.get_dummies(df_all)

# Split the encoded table back into its training and test parts.
df_train = df_all.iloc[:n_train]
df_test = df_all.iloc[n_train:]

In [5]:
from xgboost import XGBClassifier
from xgboost import plot_importance

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [6]:
# Collapse the destination labels into two classes: "NDF" stays "NDF",
# every other label becomes "DF".
y_bin = df_target.where(df_target == "NDF", "DF")

In [7]:
# Stage 1 model (NDF vs. DF): library defaults, trained on 4 parallel jobs.
xgb = XGBClassifier(n_jobs=4)

In [8]:
# Fit the stage 1 model on every training row against the two-class target.
xgb.fit(X=df_train, y=y_bin)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=4, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [9]:
# Predict the two-class label for the same rows the model was fitted on, so
# these are training-set predictions rather than a held-out evaluation.
y_bin_pre = xgb.predict(df_train)

In [10]:
# Per-class precision / recall / F1 of the stage 1 model. The predictions were
# made on the fitting data, so the figures are optimistic.
print(classification_report(y_true=y_bin, y_pred=y_bin_pre))

             precision    recall  f1-score   support

         DF       0.63      0.46      0.53     88908
        NDF       0.68      0.81      0.74    124543

avg / total       0.66      0.66      0.65    213451



In [11]:
# Boolean mask (NumPy array) of the training rows whose two-class label is "DF".
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
idx_DF = np.isin(y_bin, "DF")

In [12]:
# Restrict features and original destination labels to the "DF" training rows.
df_train_DF = df_train.loc[idx_DF]
y_DF = df_target.loc[idx_DF]

### DF에서 나라별로 나누기 (splitting the DF class by destination country)

In [24]:
# Stage 2 model (destination country within the DF rows): library defaults,
# trained on 4 parallel jobs. Re-running this cell yields an unfitted model,
# so the fit cell must be run again afterwards.
xgb1 = XGBClassifier(n_jobs=4)

In [15]:
# Fit the stage 2 model on the "DF" training rows against their destinations.
xgb1.fit(X=df_train_DF, y=y_DF)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=4, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [16]:
# Predict destinations for the same "DF" rows the stage 2 model was fitted on
# (training-set predictions, not a held-out evaluation).
y_DF_pre = xgb1.predict(df_train_DF)

In [17]:
# How often each destination is predicted by the stage 2 model.
pd.Series(data=y_DF_pre).value_counts()

US       88902
IT           3
FR           2
other        1
dtype: int64

In [18]:
# Per-destination precision / recall / F1 of the stage 2 model, again measured
# on the rows it was fitted on.
print(classification_report(y_true=y_DF, y_pred=y_DF_pre))

             precision    recall  f1-score   support

         AU       0.00      0.00      0.00       539
         CA       0.00      0.00      0.00      1428
         DE       0.00      0.00      0.00      1061
         ES       0.00      0.00      0.00      2249
         FR       1.00      0.00      0.00      5023
         GB       0.00      0.00      0.00      2324
         IT       1.00      0.00      0.00      2835
         NL       0.00      0.00      0.00       762
         PT       0.00      0.00      0.00       217
         US       0.70      1.00      0.82     62376
      other       0.00      0.00      0.00     10094

avg / total       0.58      0.70      0.58     88908



  'precision', 'predicted', average, warn_for)


### Predict on the test set

In [30]:
# Stage 1 on the test features: label every test row as "NDF" or "DF".
y_submit_bin = xgb.predict(df_test)

In [31]:
# Number of test rows predicted as each of the two classes.
pd.Series(data=y_submit_bin).value_counts()

NDF    48980
DF     13116
dtype: int64

In [32]:
# Boolean masks over the test rows, one per predicted stage 1 class.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
# NOTE: idx_DF overwrites the training-row mask of the same name created
# earlier; re-run that earlier cell before reusing the mask on df_train.
idx_NDF = np.isin(y_submit_bin, "NDF")
idx_DF = np.isin(y_submit_bin, "DF")

In [34]:
# Stage 2 on the test rows that stage 1 labelled "DF": predict a destination
# for each and keep the test-row index so the results can be re-ordered later.
# xgb1 must already be fitted; a freshly re-created xgb1 raises
# "need to call fit beforehand".
df_target_DF = pd.DataFrame(
    data=xgb1.predict(df_test.loc[idx_DF]),
    index=df_test.index[idx_DF],
)

XGBoostError: need to call fit beforehand

In [35]:
# Test rows that stage 1 labelled "NDF" keep that label as the final
# prediction, indexed by their test-row index.
df_target_NDF = pd.DataFrame(
    data=y_submit_bin[idx_NDF],
    index=df_test.index[idx_NDF],
)

In [37]:
# Stack the stage 2 destinations and the "NDF" rows into one prediction table
# (rows are still grouped by class at this point).
df_submit = pd.concat(objs=[df_target_DF, df_target_NDF], axis=0)

In [40]:
# Put the predictions back into test-row order. Rebinding the sorted result
# avoids inplace=True, which pandas discourages and which brings no benefit.
df_submit = df_submit.sort_index()

In [47]:
# Distribution of the final predicted labels (the single column is labelled 0).
df_submit.loc[:, 0].value_counts()

NDF      48980
US       13115
other        1
Name: 0, dtype: int64