In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [3]:
age_gender_bkts = pd.read_csv("data/age_gender_bkts.csv")
countries = pd.read_csv("data/countries.csv")
sessions = pd.read_csv("data/sessions.csv")
test_users = pd.read_csv("data/test_users.csv")
train_users_2 = pd.read_csv("data/train_users_2.csv")
sample_submission_NDF = pd.read_csv("data/sample_submission_NDF.csv")

In [4]:
import xgboost
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
import lightgbm as lgb

In [5]:
def pre_age_set_data():
    """Combine the train and test user tables and derive date-based features.

    Reads the notebook-level ``train_users_2`` and ``test_users`` frames; neither
    is modified because ``pd.concat`` builds a new frame.

    Returns
    -------
    check : pandas.DataFrame
        Train and test rows stacked with a fresh index, with
        ``first_affiliate_tracked`` NaNs replaced by ``"untracked"``, both date
        columns parsed to datetimes, and the added columns ``lag_days``,
        ``lag_seconds`` and ``faithless_sign``.
    pre_age : pandas.DataFrame
        ``check`` without ``date_first_booking`` and with the two datetime
        columns replaced by their year / month / day components.
    """
    check = pd.concat([train_users_2, test_users], ignore_index=True)

    check["first_affiliate_tracked"] = check["first_affiliate_tracked"].fillna("untracked")

    check["date_account_created"] = pd.to_datetime(check["date_account_created"], format="%Y-%m-%d")
    check["timestamp_first_active"] = pd.to_datetime(check["timestamp_first_active"], format="%Y%m%d%H%M%S")

    # Gap between first activity and account creation. The timedelta accessor
    # is vectorized, so no per-row Python lambda is needed.
    s_lag = check["timestamp_first_active"] - check["date_account_created"]
    check["lag_days"] = -s_lag.dt.days      # sign flipped, as in the original feature
    check["lag_seconds"] = s_lag.dt.seconds

    # A row counts as "faithful" (0) when its age is below 120 and its gender is
    # not the '-unknown-' placeholder; everything else, including a missing age
    # (NaN < 120 is False), is flagged with 1.
    s_all_check = (check["age"] < 120) & (check["gender"] != "-unknown-")
    check["faithless_sign"] = (~s_all_check).astype(int)

    pre_age = check.drop("date_first_booking", axis=1)

    # Split each datetime column into <column>_y / _m / _d, keeping the original
    # column order (account-created parts first, then first-active parts).
    date_columns = ("date_account_created", "timestamp_first_active")
    for column in date_columns:
        pre_age[column + "_y"] = pre_age[column].dt.year
        pre_age[column + "_m"] = pre_age[column].dt.month
        pre_age[column + "_d"] = pre_age[column].dt.day

    pre_age = pre_age.drop(list(date_columns), axis=1)

    return check, pre_age

check, pre_age = pre_age_set_data()

In [7]:
def pre_age_predict_data():
    """Split the one-hot encoded user table by whether ``age`` is known.

    Works on the notebook-level ``pre_age`` frame.

    Side effect: missing values in ``pre_age['age']`` are overwritten with -1
    in place, so the notebook-level frame keeps that sentinel afterwards.

    Returns a 6-tuple, in this order:
        pre_age_mission_test       rows with missing age (no ``index`` column)
        pre_age_train_test         rows with known age (no ``index`` column)
        pre_age_mission            rows with missing age, old index kept as ``index``
        pre_age_train              rows with known age, old index kept as ``index``
        pre_age_mission_test_drop  feature columns only, missing-age rows
        pre_age_train_test_drop    feature columns only, known-age rows
    """
    pre_age['age'] = pre_age['age'].fillna(-1)

    feature_columns = [
        'affiliate_channel', 'affiliate_provider',
        'first_affiliate_tracked', 'first_browser', 'first_device_type',
        'language', 'signup_app', 'signup_flow',
        'signup_method', 'date_account_created_y', 'date_account_created_m',
        'date_account_created_d', 'timestamp_first_active_y',
        'timestamp_first_active_m', 'timestamp_first_active_d',
    ]
    non_feature_columns = ['id', 'age', 'country_destination']

    # Columns carried alongside the features but never fed to a model.
    carried = pre_age.filter(items=['age', 'country_destination', 'id'])
    encoded = pd.get_dummies(pre_age.filter(items=feature_columns))

    combined = pd.concat([encoded, carried], axis=1)
    # Turn the -1 sentinel back into NaN so the split below can use isnull().
    combined["age"] = combined["age"].replace(-1, np.nan)

    age_is_missing = combined["age"].isnull()
    pre_age_mission = combined[age_is_missing].reset_index()
    pre_age_train = combined[~age_is_missing].reset_index()

    pre_age_mission_test = pre_age_mission.drop("index", axis=1)
    pre_age_train_test = pre_age_train.drop("index", axis=1)

    pre_age_mission_test_drop = pre_age_mission_test.drop(non_feature_columns, axis=1)
    pre_age_train_test_drop = pre_age_train_test.drop(non_feature_columns, axis=1)

    return (pre_age_mission_test, pre_age_train_test, pre_age_mission, pre_age_train,
            pre_age_mission_test_drop, pre_age_train_test_drop)

(pre_age_mission_test, pre_age_train_test, pre_age_mission, pre_age_train,
 pre_age_mission_test_drop, pre_age_train_test_drop) = pre_age_predict_data()

In [8]:
def pre_age_predict_data_cat():
    """Bucket the known ages in ``pre_age_train`` into five fixed age bands.

    The band edges are (0, 15], (15, 25], (25, 35], (35, 60] and (60, 9999].
    The labels are Korean age-group names (roughly: minor, youth, middle-aged,
    mature, elderly).

    Returns
    -------
    pandas.DataFrame
        A single column with the band of every row of ``pre_age_train``.
    """
    age_edges = [0, 15, 25, 35, 60, 9999]
    band_names = ["미성년자", "청년", "중년", "장년", "노년"]

    banded = pd.cut(pre_age_train['age'], bins=age_edges, labels=band_names)
    return pd.DataFrame(banded)

cats = pre_age_predict_data_cat()

In [9]:
# Alternative age target: five quantile-based bins over the known ages,
# named age1 (lowest) to age5 (highest).
labels = [
    "age1",
    "age2",
    "age3",
    "age4",
    "age5",
]
cats_q = pd.qcut(pre_age_train['age'], q=5, labels=labels)

In [10]:
# Number of users per quantile bin. Series.value_counts() replaces the
# top-level pd.value_counts(), which recent pandas versions deprecate.
cats_q.value_counts()

age1    36477
age5    31377
age3    31139
age2    30586
age4    29102
Name: age, dtype: int64

---

In [15]:
def pre_gen_predict_data():
    """Split the one-hot encoded user table by whether ``gender`` is usable.

    Works on the notebook-level ``pre_age`` frame. Gender values '-unknown-'
    and 'OTHER' are treated as missing.

    Returns a 6-tuple, in this order:
        pre_gen_mission_test       rows with missing gender (no ``index`` column)
        pre_gen_train_test         rows with known gender (no ``index`` column)
        pre_gen_mission            rows with missing gender, old index kept as ``index``
        pre_gen_train              rows with known gender, old index kept as ``index``
        pre_gen_mission_test_drop  feature columns only, missing-gender rows
        pre_gen_train_test_drop    feature columns only, known-gender rows
    """
    feature_columns = [
        'affiliate_channel', 'affiliate_provider',
        'first_affiliate_tracked', 'first_browser', 'first_device_type',
        'language', 'signup_app', 'signup_flow',
        'signup_method', 'date_account_created_y', 'date_account_created_m',
        'date_account_created_d', 'timestamp_first_active_y',
        'timestamp_first_active_m', 'timestamp_first_active_d',
    ]
    non_feature_columns = ['id', 'age', 'country_destination', "gender"]

    # Columns carried alongside the features but never fed to a model.
    carried = pre_age.filter(items=['age', 'country_destination', 'id', 'gender'])
    encoded = pd.get_dummies(pre_age.filter(items=feature_columns))

    combined = pd.concat([encoded, carried], axis=1)
    combined["gender"] = combined["gender"].replace(['-unknown-', 'OTHER'], np.nan)

    gender_is_missing = combined["gender"].isnull()
    pre_gen_mission = combined[gender_is_missing].reset_index()
    pre_gen_train = combined[~gender_is_missing].reset_index()

    pre_gen_mission_test = pre_gen_mission.drop("index", axis=1)
    pre_gen_train_test = pre_gen_train.drop("index", axis=1)

    pre_gen_mission_test_drop = pre_gen_mission_test.drop(non_feature_columns, axis=1)
    pre_gen_train_test_drop = pre_gen_train_test.drop(non_feature_columns, axis=1)

    return (pre_gen_mission_test, pre_gen_train_test, pre_gen_mission, pre_gen_train,
            pre_gen_mission_test_drop, pre_gen_train_test_drop)

(pre_gen_mission_test, pre_gen_train_test, pre_gen_mission, pre_gen_train,
 pre_gen_mission_test_drop, pre_gen_train_test_drop) = pre_gen_predict_data()

In [18]:
def predict_gen_LightGBM():
    """Fit a LightGBM classifier on known genders and predict the missing ones.

    Trains on ``pre_gen_train_test_drop`` against ``pre_gen_train_test["gender"]``
    and prints a classification report for those same training rows (an
    in-sample score).

    Returns
    -------
    pandas.DataFrame
        One column with the predicted gender for every row of
        ``pre_gen_mission_test_drop``.
    """
    features = pre_gen_train_test_drop
    target = pre_gen_train_test["gender"]

    classifier = lgb.LGBMClassifier(nthread=3)
    classifier.fit(features, target)

    in_sample_prediction = classifier.predict(features)
    print(classification_report(target, in_sample_prediction))

    predicted_gender = classifier.predict(pre_gen_mission_test_drop)
    return pd.DataFrame(predicted_gender)

model_gen_lgb = predict_gen_LightGBM()

  if diff:


             precision    recall  f1-score   support

     FEMALE       0.58      0.72      0.65     77524
       MALE       0.57      0.42      0.48     68209

avg / total       0.58      0.58      0.57    145733



  if diff:


In [19]:
# Write the gender predictions returned by predict_gen_LightGBM() to
# model_gen_lgb.csv in the working directory, without the index column.
model_gen_lgb.to_csv("model_gen_lgb.csv", index=False)

---

In [20]:
def predict_age_xgboost():
    """Fit an XGBoost classifier on the age bands and predict the missing ones.

    Trains on the notebook-level ``pre_age_train_test_drop`` (features of users
    with a known age) against ``cats`` (their age band).

    The printed classification report is computed on the same rows the model
    was fit on, so it is an in-sample score, not a hold-out estimate.

    Returns
    -------
    pandas.DataFrame
        One column with the predicted age band for every row of
        ``pre_age_mission_test_drop``.
    """
    X = pre_age_train_test_drop
    # ``cats`` is a one-column DataFrame. Passing it as-is makes scikit-learn
    # emit a DataConversionWarning and flatten it internally; flatten it here
    # so the estimator receives the 1-D target it expects.
    y = np.ravel(cats)

    classifier = XGBClassifier(nthread=3)
    classifier.fit(X, y)

    print(classification_report(y, classifier.predict(X)))

    predicted_band = classifier.predict(pre_age_mission_test_drop)
    return pd.DataFrame(predicted_band)

model_age_xg = predict_age_xgboost()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  if diff:
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

         노년       0.33      0.00      0.00      9993
       미성년자       0.00      0.00      0.00        68
         장년       0.47      0.36      0.41     55518
         중년       0.49      0.80      0.61     70900
         청년       0.50      0.02      0.03     22202

avg / total       0.47      0.48      0.42    158681



  if diff:


In [21]:
def predict_age_ExtraTreesClassifier():
    """Fit an extra-trees classifier on the age bands and predict the missing ones.

    Trains 250 trees (``random_state=0``) on the notebook-level
    ``pre_age_train_test_drop`` against ``cats``.

    The printed classification report is computed on the same rows the model
    was fit on, so it is an in-sample score, not a hold-out estimate.

    Returns
    -------
    pandas.DataFrame
        One column with the predicted age band for every row of
        ``pre_age_mission_test_drop``.
    """
    X = pre_age_train_test_drop
    # ``cats`` is a one-column DataFrame. Passing it as-is makes scikit-learn
    # emit a DataConversionWarning about a column-vector target; flatten it
    # here so the estimator receives the 1-D target it expects.
    y = np.ravel(cats)

    classifier = ExtraTreesClassifier(n_estimators=250, random_state=0)
    classifier.fit(X, y)

    print(classification_report(y, classifier.predict(X)))

    predicted_band = classifier.predict(pre_age_mission_test_drop)
    return pd.DataFrame(predicted_band)

model_age_forest = predict_age_ExtraTreesClassifier()

  import sys


             precision    recall  f1-score   support

         노년       0.69      0.72      0.70      9993
       미성년자       0.95      0.84      0.89        68
         장년       0.74      0.81      0.77     55518
         중년       0.76      0.82      0.79     70900
         청년       0.87      0.46      0.60     22202

avg / total       0.76      0.76      0.75    158681



In [22]:
def predict_age_LightGBM():
    """Fit a LightGBM classifier on the age bands and predict the missing ones.

    Trains on the notebook-level ``pre_age_train_test_drop`` against ``cats``.

    The printed classification report is computed on the same rows the model
    was fit on, so it is an in-sample score, not a hold-out estimate.

    Returns
    -------
    pandas.DataFrame
        One column with the predicted age band for every row of
        ``pre_age_mission_test_drop``.
    """
    X = pre_age_train_test_drop
    # ``cats`` is a one-column DataFrame. Passing it as-is triggers the
    # column_or_1d DataConversionWarning; flatten it here so the estimator
    # receives the 1-D target it expects.
    y = np.ravel(cats)

    classifier = lgb.LGBMClassifier(nthread=3)
    classifier.fit(X, y)

    print(classification_report(y, classifier.predict(X)))

    predicted_band = classifier.predict(pre_age_mission_test_drop)
    return pd.DataFrame(predicted_band)

model_age_lgb = predict_age_LightGBM()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  if diff:


             precision    recall  f1-score   support

         노년       0.67      0.00      0.01      9993
       미성년자       0.90      0.65      0.75        68
         장년       0.48      0.38      0.43     55518
         중년       0.50      0.79      0.61     70900
         청년       0.55      0.04      0.08     22202

avg / total       0.51      0.49      0.43    158681



  if diff:


---

In [23]:
# Write the age-band predictions of the three classifiers (XGBoost,
# extra-trees, LightGBM) to one CSV each in the working directory,
# without the index column.
model_age_xg.to_csv("model_age_xg.csv", index=False)
model_age_forest.to_csv("model_age_forest.csv", index=False)
model_age_lgb.to_csv("model_age_lgb.csv", index=False)

---

In [24]:
# NOTE: predict_age_LightGBM is already defined in an earlier cell of this
# notebook; this definition replaces it when the cell runs.
def predict_age_LightGBM():
    """Fit a LightGBM classifier on the age bands and predict the missing ones.

    Trains on the notebook-level ``pre_age_train_test_drop`` against ``cats``.

    The printed classification report is computed on the same rows the model
    was fit on, so it is an in-sample score, not a hold-out estimate.

    Returns
    -------
    pandas.DataFrame
        One column with the predicted age band for every row of
        ``pre_age_mission_test_drop``.
    """
    X = pre_age_train_test_drop
    # ``cats`` is a one-column DataFrame. Passing it as-is triggers the
    # column_or_1d DataConversionWarning; flatten it here so the estimator
    # receives the 1-D target it expects.
    y = np.ravel(cats)

    classifier = lgb.LGBMClassifier(nthread=3)
    classifier.fit(X, y)

    print(classification_report(y, classifier.predict(X)))

    predicted_band = classifier.predict(pre_age_mission_test_drop)
    return pd.DataFrame(predicted_band)

model_age_lgb = predict_age_LightGBM()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  if diff:


             precision    recall  f1-score   support

         노년       0.67      0.00      0.01      9993
       미성년자       0.90      0.65      0.75        68
         장년       0.48      0.38      0.43     55518
         중년       0.50      0.79      0.61     70900
         청년       0.55      0.04      0.08     22202

avg / total       0.51      0.49      0.43    158681



  if diff:


In [25]:
# Write the LightGBM age-band predictions to model_age_lgb.csv without the
# index column. An earlier cell also writes this file name, so it is replaced.
model_age_lgb.to_csv("model_age_lgb.csv", index=False)

In [26]:
# Redefines predict_age_LightGBM from the earlier cells; this version trains
# against the quantile bins in ``cats_q`` instead of the fixed bands in ``cats``.
def predict_age_LightGBM():
    """Fit a LightGBM classifier on the quantile age bins and predict the missing ones.

    Trains on the notebook-level ``pre_age_train_test_drop`` against ``cats_q``
    and prints a classification report for those same training rows (an
    in-sample score).

    Returns
    -------
    pandas.DataFrame
        One column with the predicted quantile bin for every row of
        ``pre_age_mission_test_drop``.
    """
    features = pre_age_train_test_drop
    target = cats_q

    classifier = lgb.LGBMClassifier(nthread=3)
    classifier.fit(features, target)

    in_sample_prediction = classifier.predict(features)
    print(classification_report(target, in_sample_prediction))

    predicted_bin = classifier.predict(pre_age_mission_test_drop)
    return pd.DataFrame(predicted_bin)

model_age_lgb = predict_age_LightGBM()

  if diff:


             precision    recall  f1-score   support

       age1       0.33      0.61      0.43     36477
       age2       0.28      0.09      0.14     30586
       age3       0.28      0.18      0.22     31139
       age4       0.30      0.12      0.17     29102
       age5       0.33      0.52      0.40     31377

avg / total       0.30      0.32      0.28    158681



  if diff:


In [27]:
# Write the quantile-bin predictions to model_age_lgb.csv without the index
# column. Earlier cells write the fixed-band LightGBM predictions to this same
# file name, so this call overwrites them.
# NOTE(review): confirm the overwrite is intended; otherwise use a distinct file name.
model_age_lgb.to_csv("model_age_lgb.csv", index=False)

---