In [4]:
import os
import gc
import glob
from datetime import date

import pandas as pd
import numpy as np

from multiprocessing import Pool

from logging import getLogger
# Bind `logger` to the root logger for now; the name is rebound further down
# in this cell to the project's Util.Logger once the log file name is built.
logger = getLogger(None)

from tools.datahandling import DataHandling as DH
from tools.util import Util
from tools.encoding import FrequencyEncoder


# The log file name embeds today's ISO date (YYYY-MM-DD), so each calendar day
# gets its own file name under logs/.
logfile_name = 'logs/training_{}.log'.format(date.today().isoformat())

# Rebind the module-level `logger` to the project logger that is configured
# with that file name.
logger = Util.Logger(logfile_name=logfile_name)

In [7]:
def _load_split(split):
    """Read every CSV of one dataset split and stack them into one frame.

    Shared implementation behind ``load_train_data``, ``load_val_data`` and
    ``load_test_data``, which previously repeated this body verbatim.

    Parameters
    ----------
    split : str
        Sub-directory name under ``input/3.base/FrequencyEncoding``
        (``'train'``, ``'val'`` or ``'test'``).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation (``axis=0``) of all matching files, read in
        sorted path order, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches the split's glob pattern.
    """
    pattern = 'input/3.base/FrequencyEncoding/{}/*.csv'.format(split)
    paths = sorted(glob.glob(pattern))
    if not paths:
        # pd.concat([]) would raise ValueError as well, but its message does
        # not say which directory turned out to be empty.
        raise ValueError('no CSV files matched {}'.format(pattern))

    frames = []
    for path in paths:
        # NOTE(review): if the logger format includes the function name, these
        # records now carry this helper's name instead of the public loader's.
        logger.info('path {}'.format(path))
        frames.append(pd.read_csv(path))

    # NOTE(review): axis=0 stacks the files as row chunks. This assumes every
    # CSV in the directory has the same columns; if the files are one-per-
    # feature (e.g. FE_app.csv, FE_ip.csv, ...) they need axis=1 - confirm.
    df = pd.concat(frames, ignore_index=True, axis=0, copy=False)
    logger.info('data size {}'.format(df.shape))
    return df


def load_train_data():
    """Load the training split from ``input/3.base/FrequencyEncoding/train``."""
    return _load_split('train')


def load_val_data():
    """Load the validation split from ``input/3.base/FrequencyEncoding/val``."""
    return _load_split('val')


def load_test_data():
    """Load the test split from ``input/3.base/FrequencyEncoding/test``."""
    return _load_split('test')

In [8]:
# Load the three splits in order. Each loader reads all CSV files under its
# split directory and concatenates them into one DataFrame, logging every
# path and the final shape.
train = load_train_data()
val = load_val_data()
test = load_test_data()

2018-11-11 15:26:31,027 <ipython-input-7-03b1e113d350> tools.util 5 [INFO][load_data] path input/3.base/FrequencyEncoding/FE_app.csv 
2018-11-11 15:26:50,064 <ipython-input-7-03b1e113d350> tools.util 5 [INFO][load_data] path input/3.base/FrequencyEncoding/FE_channel.csv 
2018-11-11 15:27:08,900 <ipython-input-7-03b1e113d350> tools.util 5 [INFO][load_data] path input/3.base/FrequencyEncoding/FE_device.csv 
2018-11-11 15:27:27,097 <ipython-input-7-03b1e113d350> tools.util 5 [INFO][load_data] path input/3.base/FrequencyEncoding/FE_ip.csv 
2018-11-11 15:27:47,154 <ipython-input-7-03b1e113d350> tools.util 5 [INFO][load_data] path input/3.base/FrequencyEncoding/FE_os.csv 
2018-11-11 15:28:05,356 <ipython-input-7-03b1e113d350> tools.util 5 [INFO][load_data] path input/3.base/FrequencyEncoding/click_id.csv 
2018-11-11 15:28:18,548 <ipython-input-7-03b1e113d350> tools.util 5 [INFO][load_data] path input/3.base/FrequencyEncoding/data_set.csv 
2018-11-11 15:28:30,258 <ipython-input-7-03b1e113d350

In [None]:
# Separate the "is_attributed" target from the feature columns for each split.
# The full frame is deleted and garbage-collected right after it has been
# split, so a frame and its feature copy do not stay alive longer than needed.

# --- training split ---
X_train = train.drop(columns="is_attributed")
y_train = train["is_attributed"]

del train
gc.collect()

# --- validation split ---
X_val = val.drop(columns="is_attributed")
y_val = val["is_attributed"]

del val
gc.collect()

# --- test split ---
# click_id is kept aside as the submission key and removed from the features.
# NOTE(review): only the test features have "click_id" dropped; if the
# train/val frames carry that column too, the feature sets differ - confirm.
test_id = test.click_id
X_test = test.drop(columns=["is_attributed", "click_id"])

del test
gc.collect()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics


# Fixed seed: a random forest draws bootstrap samples and feature subsets at
# random, so without it every Restart & Run All gives a different model, a
# different local AUC and a different submission.
RANDOM_SEED = 42

# 100 trees of depth <= 10, fitted with 10 parallel jobs.
# class_weight="balanced" reweights the classes inversely to their frequency
# in y_train.
rf = RandomForestClassifier(n_estimators=100,
                            max_depth=10,
                            n_jobs=10,
                            class_weight="balanced",
                            random_state=RANDOM_SEED)
rf.fit(X_train, y_train)

In [None]:
# ROC/AUC measures ranking quality, so it must be fed continuous scores.
# rf.predict() returns hard class labels, which collapses the ROC curve to a
# single operating point; use the predicted probability of the positive class
# instead (column 1 of predict_proba, since classes_ is sorted).
pred_val = rf.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_val, pred_val)
auc = metrics.auc(fpr, tpr)

In [None]:
# The notebook scores the model by AUC, so the submission carries the
# predicted probability of the positive class (column 1 of predict_proba)
# rather than the hard labels returned by rf.predict().
pred_test = rf.predict_proba(X_test)[:, 1]

# Two-column submission frame: the click_id key saved from the test split and
# the model score for that row.
my_submission = pd.DataFrame()
my_submission["click_id"] = test_id
my_submission["is_attributed"] = pred_test

In [None]:
# The local validation AUC is embedded in the file name so each submission
# file can be traced back to its local score.
submission_path = "submission/0.FE_RF_local{}.csv".format(auc)

# to_csv does not create missing parent directories; make sure the output
# folder exists so a fresh checkout does not fail after the long training run.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
my_submission.to_csv(submission_path, index=False)