In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

# Feature Engineering
This notebook consists of two parts:
1. Apply the transformations suggested by the findings from the EDA.
2. Cluster users, ads, apps, and devices to see whether any meaningful new features emerge.

# Load Data

In [2]:
data_dir = "./data/sampled_data.csv/"

# Read every CSV shard found directly inside `data_dir` and stack them row-wise.
# - `endswith(".csv")` is the actual suffix test; the previous `file[-3] and ...`
#   check was always truthy and raised IndexError on very short file names.
# - Files are visited in sorted order so the row order is reproducible
#   (os.listdir returns entries in arbitrary order).
# - Frames are collected first and concatenated once, instead of re-copying the
#   accumulated frame on every iteration.
csv_frames = [
    pd.read_csv(os.path.join(data_dir, file))
    for file in sorted(os.listdir(data_dir))
    if file.endswith(".csv")
]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory holds no CSV files. Each shard keeps its own row index.
df = pd.concat(csv_frames, axis=0) if csv_frames else pd.DataFrame()

## Transform Data

In [3]:
# Column names grouped by the entity they describe.

# Columns describing the user.
user_features = [
    "age",
    "city_rank",
    "residence",
    "gender",
    "up_life_duration",
    "duration_communication_onlinerate",
    "communication_avgonline_30d",
]

# Columns describing the ad.
ads_features = [
    "creat_type_cd",
    "dev_id",
    "slot_id",
    "indu_name",
]

# Columns describing the app.
app_features = [
    "tags",
    "app_first_class",
    "app_second_class",
    "his_app_size",
    "his_on_shelf_time",
    "app_score",
]

# Columns describing the device.
device_features = [
    "emui_dev",
    "list_time",
    "device_price",
]

# Columns describing the event itself.
event_features = [
    "net_type",
]

In [4]:
# For each column, the values that are kept as their own category; the
# bucketing loop further down replaces every other value with "other".
value_transform_map = {
    # The original entry subscripted a dict literal with (-1, 5, 7), which raises
    # KeyError. NOTE(review): that dict labelled -1 as "high_ctr" and 5 as
    # "low_ctr" -- presumably EDA annotations; confirm they are not needed here.
    "age": [-1, 5, 7],
    "city_rank": [2],
    "residence": [11, 22],
    "gender": [3],
    "up_life_duration": [-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    "communication_avgonline_30d": [-1, 1, 2, 3, 4],
    "duration_communication_onlinerate": [0, 1, 2, 3, 4, 5, 16],
    "creat_type_cd": [3, 4, 7],
    "dev_id": [17, 29, 30, 34, 60, 70],
    # NOTE(review): "inter_type_cd" does not appear in any of the *_features
    # lists defined earlier in this notebook -- confirm that is intentional.
    "inter_type_cd": [3],
    "slot_id": [12, 14, 15, 19],
    "indu_name": [17, 27, 36, 41, 49],
    "tags": [11, 12, 13, 23, 33, 37, 39, 40],
    "app_first_class": [4],
    "app_second_class": [17, 21, 23, 25],
    "his_app_size": [-1, 2, 4, 5, 6, 16],
    "his_on_shelf_time": [2, 3],
    "app_score": [1, 2],
    "emui_dev": [12, 13, 14, 20],
    "list_time": [-1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16],
    "device_price": [-1, 1, 3, 4, 5, 6],
    "net_type": [6],
}

In [5]:
# Derive first / last / span features from the "^"-separated
# communication_onlinerate string.
# Split once and reuse; astype(str) lets purely numeric cells (e.g. a shard
# where every value was parsed as an integer) go through the same path.
onlinerate_tokens = df["communication_onlinerate"].astype(str).str.split("^")

df["start_communication_onlinerate"] = onlinerate_tokens.str[0]
# The tokens are strings, so the wrap-around replacement must compare against
# "0"; the previous .replace(0, 24) used an integer and never matched anything.
# NOTE(review): a value consisting of the single token "0" now yields a
# duration of 24 -- confirm that is the intended reading.
df["last_communication_onlinerate"] = onlinerate_tokens.str[-1].replace("0", "24")
df["duration_communication_onlinerate"] = (
    df["last_communication_onlinerate"].astype(int)
    - df["start_communication_onlinerate"].astype(int)
)

In [6]:
# One-hot encoder for the categorical columns. It is only constructed here;
# nothing in this cell fits or applies it.
# Per the scikit-learn docs, handle_unknown='ignore' encodes categories not seen
# during fit as all zeros at transform time instead of raising.
enc = OneHotEncoder(handle_unknown='ignore')

In [7]:
# Working frame holding only the columns that have an entry in
# value_transform_map. Selecting first and copying second avoids duplicating
# the unused columns and makes transformed_df an independent frame, so later
# column assignments do not trigger SettingWithCopyWarning.
transformed_df = df[list(value_transform_map)].copy()

In [8]:
# Collapse every value not listed in value_transform_map[col] into "other".
# The values are read from the untouched `df` rather than from transformed_df,
# so re-running this cell gives the same result. Previously a second run saw
# the already-converted values, matched nothing, and turned every row into
# "other".
# Because the fallback is a string, numpy is expected to return kept numeric
# values in string form as well (e.g. 5 -> "5").
for col, kept_values in value_transform_map.items():
    source_values = df[col]
    transformed_df[col] = np.where(source_values.isin(kept_values), source_values, "other")