In [1]:
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
pd.options.display.max_rows = 100
pd.options.display.max_columns = 40
import numpy as np
import os,random, math
from tqdm import tqdm
from copy import deepcopy
from collections import Counter

# Visualization
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import font_manager, rc
import platform

# Pick a font with Hangul glyphs for the current OS.
# The original forced 'AppleGothic' on every non-Windows system (Linux
# included), which discarded the 'NanumGothic' default. AppleGothic is now
# only selected on macOS; other systems keep NanumGothic
# (assumes NanumGothic is installed there — TODO confirm).
plt.rcParams['font.family'] = 'NanumGothic'
_system = platform.system()
if _system == 'Windows':
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
elif _system == 'Darwin':
    rc('font', family='AppleGothic')

matplotlib.rcParams['axes.unicode_minus'] = False

# Warnings
import warnings
# NOTE(review): this 'always' filter has no lasting effect — the 'ignore'
# filter registered on the next line is inserted ahead of it and matches first.
warnings.filterwarnings('always')
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

import datetime

from dateutil.relativedelta import relativedelta
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as mae

In [2]:
# Directory prefix for the raw CSV exports; the four tables are read from it.
path = "./data/"
drug, emrg, ear, log_doll = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        'log_doll_drug_consume_v3.csv',
        'log_emergency_push_v2.csv',
        'scc_ear_function_log_v2.csv',
        'log_doll.csv',
    )
)

In [3]:
# Rename the day column of drug and ear from 'date_day' to 'date'.
day_to_date = {'date_day': 'date'}
drug = drug.rename(columns=day_to_date)
ear = ear.rename(columns=day_to_date)

In [4]:
# (rows, columns) of each of the four raw tables.
drug.shape, emrg.shape, ear.shape, log_doll.shape

((195518, 3), (43995, 3), (75813, 11), (587766, 10))

## log_doll 기준으로 날짜 + doll_id 단위의 log_user 데이터 만들기

In [5]:
# Build a gap-free daily calendar per doll: one entry for every day between the
# doll's first and last log date (inclusive). The results are two parallel
# lists, `date` ('YYYY-MM-DD' strings) and `doll_id`.
#
# First/last dates for all dolls come from a single groupby pass instead of
# re-scanning the whole of log_doll twice per doll. sort=False keeps dolls in
# order of first appearance, matching log_doll.doll_id.unique().
date = []
doll_id = []
date_span = log_doll.groupby('doll_id', sort=False)['date'].agg(['min', 'max'])
for u, origin, end in zip(date_span.index, date_span['min'], date_span['max']):
    # Every calendar day from the first to the last logged day, inclusive.
    days = pd.date_range(pd.to_datetime(origin), pd.to_datetime(end), freq='D')
    date += days.strftime('%Y-%m-%d').tolist()
    doll_id += [u] * len(days)

In [6]:
# Assemble the (date, doll_id) skeleton from the two parallel lists.
log_user = pd.DataFrame({'date':date,'doll_id':doll_id})

In [7]:
# Number of (date, doll_id) rows in the skeleton.
log_user.shape

(927578, 2)

In [8]:
# Preview the first rows of the skeleton.
log_user.head()

Unnamed: 0,date,doll_id
0,2019-04-01,124017
1,2019-04-02,124017
2,2019-04-03,124017
3,2019-04-04,124017
4,2019-04-05,124017


## log_user 와 log_doll 합치기

In [9]:
# Left-join the raw log onto the daily skeleton; skeleton days with no
# matching log_doll row get NaN in every log_doll column.
mrg = log_user.merge(log_doll, how='left', on=['date', 'doll_id'])

In [10]:
# Row count should equal log_user's; a larger count would mean duplicate
# (date, doll_id) keys in log_doll.
mrg.shape

(927578, 10)

In [11]:
# NaN count per column after the left join.
mrg.isnull().sum()

date                                 0
doll_id                              0
stroke                          339812
hand_hold                       339812
knock                           339812
human_detection                 339812
gymnastics                      339812
brain_timer                     339812
battery                         339812
last_none_action_time_passed    339812
dtype: int64

#### na확인

In [12]:
# For every doll that has at least one NaN in any column, record the doll id
# and its number of rows with a missing `stroke`.
# Uses grouped reductions instead of filtering the full frame once per doll.
# sort=False keeps dolls in order of first appearance, like doll_id.unique().
row_has_na = mrg.isnull().any(axis=1)
doll_has_na = row_has_na.groupby(mrg.doll_id, sort=False).any()
stroke_na_cnt = mrg.stroke.isnull().groupby(mrg.doll_id, sort=False).sum()
doll = doll_has_na[doll_has_na].index.tolist()
na_num = stroke_na_cnt.loc[doll].tolist()

In [13]:
# One row per doll with missing data: doll id and its count of missing rows.
nacheck = pd.DataFrame({'doll_id':doll,'na_num':na_num})

In [14]:
# Dolls with the most missing rows first.
nacheck.sort_values('na_num',ascending =False)

Unnamed: 0,doll_id,na_num
358,125364,688
333,125339,677
378,125384,650
365,125371,634
231,125216,632
...,...,...
1762,127827,1
1761,127826,1
1760,127825,1
1759,127824,1


#### log_doll 데이터 존재 여부(NA 여부) 표시용 플래그 logdoll_yn 만들기

In [15]:
# 1 where `stroke` is present, 0 where it is NaN.
# Must run before NaNs are filled, otherwise every row would be flagged 1.
mrg['logdoll_yn'] = mrg.stroke.notnull().astype('int64')

In [16]:
# Replace every NaN with 0.
# NOTE(review): this zero-fills all columns, including battery and
# last_none_action_time_passed, where 0 may not mean "no data" — confirm
# downstream code uses logdoll_yn to tell missing days from true zeros.
mrg = mrg.fillna(0)

## drug 데이터 merge

In [17]:
# Shape before the drug merge, for comparison with the merged result.
mrg.shape

(927578, 11)

In [18]:
# Left-join drug onto the daily frame; days without a matching drug row get NaN.
mrg2 = mrg.merge(drug, how='left', on=['date', 'doll_id'])

In [19]:
# Row count should be unchanged by the left join; a larger count would mean
# duplicate (date, doll_id) keys in drug.
mrg2.shape

(927578, 12)

In [20]:
# NaN count per column after the drug merge.
mrg2.isnull().sum()

date                                 0
doll_id                              0
stroke                               0
hand_hold                            0
knock                                0
human_detection                      0
gymnastics                           0
brain_timer                          0
battery                              0
last_none_action_time_passed         0
logdoll_yn                           0
consume_cnt                     732060
dtype: int64

### drug 데이터 존재 여부(NA 여부) 표시용 플래그 drug_yn 만들기

In [21]:
# 1 where `consume_cnt` is present, 0 where it is NaN.
# Must run before NaNs are filled.
mrg2['drug_yn'] = mrg2.consume_cnt.notnull().astype('int64')

#### na확인

In [22]:
# For every doll that has at least one NaN in any column, record the doll id
# and its number of rows with a missing `consume_cnt`.
# Uses grouped reductions instead of filtering the full frame once per doll.
# sort=False keeps dolls in order of first appearance, like doll_id.unique().
row_has_na = mrg2.isnull().any(axis=1)
doll_has_na = row_has_na.groupby(mrg2.doll_id, sort=False).any()
consume_na_cnt = mrg2.consume_cnt.isnull().groupby(mrg2.doll_id, sort=False).sum()
doll = doll_has_na[doll_has_na].index.tolist()
na_num = consume_na_cnt.loc[doll].tolist()

In [23]:
# Per-doll count of rows with missing consume_cnt, largest first.
nacheck = pd.DataFrame({'doll_id':doll,'na_num':na_num})
nacheck.sort_values('na_num',ascending =False)

Unnamed: 0,doll_id,na_num
40,124097,773
41,123775,773
30,124020,773
103,125040,772
208,125110,771
...,...,...
5091,129970,1
5092,129971,1
5093,129972,1
5094,129973,1


In [24]:
# Replace remaining NaNs with 0; drug_yn keeps track of which rows had no drug record.
mrg2 = mrg2.fillna(0)

In [25]:
# Confirm that no NaN is left in any column.
mrg2.isnull().sum()

date                            0
doll_id                         0
stroke                          0
hand_hold                       0
knock                           0
human_detection                 0
gymnastics                      0
brain_timer                     0
battery                         0
last_none_action_time_passed    0
logdoll_yn                      0
consume_cnt                     0
drug_yn                         0
dtype: int64

## emrg 데이터 merge

In [26]:
# Left-join emrg onto the daily frame; days without a matching emrg row get NaN.
mrg3 = mrg2.merge(emrg, how='left', on=['date', 'doll_id'])

### emergency_yn

In [27]:
# 1 where `emrg_cnt` is present, 0 where it is NaN.
# Must run before NaNs are filled.
mrg3['emrg_yn'] = mrg3.emrg_cnt.notnull().astype('int64')

### na확인

In [28]:
# For every doll that has at least one NaN in any column, record the doll id
# and its number of rows with a missing `emrg_cnt`.
# Uses grouped reductions instead of filtering the full frame once per doll.
# sort=False keeps dolls in order of first appearance, like doll_id.unique().
row_has_na = mrg3.isnull().any(axis=1)
doll_has_na = row_has_na.groupby(mrg3.doll_id, sort=False).any()
emrg_na_cnt = mrg3.emrg_cnt.isnull().groupby(mrg3.doll_id, sort=False).sum()
doll = doll_has_na[doll_has_na].index.tolist()
na_num = emrg_na_cnt.loc[doll].tolist()

In [29]:
# Per-doll count of rows with missing emrg_cnt, largest first.
nacheck = pd.DataFrame({'doll_id':doll,'na_num':na_num})
nacheck.sort_values('na_num',ascending =False)

Unnamed: 0,doll_id,na_num
6,124273,773
104,125042,771
288,125185,770
25,123796,769
330,125226,769
...,...,...
5106,129971,1
5107,129972,1
5108,129973,1
5109,129974,1


In [30]:
# Inspect all rows of a single doll (id 125042) before NaNs are filled.
mrg3[mrg3.doll_id ==125042]

Unnamed: 0,date,doll_id,stroke,hand_hold,knock,human_detection,gymnastics,brain_timer,battery,last_none_action_time_passed,logdoll_yn,consume_cnt,drug_yn,emrg_cnt,emrg_yn
36561,2019-04-02,125042,0.0000,0.0000,0.0000,67.0000,0.0000,0.0000,80.3333,0.0000,1,0.0000,0,,0
36562,2019-04-03,125042,38.0000,1.0000,243.0000,1377.0000,0.0000,2.0000,98.8000,0.0000,1,0.0000,0,,0
36563,2019-04-04,125042,8.0000,4.0000,81.0000,3100.0000,3.0000,1.0000,100.0000,0.0000,1,0.0000,0,,0
36564,2019-04-05,125042,7.0000,1.0000,199.0000,2491.0000,3.0000,0.0000,100.0000,0.0000,1,0.0000,0,,0
36565,2019-04-06,125042,8.0000,3.0000,173.0000,2598.0000,2.0000,0.0000,100.0000,0.0000,1,0.0000,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37327,2021-05-07,125042,0.0000,0.0000,0.0000,3102.0000,0.0000,0.0000,100.0000,0.0000,1,0.0000,0,,0
37328,2021-05-08,125042,1.0000,0.0000,1.0000,3169.0000,0.0000,0.0000,100.0000,0.0000,1,0.0000,0,,0
37329,2021-05-09,125042,0.0000,0.0000,0.0000,3107.0000,0.0000,0.0000,100.0000,0.0000,1,0.0000,0,,0
37330,2021-05-10,125042,0.0000,0.0000,0.0000,3102.0000,0.0000,0.0000,100.0000,0.0000,1,0.0000,0,,0


In [31]:
# Replace remaining NaNs with 0; emrg_yn keeps track of which rows had no emrg record.
mrg3 = mrg3.fillna(0)

In [32]:
# Confirm that no NaN is left in any column.
mrg3.isnull().sum()

date                            0
doll_id                         0
stroke                          0
hand_hold                       0
knock                           0
human_detection                 0
gymnastics                      0
brain_timer                     0
battery                         0
last_none_action_time_passed    0
logdoll_yn                      0
consume_cnt                     0
drug_yn                         0
emrg_cnt                        0
emrg_yn                         0
dtype: int64

In [33]:
# Save the log_doll + drug + emrg daily table. Written under `path` (the same
# data directory the inputs were read from) rather than a second hard-coded
# 'data/' prefix, so input and output locations cannot drift apart.
mrg3.to_csv(path + 'log_merge1.csv', index = False)

## ear 기능 merge

In [34]:
# Earliest date present in the ear table.
np.min(ear.date)

'2020-04-21'

In [35]:
# Keep only days on or after the earliest date in the ear table. The cut-off is
# taken from the data instead of repeating it as a literal, so it stays correct
# if the ear export changes. Dates are 'YYYY-MM-DD' strings, so string
# comparison orders them chronologically.
mrg4 = mrg3[mrg3.date >= ear.date.min()]

In [36]:
# Rows remaining after the date filter.
mrg4.shape

(632531, 15)

In [37]:
# Left-join ear onto the daily frame; days without a matching ear row get NaN.
# NOTE(review): both sides have a `gymnastics` column, so pandas renames them
# to gymnastics_x (from the left frame) and gymnastics_y (from ear). Explicit
# suffixes would be clearer, but would change the saved CSV schema — confirm
# with downstream consumers before renaming.
mrg4 = pd.merge(mrg4,ear, on = ['date','doll_id'], how = 'left')

In [38]:
# NaN count per column after the ear merge.
mrg4.isnull().sum()

date                                 0
doll_id                              0
stroke                               0
hand_hold                            0
knock                                0
human_detection                      0
gymnastics_x                         0
brain_timer                          0
battery                              0
last_none_action_time_passed         0
logdoll_yn                           0
consume_cnt                          0
drug_yn                              0
emrg_cnt                             0
emrg_yn                              0
story                           556718
religion                        556718
music                           556718
english                         556718
rememberance                    556718
quiz                            556718
gymnastics_y                    556718
classic_music                   556718
religion_music                  556718
dtype: int64

In [39]:
# Replace remaining NaNs with 0.
# NOTE(review): unlike the earlier merges, no *_yn flag is created for ear, so
# after this step a day with no ear row cannot be told apart from a true zero.
mrg4 = mrg4.fillna(0)

In [40]:
# Save the daily table including ear columns. Written under `path` (the same
# data directory the inputs were read from) rather than a second hard-coded
# 'data/' prefix, so input and output locations cannot drift apart.
mrg4.to_csv(path + 'log_merge2.csv', index = False)

# 테스트 계정 제외한 데이터 만들기

In [41]:
# Load the doll table used to decide which doll ids to keep. Read from `path`
# (the same data directory as the other inputs) rather than a second
# hard-coded 'data/' prefix.
doll_info = pd.read_csv(path + 'merge_v5.csv')

In [42]:
# Distinct doll ids present in doll_info; only these are kept below.
# (Presumably doll_info already excludes test accounts — verify.)
use_id = doll_info.doll_id.unique()

In [43]:
# Keep only rows whose doll_id is listed in use_id, then renumber the index.
keep_rows = mrg3.doll_id.isin(use_id)
mrg_3 = mrg3.loc[keep_rows].reset_index(drop=True)
mrg_3.shape

(521214, 15)

In [44]:
# Keep only rows whose doll_id is listed in use_id, then renumber the index.
keep_rows = mrg4.doll_id.isin(use_id)
mrg_4 = mrg4.loc[keep_rows].reset_index(drop=True)
mrg_4.shape

(414600, 24)

In [45]:
# Save the two filtered tables. Written under `path` (the same data directory
# the inputs were read from) rather than a second hard-coded 'data/' prefix,
# so input and output locations cannot drift apart.
mrg_3.to_csv(path + 'log_merge1_v2.csv', index = False)
mrg_4.to_csv(path + 'log_merge2_v2.csv', index = False)