# Model Data Preparation
###### by Wilson Lee
###### Data Set : China Mobile User Gemographics Source
###### Link : https://www.kaggle.com/chinapage/china-mobile-user-gemographics

This notebook converts the cleaned database produced during data wrangling into the format required for model training. The columns used here are based on the knowledge gained during Exploratory Data Analysis.

In [12]:
# import libraries
%matplotlib inline

import pandas as pd
import numpy as np
import os
import math

## Load Cleaned Database

### Event Data

In [2]:
wDatabaseFolder = "../../Data/Processed/users_with_age/active_apps"

# Load every file found (recursively) under the folder into one DataFrame.
# os.walk yields directory and file names in arbitrary, filesystem-dependent
# order, so both are sorted to make the row order of the concatenated frame
# reproducible across runs and machines.
list_of_database = []
for wRoot, wDirs, wFiles in os.walk(wDatabaseFolder):
    # Editing wDirs in place controls which sub-folders os.walk descends into:
    # visit them in sorted order and skip hidden ones (e.g. .ipynb_checkpoints).
    wDirs[:] = sorted(wDir for wDir in wDirs if not wDir.startswith("."))
    for wFilename in sorted(wFiles):
        # Hidden files (e.g. .DS_Store) are not data and must not reach read_csv.
        if wFilename.startswith("."):
            continue
        wCurrentFilename = os.path.join(wRoot, wFilename)
        list_of_database.append(pd.read_csv(wCurrentFilename, index_col=None))

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# fail early with a message that names the folder instead.
if not list_of_database:
    raise FileNotFoundError(
        "No data files found under '{}'".format(wDatabaseFolder)
    )

df_active_app = pd.concat(list_of_database, axis=0, ignore_index=True)

# de-reference loaded dataframe list so the per-file frames can be freed
list_of_database = []

## Preview Data

### Table Columns

In [3]:
# Show every column name of the loaded frame as a plain Python list
print(df_active_app.columns.tolist())

['event_id', 'is_active', '3d', '80', '90', 'abroad', 'academic', 'accommodation', 'accounting', 'action', 'activity', 'adventure', 'advice', 'advisory', 'aggregate', 'air', 'aircraft', 'airport', 'alliance', 'amoy', 'animation', 'answer', 'antique', 'app', 'appliance', 'application', 'area', 'around', 'art', 'asia', 'astrology', 'audiobooks', 'automotive', 'aviation', 'avoid', 'baby', 'ball', 'bank', 'banking', 'based', 'basketball', 'beach', 'beauty', 'behalf', 'big', 'billards', 'blog', 'bobble', 'bond', 'book', 'booking', 'box', 'brokerage', 'browser', 'bus', 'business', 'buy', 'calendar', 'car', 'card', 'care', 'cartoon', 'casual', 'chain', 'channel', 'checkpoint', 'chess', 'child', 'chinese', 'church', 'class', 'classical', 'clock', 'coach', 'collection', 'college', 'comfortable', 'comic', 'commodity', 'community', 'comparing', 'competitive', 'complex', 'condition', 'consumer', 'contact', 'content', 'convenience', 'cool', 'cosplay', 'cost', 'coupon', 'cozy', 'credit', 'crowdfundi

### Data

In [4]:
# Preview the loaded event data (the notebook renders the cell's last expression)
df_active_app

Unnamed: 0,event_id,is_active,3d,80,90,abroad,academic,accommodation,accounting,action,...,device_id,gender,age,group_y,phone_brand,device_model,longitude,latitude,hour,day_of_week
0,6,1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,1476664663289716375,M,19,M22-,huawei,Mate 7,110.012912,30.494667,0,6
1,29,1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,7166563712658305181,M,60,M39+,huawei,荣耀畅玩4C,117.960000,28.470000,0,6
2,35,1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,-3449419341168524142,M,28,M27-28,huawei,荣耀3X畅玩版,110.012912,30.494667,0,6
3,39,1,0.0,0.0,0.0,0.0,0.0,0.0,0.100000,0.0,...,-6542093539413689868,M,26,M23-26,huawei,荣耀畅玩4X,110.012912,30.494667,0,6
4,40,1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,-5638521260975573107,M,22,M22-,huawei,荣耀7,110.012912,30.494667,0,6
5,44,1,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,...,-4713356591613805069,M,25,M23-26,xiaomi,红米2,113.370000,28.250000,0,6
6,54,1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,-7868922663453980926,F,27,F27-28,huawei,G660-L075,113.110000,23.040000,23,5
7,61,1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,-1593861387409811850,M,24,M23-26,xiaomi,红米Note3,110.400000,25.310000,0,6
8,70,1,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,...,8250516622760332376,M,28,M27-28,xiaomi,MI 3,121.680000,31.120000,0,6
9,82,1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,3230070203366080618,M,35,M32-38,huawei,Mate 7,110.012912,30.494667,0,6


### Create Working Copy

In [5]:
# Work on an independent (deep) copy so the loaded frame itself is never modified
df_active_app_new = df_active_app.copy(deep=True)

### Reduce Events to One Event per User Per Hour

In [6]:
# Rows sharing all of these keys are collapsed into a single row
user_hour_keys = [
    "device_id",
    "gender",
    "age",
    "group_y",
    "phone_brand",
    "device_model",
    "day_of_week",
    "hour",
]

# event_id is dropped first so it is not averaged; every remaining non-key
# column is replaced by its mean within each group.
df_active_app_new = (
    df_active_app_new
    .drop(columns="event_id")
    .groupby(user_hour_keys)
    .mean()
    .reset_index()
)
df_active_app_new

Unnamed: 0,device_id,gender,age,group_y,phone_brand,device_model,day_of_week,hour,is_active,3d,...,west,western,wifi,word,world,xianxia,zombie,zuma,longitude,latitude
0,-9222956879900151005,M,36,M32-38,samsung,Galaxy Note 2,4,15,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
1,-9222956879900151005,M,36,M32-38,samsung,Galaxy Note 2,4,20,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
2,-9222956879900151005,M,36,M32-38,samsung,Galaxy Note 2,4,21,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
3,-9222956879900151005,M,36,M32-38,samsung,Galaxy Note 2,5,7,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
4,-9222956879900151005,M,36,M32-38,samsung,Galaxy Note 2,5,11,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
5,-9222956879900151005,M,36,M32-38,samsung,Galaxy Note 2,5,12,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
6,-9222956879900151005,M,36,M32-38,samsung,Galaxy Note 2,5,13,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
7,-9222956879900151005,M,36,M32-38,samsung,Galaxy Note 2,5,14,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
8,-9222956879900151005,M,36,M32-38,samsung,Galaxy Note 2,5,15,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
9,-9222956879900151005,M,36,M32-38,samsung,Galaxy Note 2,5,23,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000


### Drop Useless Columns

In [7]:
# Columns removed before modelling, all in a single pass:
#   - is_active, device_id      : columns used for identification
#   - phone_brand, device_model : the device information was too imbalanced to be practical
#   - group_y                   : the age groups will be redefined
columns_to_drop = ["is_active", "device_id", "phone_brand", "device_model", "group_y"]
df_active_app_new = df_active_app_new.drop(columns=columns_to_drop)


In [8]:
# Preview the working frame after the column drops (rendered as the cell output)
df_active_app_new

Unnamed: 0,gender,age,day_of_week,hour,3d,80,90,abroad,academic,accommodation,...,west,western,wifi,word,world,xianxia,zombie,zuma,longitude,latitude
0,M,36,4,15,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
1,M,36,4,20,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
2,M,36,4,21,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
3,M,36,5,7,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
4,M,36,5,11,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
5,M,36,5,12,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
6,M,36,5,13,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
7,M,36,5,14,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
8,M,36,5,15,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000
9,M,36,5,23,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000


### Encoding for Gender

In [9]:
# One-hot encode `gender`: the original column is replaced by one indicator
# column per value (named with the "gender_" prefix), appended at the end.
# The dtype is pinned to uint8 because the default indicator dtype changed from
# uint8 to bool in pandas 2.0; without it the saved CSVs would hold True/False
# instead of 0/1 depending on the installed pandas version.
df_active_app_new = pd.get_dummies(
    df_active_app_new,
    columns=["gender"],
    prefix="gender",
    dtype="uint8",
)
df_active_app_new

Unnamed: 0,age,day_of_week,hour,3d,80,90,abroad,academic,accommodation,accounting,...,wifi,word,world,xianxia,zombie,zuma,longitude,latitude,gender_F,gender_M
0,36,4,15,0.0,0.0,0.0,0.0,0.0,0.0,0.045656,...,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000,0,1
1,36,4,20,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000,0,1
2,36,4,21,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000,0,1
3,36,5,7,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000,0,1
4,36,5,11,0.0,0.0,0.0,0.0,0.0,0.0,0.014286,...,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000,0,1
5,36,5,12,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000,0,1
6,36,5,13,0.0,0.0,0.0,0.0,0.0,0.0,0.044766,...,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000,0,1
7,36,5,14,0.0,0.0,0.0,0.0,0.0,0.0,0.013889,...,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000,0,1
8,36,5,15,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000,0,1
9,36,5,23,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,113.240000,23.190000,0,1


### Save The New Data

In [13]:
print("Function start ...")

# Every per-weekday file is written to this folder. It is created up front
# because DataFrame.to_csv does not create missing parent directories, so the
# cell would otherwise fail on a fresh checkout.
output_folder = '../../Data/Modeling/active_apps_data'
os.makedirs(output_folder, exist_ok=True)

# save Data per day of the week (day_of_week values 0..6)
for i in range(0, 7):
    # get the slice of rows for this weekday
    df_day_app_use = df_active_app_new[df_active_app_new["day_of_week"] == i]

    # split the slice into 2 parts by row position; when the row count is odd
    # the second part receives the extra row
    threshold = df_day_app_use.shape[0] // 2
    df_day_app_use_1 = df_day_app_use.iloc[:threshold, :]
    df_day_app_use_2 = df_day_app_use.iloc[threshold:, :]

    # Set File name
    filename_day_app_use_1 = '{}/app_usage_on_weekday_{:1d}_p1.csv'.format(output_folder, i)
    filename_day_app_use_2 = '{}/app_usage_on_weekday_{:1d}_p2.csv'.format(output_folder, i)

    # Save data to csv (index omitted: it is only a positional row number)
    df_day_app_use_1.to_csv(filename_day_app_use_1, index=False)
    df_day_app_use_2.to_csv(filename_day_app_use_2, index=False)

print("Function Complete")

Function start ...
Function Complete
