# Data Wrangling and Cleaning
by Wilson Lee
Data Set : China Mobile User Gemographics
Source Link : https://www.kaggle.com/chinapage/china-mobile-user-gemographics

In [1]:
# import libraries
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import nltk
import string
from nltk.tokenize import word_tokenize

# Show floats with two fixed decimals instead of scientific notation.
pd.set_option("display.float_format", "{:.2f}".format)


## Load & Merge the Data

### Load single file database

In [2]:
# Each of these tables ships as one CSV file; read them in a fixed order and
# bind the resulting frames to their names in that same order.
wSingleFilePaths = (
    '../../Data/Raw/China Mobile User Gemographics/app_labels.csv',
    '../../Data/Raw/China Mobile User Gemographics/gender_age_test.csv',
    '../../Data/Raw/China Mobile User Gemographics/gender_age_train.csv',
    '../../Data/Raw/China Mobile User Gemographics/label_categories.csv',
    '../../Data/Raw/China Mobile User Gemographics/phone_brand_device_model.csv',
)
(
    df_app_label,
    df_gender_age_test,
    df_gender_age_train,
    df_label_categories,
    df_phone_brand_device_model,
) = [pd.read_csv(wPath) for wPath in wSingleFilePaths]



### Load Multi-File database

#### Load Event List

In [3]:
# The events table is stored as header-less chunk files plus one separate
# file that contains only the column names.
wEvent_splitFolder = "../../Data/Raw/China Mobile User Gemographics/events_split"
wEvents_header_file = "events_header"
df_events_header = pd.read_csv(os.path.join(wEvent_splitFolder, wEvents_header_file))

list_df_events = []
for wRoot, wDirs, wFiles in os.walk(wEvent_splitFolder):
    # os.walk yields names in arbitrary file-system order; sorting makes the
    # chunk selected below the same on every run and on every machine.
    wDirs.sort()
    for wFilename in sorted(wFiles):
        if wFilename == wEvents_header_file:
            continue
        wCurrentFilename = os.path.join(wRoot, wFilename)
        list_df_events.append(pd.read_csv(wCurrentFilename, index_col=None, header=None))
        # Only the first chunk of each folder is loaded, so df_events is a
        # subset of the full table. Remove this break to load every chunk
        # (NOTE(review): presumably kept to limit memory use - confirm).
        break

if not list_df_events:
    # Fail with a readable message instead of pd.concat's generic error.
    raise FileNotFoundError("No event chunk files found in {0}".format(wEvent_splitFolder))

df_events = pd.concat(list_df_events, axis=0, ignore_index=True)
df_events.columns = df_events_header.columns

# de-reference loaded dataframe list
list_df_events = []

#### Load Application Events

In [4]:
# The application-events table is stored as header-less chunk files plus one
# separate file that contains only the column names.
wApp_Event_splitFolder = "../../Data/Raw/China Mobile User Gemographics/app_events_split"
wApp_Events_header_file = "app_events_header"
df_app_events_header = pd.read_csv(os.path.join(wApp_Event_splitFolder, wApp_Events_header_file))

list_df_app_events = []
for wRoot, wDirs, wFiles in os.walk(wApp_Event_splitFolder):
    # os.walk yields names in arbitrary file-system order; sorting makes the
    # chunk selected below the same on every run and on every machine.
    wDirs.sort()
    for wFilename in sorted(wFiles):
        if wFilename == wApp_Events_header_file:
            continue
        wCurrentFilename = os.path.join(wRoot, wFilename)
        list_df_app_events.append(pd.read_csv(wCurrentFilename, index_col=None, header=None))
        # Only the first chunk of each folder is loaded, so df_app_events is a
        # subset of the full table. Remove this break to load every chunk
        # (NOTE(review): presumably kept to limit memory use - confirm).
        break

if not list_df_app_events:
    # Fail with a readable message instead of pd.concat's generic error.
    raise FileNotFoundError("No app event chunk files found in {0}".format(wApp_Event_splitFolder))

df_app_events = pd.concat(list_df_app_events, axis=0, ignore_index=True)
df_app_events.columns = df_app_events_header.columns

# de-reference loaded dataframe list
list_df_app_events = []

### Preview Data

#### Gender & Age Data

In [5]:
# Size of the test device table, then its first three rows.
print("Number of rows : {0}".format(df_gender_age_test.shape[0]))
df_gender_age_test.head(n=3)

Number of rows : 112071


Unnamed: 0,device_id
0,1002079943728939269
1,-1547860181818787117
2,7374582448058474277


In [6]:
# Size of the training device table, then its first three rows.
print("Number of rows : {0}".format(df_gender_age_train.shape[0]))
df_gender_age_train.head(n=3)

Number of rows : 74645


Unnamed: 0,device_id,gender,age,group
0,-8076087639492063270,M,35,M32-38
1,-2897161552818060146,M,35,M32-38
2,-8260683887967679142,M,35,M32-38


#### App Label Data

In [7]:
# Size of the app-to-label table, then its first three rows.
print("Number of rows : {0}".format(df_app_label.shape[0]))
df_app_label.head(n=3)

Number of rows : 459943


Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406


#### App Category Labels

In [8]:
# Size of the label-to-category table, then its first three rows.
print("Number of rows : {0}".format(df_label_categories.shape[0]))
df_label_categories.head(n=3)

Number of rows : 930


Unnamed: 0,label_id,category
0,1,
1,2,game-game type
2,3,game-Game themes


#### Device Brand and Model

In [9]:
# Size of the device brand/model table, then its first three rows.
print("Number of rows : {0}".format(df_phone_brand_device_model.shape[0]))
df_phone_brand_device_model.head(n=3)

Number of rows : 187245


Unnamed: 0,device_id,phone_brand,device_model
0,-8890648629457979026,小米,红米
1,1277779817574759137,小米,MI 2
2,5137427614288105724,三星,Galaxy S4


#### Event Data

In [10]:
# Size of the loaded events table, then its first ten rows.
print("Number of rows : {0}".format(df_events.shape[0]))
df_events.head(n=10)

Number of rows : 849999


Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66
5,6,1476664663289716375,2016-05-01 00:27:21,0.0,0.0
6,7,5990807147117726237,2016-05-01 00:15:13,113.73,23.0
7,8,1782450055857303792,2016-05-01 00:15:35,113.94,34.7
8,9,-2073340001552902943,2016-05-01 00:15:33,0.0,0.0
9,10,-8195816569128397698,2016-05-01 00:41:31,119.34,26.04


#### Application Event Data

In [11]:
# Size of the loaded application-events table, then its first ten rows.
print("Number of rows : {0}".format(df_app_events.shape[0]))
df_app_events.head(n=10)

Number of rows : 849999


Unnamed: 0,event_id,app_id,is_installed,is_active
0,2,5927333115845830913,1,1
1,2,-5720078949152207372,1,0
2,2,-1633887856876571208,1,0
3,2,-653184325010919369,1,1
4,2,8693964245073640147,1,1
5,2,4775896950989639373,1,1
6,2,-8022267440849930066,1,0
7,2,9112463267739110219,1,0
8,2,-3725672010020973973,1,0
9,2,7167114343576723123,1,1


### Clean Data

#### Drop NA & NaN

In [12]:
# For every table: remember the row count, drop rows containing any missing
# value, then report the count before and after the drop.
# (df_gender_age_test is not filtered in this cell.)

wRowsBefore = len(df_app_label)
df_app_label = df_app_label.dropna()
print("<df_app_label> Number of rows prior drop NA : {0}".format(wRowsBefore))
print("<df_app_label> Number of rows after drop NA : {0}".format(len(df_app_label)))
print()

wRowsBefore = len(df_gender_age_train)
df_gender_age_train = df_gender_age_train.dropna()
print("<df_gender_age_train> Number of rows prior drop NA : {0}".format(wRowsBefore))
print("<df_gender_age_train> Number of rows after drop NA : {0}".format(len(df_gender_age_train)))
print()

wRowsBefore = len(df_label_categories)
df_label_categories = df_label_categories.dropna()
print("<df_label_categories> Number of rows prior drop NA : {0}".format(wRowsBefore))
print("<df_label_categories> Number of rows after drop NA : {0}".format(len(df_label_categories)))
print()

wRowsBefore = len(df_phone_brand_device_model)
df_phone_brand_device_model = df_phone_brand_device_model.dropna()
print("<df_phone_brand_device_model> Number of rows prior drop NA : {0}".format(wRowsBefore))
print("<df_phone_brand_device_model> Number of rows after drop NA : {0}".format(len(df_phone_brand_device_model)))
print()

wRowsBefore = len(df_app_events)
df_app_events = df_app_events.dropna()
print("<df_app_events> Number of rows prior drop NA : {0}".format(wRowsBefore))
print("<df_app_events> Number of rows after drop NA : {0}".format(len(df_app_events)))
print()

wRowsBefore = len(df_events)
df_events = df_events.dropna()
print("<df_events> Number of rows prior drop NA : {0}".format(wRowsBefore))
print("<df_events> Number of rows after drop NA : {0}".format(len(df_events)))



<df_app_label> Number of rows prior drop NA : 459943
<df_app_label> Number of rows after drop NA : 459943

<df_gender_age_train> Number of rows prior drop NA : 74645
<df_gender_age_train> Number of rows after drop NA : 74645

<df_label_categories> Number of rows prior drop NA : 930
<df_label_categories> Number of rows after drop NA : 927

<df_phone_brand_device_model> Number of rows prior drop NA : 187245
<df_phone_brand_device_model> Number of rows after drop NA : 187245

<df_app_events> Number of rows prior drop NA : 849999
<df_app_events> Number of rows after drop NA : 849999

<df_events> Number of rows prior drop NA : 849999
<df_events> Number of rows after drop NA : 849999


#### Remove "unknown" Label Categories

In [13]:
# The frame filtered here is df_label_categories, so the log lines name it
# (they previously said <df_app_label>).
print("<df_label_categories> Number of rows prior <unknown> category removal : {0}".format(len(df_label_categories)))

# Category values that carry no information: empty text, the literal string
# "NaN" and the placeholder "unknown" (exact, case-sensitive matches).
wUninformativeCategories = ["", "NaN", "unknown"]
df_label_categories = df_label_categories[~df_label_categories["category"].isin(wUninformativeCategories)]

print("<df_label_categories> Number of rows after <unknown> category removal : {0}".format(len(df_label_categories)))


<df_app_label> Number of rows prior <unknown> category removal : 927
<df_app_label> Number of rows after <unknown> category removal : 901


#### Remove Events of Un-Installed Applications

In [14]:
# Report the size of the frame that is actually filtered (df_app_events).
# The previous version printed len(df_events), which this cell never changes,
# so the before/after numbers could not show the effect of the filter.
print("<df_app_events> Number of rows prior uninstalled app removal : {0}".format(len(df_app_events)))

# keep only the rows flagged as installed
df_app_events = df_app_events[(df_app_events["is_installed"] == 1)]
print("<df_app_events> Number of rows after uninstalled app removal : {0}".format(len(df_app_events)))

# sanity check: no uninstalled rows may remain, so this displays an empty frame
df_app_events[(df_app_events["is_installed"] == 0)]


<df_events> Number of rows prior uninstalled app removal : 849999
<df_events> Number of rows after uninstalled app removal : 849999


Unnamed: 0,event_id,app_id,is_installed,is_active


##### Remove "is_installed" Column

In [15]:
# "is_installed" is removed from the frame; show the remaining columns.
df_app_events = df_app_events.drop("is_installed", axis="columns")
df_app_events.head(n=3)


Unnamed: 0,event_id,app_id,is_active
0,2,5927333115845830913,1
1,2,-5720078949152207372,0
2,2,-1633887856876571208,0


#### Transform Data

##### Label Categories

###### Clean Text

In [16]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Work on a copy; the normalised text is built in a new "category-mod" column
# so the original "category" text stays available next to it.
df_trans_label_categories = df_label_categories.copy()

# to lower case
df_trans_label_categories["category-mod"] = df_trans_label_categories["category"].str.lower()

# Character-wise translation table: 'g', 'a', 'm', 'e' map to themselves, so
# the only effective change is '-' -> ' '.
translator = str.maketrans('game-','game ')
df_trans_label_categories["category-mod"] = df_trans_label_categories["category-mod"].str.translate(translator)

# Ordered clean-up rules: (pattern, replacement, pattern_is_regex).
# The regex flag is always passed explicitly because the default of
# Series.str.replace differs between pandas versions; relying on it makes the
# regex rules silently literal (or the literal "(" / "." rules regexes).
# Rules that remove a whole space-delimited token put a single space back so
# the neighbouring words are not glued together.
wCleanRules = [
    # rpg
    (" rpg ", " rpg game ", False),
    ("  ", " ", False),
    (" role playing games ", " rpg game ", False),
    # paid
    (r"paid[0-9]", "paid", True),
    # lower / higher (trailing space kept so the next word stays separate)
    (" lower ", " low ", False),
    (" higher ", " high ", False),
    # play
    (r"played[0-9]", "play", True),
    (r"play[0-9]", "play", True),
    (" played ", " play ", False),
    # fetus
    ("fetus", " pregnant baby ", False),
    ("pregnancy", " pregnant baby ", False),
    # kids
    (" kids ", " children ", False),
    (" children ", " child ", False),
    # united states
    (r"^us ", "united-states ", True),
    (r" us$", " united-states", True),
    (" us ", " united-states ", False),
    ("united states", "united-states", False),
    # united kingdom
    (r"^uk ", "united-kingdom ", True),
    (r" uk$", " united-kingdom", True),
    (" uk ", " united-kingdom ", False),
    ("united kingdom", "united-kingdom", False),
    # special words
    ("jin yon", "jinyon", False),
    ("xian xia", "xianxia", False),
    ("flight", "aviation", False),
    ("airline", "aviation", False),
    ("aeronautical", "aviation", False),
    ("aviations", "aviation", False),
    ("shares", "stock", False),
    ("puzzel", "puzzle", False),
    # brackets, quotes, commas
    ("(", " ", False),
    (")", " ", False),
    ("'", " ", False),
    (",", " ", False),
    # float numbers and stand-alone two-digit numbers
    (r"[0-9]\.[0-9]", "", True),
    (r" [0-9][0-9] ", " ", True),
    # period
    (".", " ", False),
    # single-character words
    (r"^. ", "", True),
    (r" . ", " ", True),
    (r" .$", "", True),
    # collapse runs of spaces
    (r" {2,}", " ", True),
]

wCategoryMod = df_trans_label_categories["category-mod"]
for wPattern, wReplacement, wIsRegex in wCleanRules:
    wCategoryMod = wCategoryMod.str.replace(wPattern, wReplacement, regex=wIsRegex)
df_trans_label_categories["category-mod"] = wCategoryMod

nltk.download('wordnet')
nltk.download('stopwords')

# for a more complete list of stopwords and in other languages: https://www.ranks.nl/stopwords
stop_words = set(stopwords.words('english'))
wordnet_lemmatize = WordNetLemmatizer()
stemmer = PorterStemmer()

# custom operation
def custom_op(row):
    """Reduce one row's "category-mod" text to a space-separated list of stems.

    The text is split on single spaces; empty tokens and English stop words
    are dropped; every remaining word is lemmatized with WordNet and then
    stemmed with the Porter stemmer; results shorter than two characters are
    discarded.

    Parameters
    ----------
    row : mapping with a string value under "category-mod" (a DataFrame row
        when used through ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The surviving stems joined by single spaces ("" if none survive).
    """
    wStems = [
        stemmer.stem(wordnet_lemmatize.lemmatize(word))
        for word in row["category-mod"].split(" ")
        if word != "" and word not in stop_words
    ]
    return " ".join(word for word in wStems if len(word) >= 2)

df_trans_label_categories["category-mod"] = df_trans_label_categories.apply(custom_op, axis=1)

# preview only - avoid dumping the whole table into the notebook output
df_trans_label_categories.head(10)

[nltk_data] Downloading package wordnet to C:\Users\Wilson
[nltk_data]     Lee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Wilson
[nltk_data]     Lee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,label_id,category,category-mod
1,2,game-game type,game game type
2,3,game-Game themes,game game theme
3,4,game-Art Style,game art style
4,5,game-Leisure time,game leisur time
5,6,game-Cutting things,game cut thing
6,7,game-Finding fault,game find fault
7,8,game-stress reliever,game stress reliev
8,9,game-pet,game pet
9,10,game-Answer,game answer
10,11,game-Fishing,game fish


###### Create word vector

In [17]:
# Vocabulary: every distinct, non-empty token that occurs in any cleaned
# label, in sorted order. A set gives constant-time de-duplication; the
# previous list-membership test rescanned the whole list for every token.
list_of_labels = df_trans_label_categories["category-mod"].tolist()
list_of_words = sorted({
    word
    for label in list_of_labels
    for word in label.split(" ")
    if word != ""
})
print(list_of_words)
print(len(list_of_words))

['2g', '3d', '3g', '4g', '80', '90', 'abroad', 'academ', 'accessori', 'accommod', 'account', 'action', 'activ', 'adventur', 'advertis', 'advic', 'advisori', 'aftermarket', 'age', 'agenc', 'agent', 'aggreg', 'air', 'aircraft', 'airport', 'allianc', 'amount', 'amoy', 'android', 'anim', 'answer', 'antiqu', 'app', 'applianc', 'applic', 'area', 'around', 'arpg', 'art', 'asia', 'astrolog', 'attract', 'audiobook', 'australia', 'austria', 'auto', 'automot', 'aviat', 'avoid', 'babi', 'bag', 'ball', 'bank', 'bar', 'barteahous', 'base', 'basketbal', 'bath', 'bathroom', 'beach', 'beauti', 'bed', 'behalf', 'behavior', 'belgium', 'big', 'billard', 'billiard', 'blog', 'bm', 'board', 'bobbl', 'bond', 'book', 'bookstor', 'box', 'bracelet', 'brand', 'brazil', 'bread', 'brokerag', 'browser', 'bu', 'buffet', 'busi', 'buy', 'cafe', 'calendar', 'camera', 'canada', 'capit', 'car', 'card', 'care', 'carechang', 'cartoon', 'casual', 'categori', 'center', 'chain', 'channel', 'charg', 'chattel', 'checkpoint', 'ch

###### Create "One Hot Encode" for Label Categories

In [23]:
# Build all indicator columns in one vectorised step instead of inserting the
# columns one by one and then setting single cells inside an iterrows() loop.
# str.get_dummies splits every label on spaces and returns one 0/1 integer
# column per token; reindexing fixes the column set and order to
# list_of_words (vocabulary words missing from the data become all-zero).
wOneHot = (
    df_trans_label_categories["category-mod"]
    .str.get_dummies(sep=" ")
    .reindex(columns=list_of_words, fill_value=0)
)

# Any column already carrying a vocabulary name (e.g. from a previous run of
# this cell) is replaced, which matches the old "reset to 0, then fill" logic.
df_trans_label_categories = pd.concat(
    [df_trans_label_categories.drop(columns=list_of_words, errors="ignore"), wOneHot],
    axis=1,
)


In [34]:
# Spot check of the encoding: list the labels whose "europ" indicator is set.
wHasEurop = df_trans_label_categories["europ"] == 1
df_trans_label_categories.loc[wHasEurop]

Unnamed: 0,label_id,category,category-mod,2g,3d,3g,4g,80,90,abroad,...,woman,wonder,word,work,world,wp,xianxia,youth,zombi,zuma
98,99,US and Europe animation,united-st europ anim,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
102,103,US and Europe magic comic,united-st europ magic comic,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
842,930,"Europe, the United States and Macao (aviation)",europ united-st macao aviat,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
858,946,"Europe, the United States and Macao (Travel)",europ united-st macao travel,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
909,1001,Europe and Fantasy,europ fantasi,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Save Data

In [None]:
# Write the transformed label table to disk, leaving out the row index.
wLabelCategoriesOutPath = '../../Data/Processed/label_categories.csv'
df_trans_label_categories.to_csv(wLabelCategoriesOutPath, index=False)

#### Aggregate App Labels

##### Merge Label category with Application

In [40]:
# Left join on label_id: every (app_id, label_id) row is kept and receives the
# category text plus the word-indicator columns of its label. Rows whose
# label_id has no match in df_trans_label_categories get NaN in those columns.
df_agg_app_label = df_app_label.merge(
    df_trans_label_categories,
    how="left",
    on="label_id",
)

Unnamed: 0,app_id,label_id,category,category-mod,2g,3d,3g,4g,80,90,...,woman,wonder,word,work,world,wp,xianxia,youth,zombi,zuma
415409,-3356637000289276446,930,"Europe, the United States and Macao (aviation)",europ united-st macao aviat,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
415410,-4652629963714764957,930,"Europe, the United States and Macao (aviation)",europ united-st macao aviat,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
415411,-1497748835382840297,930,"Europe, the United States and Macao (aviation)",europ united-st macao aviat,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
415412,3111814287760813327,930,"Europe, the United States and Macao (aviation)",europ united-st macao aviat,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
415413,-1137564176672260815,930,"Europe, the United States and Macao (aviation)",europ united-st macao aviat,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
415414,-8354460306333186298,930,"Europe, the United States and Macao (aviation)",europ united-st macao aviat,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
415415,-9032217973058715563,930,"Europe, the United States and Macao (aviation)",europ united-st macao aviat,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
415416,-7823702840720829524,930,"Europe, the United States and Macao (aviation)",europ united-st macao aviat,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
415417,5435107991258418122,930,"Europe, the United States and Macao (aviation)",europ united-st macao aviat,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
415418,7374504920052709763,930,"Europe, the United States and Macao (aviation)",europ united-st macao aviat,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


##### Drop useless columns

In [41]:
# Keep only app_id and the word-indicator columns; the label id and both
# category text columns are removed in a single call.
wColumnsToDrop = ["label_id", "category", "category-mod"]
df_agg_app_label = df_agg_app_label.drop(wColumnsToDrop, axis=1)

##### Aggregate Application word vector

In [44]:
# One row per application: sum every remaining column over all rows that
# share the same app_id (app_id becomes the index of the result).
wPerApp = df_agg_app_label.groupby('app_id')
df_agg_app_label = wPerApp.sum()

##### Normalize word vector

In [52]:
# Turn the per-app sums back into 0/1 flags: 1 where the sum is above 0.5,
# otherwise 0. One vectorised comparison over all vocabulary columns replaces
# a Python-level lambda call per cell; int64 is requested explicitly so the
# resulting dtype does not depend on the platform's default integer size.
df_agg_app_label[list_of_words] = (df_agg_app_label[list_of_words] > 0.5).astype("int64")


In [53]:
# Spot check of the aggregation: list the applications flagged with "xianxia".
wHasXianxia = df_agg_app_label["xianxia"] == 1
df_agg_app_label.loc[wHasXianxia]

Unnamed: 0_level_0,2g,3d,3g,4g,80,90,abroad,academ,accessori,accommod,...,woman,wonder,word,work,world,wp,xianxia,youth,zombi,zuma
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2082160393290296417,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7406605706859482641,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9112463382285835733,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


##### Saving Data

In [None]:
# Write the per-application word vectors to disk; the index (app_id) is kept
# as the first column of the file.
wAppLabelsOutPath = '../../Data/Processed/app_labels.csv'
df_agg_app_label.to_csv(wAppLabelsOutPath, index=True)

#### Aggregate Application Event 

### Merge Data

##### Merge App Label into App Event  Data

In [96]:
# The recorded run of this cell ended in a MemoryError, so memory use is cut
# in two ways:
#  * no defensive copy of df_app_events - merge() already returns a new frame
#    and leaves its inputs untouched;
#  * the word-indicator matrix holds only 0/1 after the normalisation step,
#    so it is downcast to one byte per cell before being repeated per event.
# NOTE(review): app_ids without an entry in df_agg_app_label get NaN from the
# left join, which makes pandas upcast the indicator columns to float64 -
# confirm that every event app_id is covered, or the saving is lost.
wAppLabelMatrix = df_agg_app_label.astype("int8")
df_event_category = df_app_events.merge(wAppLabelMatrix, on='app_id', how='left')
df_event_category.head(10)

MemoryError: 

In [20]:
# Intended result: for every (event_id, app_id, is_active) group, the group's
# "category" strings joined with ' | '.
# NOTE(review): as the notebook stands this cell cannot work. The "category"
# column is dropped from df_agg_app_label in the "Drop useless columns" cell,
# so the merged df_event_category has no such column, and the merge cell
# directly above ended in a MemoryError in the recorded run. The category
# text would have to be joined from the label tables first - confirm intent.
df_event_category.groupby(["event_id","app_id","is_active"])['category'].apply(lambda cat: ' | '.join(cat))

KeyboardInterrupt: 