# Exploratory Data Analysis
###### by Wilson Lee
###### Data Set : China Mobile User Gemographics Source
###### Link : https://www.kaggle.com/chinapage/china-mobile-user-gemographics

#### Business perspective
The objective of this analysis is to determine the viability of using the provided data to find the interests of the given population. With knowledge of the population's interests, it should be possible to generate a list of topics to encourage user app engagement. Ideally, the generated topics would serve as inspiration for application development.

In [1]:
# import libraries
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Render floats with two fixed decimals instead of scientific notation.
def _format_two_decimals(x):
    return "%.2f" % x


pd.set_option("display.float_format", _format_two_decimals)

## Load Cleaned Database

### Phone Data

In [2]:
# Load the device-model count table from the processed-data folder.
df_phone_data = pd.read_csv(
    "../../Data/Processed/device_model_count.csv",
    index_col=None,
)

### Label categories

In [3]:
# Load the label-category table from the processed-data folder.
df_label_categories = pd.read_csv(
    "../../Data/Processed/label_categories.csv",
    index_col=None,
)

### Event Data

In [4]:
# Root folder that is searched (recursively) for the active-app event files.
wDatabaseFolder = "../../Data/Processed/users_with_age/active_apps"

# Parse every file found below the root folder into its own dataframe.
list_of_database = [
    pd.read_csv(os.path.join(wRoot, wFilename), index_col=None)
    for wRoot, _, wFiles in os.walk(wDatabaseFolder)
    for wFilename in wFiles
]

# Stack the per-file frames into a single table with a fresh row index.
df_active_app = pd.concat(list_of_database, axis=0, ignore_index=True)

# Release the references to the per-file frames now that they are merged.
list_of_database = []

## Preview Data

### Phone Data

In [None]:
# Bare expression: the notebook renders the phone data frame as the cell output.
df_phone_data

Unnamed: 0,phone_brand,device_model,count
0,Cool ratio,H1,5
1,Cool ratio,H1S,2
2,Cool ratio,H2,107
3,Cool ratio,H6,2
4,Cool ratio,M1,19
5,Cool ratio,MUSE,2
6,Cool ratio,S1,14
7,Cool ratio,S2,196
8,Cool ratio,S3,95
9,Cool ratio,S6,26


### Label Categories

In [None]:
# Bare expression: the notebook renders the label-category frame as the cell output.
df_label_categories

Unnamed: 0,label_id,category,category-mod,3d,80,90,abroad,academic,accommodation,accounting,...,weibo,weight,west,western,wifi,word,world,xianxia,zombie,zuma
0,2,game-game type,game game type,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,game-Game themes,game game theme,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,game-Art Style,game art style,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,game-Leisure time,game leisure time,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,game-Cutting things,game cutting thing,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,7,game-Finding fault,game finding fault,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,8,game-stress reliever,game stress reliever,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,9,game-pet,game pet,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,10,game-Answer,game answer,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,11,game-Fishing,game fishing,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Event Data

In [None]:
# Bare expression: the notebook renders the combined event frame as the cell output.
df_active_app

Unnamed: 0,event_id,is_active,3d,80,90,abroad,academic,accommodation,accounting,action,...,device_id,gender,age,group_y,phone_brand,device_model,longitude,latitude,hour,day_of_week
0,6,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1476664663289716375,M,19,M22-,huawei,Mate 7,110.01,30.49,0,6
1,29,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,7166563712658305181,M,60,M39+,huawei,荣耀畅玩4C,117.96,28.47,0,6
2,35,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,-3449419341168524142,M,28,M27-28,huawei,荣耀3X畅玩版,110.01,30.49,0,6
3,39,1,0.00,0.00,0.00,0.00,0.00,0.00,0.10,0.00,...,-6542093539413689868,M,26,M23-26,huawei,荣耀畅玩4X,110.01,30.49,0,6
4,40,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,-5638521260975573107,M,22,M22-,huawei,荣耀7,110.01,30.49,0,6
5,44,1,0.00,0.00,0.00,0.00,0.00,0.00,0.08,0.00,...,-4713356591613805069,M,25,M23-26,xiaomi,红米2,113.37,28.25,0,6
6,54,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,-7868922663453980926,F,27,F27-28,huawei,G660-L075,113.11,23.04,23,5
7,61,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,-1593861387409811850,M,24,M23-26,xiaomi,红米Note3,110.40,25.31,0,6
8,70,1,0.00,0.00,0.00,0.00,0.00,0.00,0.14,0.00,...,8250516622760332376,M,28,M27-28,xiaomi,MI 3,121.68,31.12,0,6
9,82,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,3230070203366080618,M,35,M32-38,huawei,Mate 7,110.01,30.49,0,6


## Normalize Event Data

### Check Individual User Events per Hour

In [None]:
# Count how many event rows each device has in every (day_of_week, hour) slot.
# The result is indexed by (device_id, day_of_week, hour) with one "count" column.
df_events_per_user_per_hour = (
    df_active_app[["device_id", "day_of_week", "hour"]]
    .groupby(["device_id", "day_of_week", "hour"])
    .size()
    .to_frame("count")
)
df_events_per_user_per_hour

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
device_id,day_of_week,hour,Unnamed: 3_level_1
-9222956879900151005,4,15,6
-9222956879900151005,4,20,4
-9222956879900151005,4,21,3
-9222956879900151005,5,7,1
-9222956879900151005,5,11,5
-9222956879900151005,5,12,4
-9222956879900151005,5,13,3
-9222956879900151005,5,14,3
-9222956879900151005,5,15,2
-9222956879900151005,5,23,2


### Reduce the Data to 1 Event Per User per Hour

In [None]:
# Collapse repeated events so each user keeps a single averaged row per
# (day_of_week, hour) slot. event_id is removed first so it is not averaged.
hourly_group_keys = [
    "device_id", "gender", "age", "group_y",
    "phone_brand", "device_model", "day_of_week", "hour",
]
df_normalized_active_app = (
    df_active_app.drop(columns="event_id")
    .groupby(hourly_group_keys)
    .mean()
    .reset_index()
)
df_normalized_active_app

### Get Individual User Interest Profile

In [None]:
# Average each user's hourly rows into a single profile row per user; the time
# columns are removed first so they are not averaged along with the rest.
profile_group_keys = [
    "device_id", "gender", "age", "group_y", "phone_brand", "device_model",
]
df_user_profile = (
    df_normalized_active_app.drop(columns=["day_of_week", "hour"])
    .groupby(profile_group_keys)
    .mean()
    .reset_index()
)
df_user_profile

## Separate Events by Gender

In [None]:
# Row masks selecting each gender in the hourly event table.
is_male_event = df_normalized_active_app["gender"] == "M"
is_female_event = df_normalized_active_app["gender"] == "F"
df_male_users_active_app = df_normalized_active_app[is_male_event]
df_female_users_active_app = df_normalized_active_app[is_female_event]

# Same split applied to the per-user profile table.
is_male_profile = df_user_profile["gender"] == "M"
is_female_profile = df_user_profile["gender"] == "F"
df_male_users_profile = df_user_profile[is_male_profile]
df_female_users_profile = df_user_profile[is_female_profile]

## Check Data distribution in time

### WeekDay

In [None]:
# Number of distinct devices per gender. Computed once here instead of inside
# a per-row lambda, where it was re-evaluated for every row of the count table.
male_user_total = df_male_users_active_app["device_id"].nunique()
female_user_total = df_female_users_active_app["device_id"].nunique()

# Rows per weekday for male users, plus the per-user average.
# rename_axis/reset_index(name=...) names the columns explicitly, so the result
# does not depend on how the installed pandas labels value_counts() output.
df_weekDayCount = (
    df_male_users_active_app["day_of_week"]
    .value_counts()
    .rename_axis("day_of_week")
    .reset_index(name="male_count")
)
df_weekDayCount["male_average"] = df_weekDayCount["male_count"] / male_user_total

# Same table for female users.
df_weekDayCount_2 = (
    df_female_users_active_app["day_of_week"]
    .value_counts()
    .rename_axis("day_of_week")
    .reset_index(name="female_count")
)
df_weekDayCount_2["female_average"] = df_weekDayCount_2["female_count"] / female_user_total

# Left join keeps every weekday present in the male table; sort for plotting.
df_weekDayCount = df_weekDayCount.merge(df_weekDayCount_2, on="day_of_week", how="left")
df_weekDayCount = df_weekDayCount.sort_values("day_of_week")
df_weekDayCount

In [None]:
# Plot the male and female per-user averages against the day of week.
plt.figure(figsize=(12, 6))
weekday_values = df_weekDayCount["day_of_week"]
for gender_label, line_color in (("male", "green"), ("female", "red")):
    plt.plot(
        weekday_values,
        df_weekDayCount[f"{gender_label}_average"],
        "o-",
        color=line_color,
        label=gender_label,
    )
plt.legend(loc="upper right")
plt.xlim([-1, 7])
plt.ylim([0, 3])
plt.show()

### Hour of Day

In [None]:
# Number of distinct devices per gender. Computed once here instead of inside
# a per-row lambda, where it was re-evaluated for every row of the count table.
male_user_total = df_male_users_active_app["device_id"].nunique()
female_user_total = df_female_users_active_app["device_id"].nunique()

# Rows per hour of day for male users, plus the per-user average.
# rename_axis/reset_index(name=...) names the columns explicitly, so the result
# does not depend on how the installed pandas labels value_counts() output.
df_HourOfDayCount = (
    df_male_users_active_app["hour"]
    .value_counts()
    .rename_axis("hour")
    .reset_index(name="male_count")
)
df_HourOfDayCount["male_average"] = df_HourOfDayCount["male_count"] / male_user_total

# Same table for female users.
df_HourOfDayCount_2 = (
    df_female_users_active_app["hour"]
    .value_counts()
    .rename_axis("hour")
    .reset_index(name="female_count")
)
df_HourOfDayCount_2["female_average"] = df_HourOfDayCount_2["female_count"] / female_user_total

# Left join keeps every hour present in the male table; sort for plotting.
df_HourOfDayCount = df_HourOfDayCount.merge(df_HourOfDayCount_2, on="hour", how="left")
df_HourOfDayCount = df_HourOfDayCount.sort_values("hour")
df_HourOfDayCount

In [None]:
# Plot the male and female per-user averages against the hour of day.
plt.figure(figsize=(12, 6))
hour_values = df_HourOfDayCount["hour"]
for gender_label, line_color in (("male", "green"), ("female", "red")):
    plt.plot(
        hour_values,
        df_HourOfDayCount[f"{gender_label}_average"],
        "-o",
        color=line_color,
        label=gender_label,
    )
plt.legend(loc="upper right")
plt.xlim([-1, 24])
plt.ylim([0, 1])
plt.show()


### Hour of Week

In [None]:
# Hour-of-week index: day_of_week * 24 + hour. Computed with vectorized column
# arithmetic rather than a row-by-row apply(axis=1), which is far slower on
# large frames and yields the same values.
df_hour_dist_male = df_male_users_active_app[["day_of_week", "hour"]].copy()
df_hour_dist_male["Dist_24_7"] = (
    df_hour_dist_male["day_of_week"] * 24 + df_hour_dist_male["hour"]
)

df_hour_dist_female = df_female_users_active_app[["day_of_week", "hour"]].copy()
df_hour_dist_female["Dist_24_7"] = (
    df_hour_dist_female["day_of_week"] * 24 + df_hour_dist_female["hour"]
)


In [None]:
# Number of distinct devices per gender. Computed once here instead of inside
# a per-row lambda, where it was re-evaluated for every row of the count table.
male_user_total = df_male_users_active_app["device_id"].nunique()
female_user_total = df_female_users_active_app["device_id"].nunique()

# Rows per hour-of-week slot for male users, plus the per-user average.
# rename_axis/reset_index(name=...) names the columns explicitly, so the result
# does not depend on how the installed pandas labels value_counts() output.
df_24_7 = (
    df_hour_dist_male["Dist_24_7"]
    .value_counts()
    .rename_axis("Dist_24_7")
    .reset_index(name="male_24_7_count")
)
df_24_7["male_average"] = df_24_7["male_24_7_count"] / male_user_total

# Same table for female users.
df_24_7_2 = (
    df_hour_dist_female["Dist_24_7"]
    .value_counts()
    .rename_axis("Dist_24_7")
    .reset_index(name="female_24_7_count")
)
df_24_7_2["female_average"] = df_24_7_2["female_24_7_count"] / female_user_total

# Left join keeps every slot present in the male table; sort for plotting.
df_24_7 = df_24_7.merge(df_24_7_2, on="Dist_24_7", how="left")
df_24_7 = df_24_7.sort_values("Dist_24_7")
df_24_7

In [None]:
# Plot the male and female per-user averages against the hour-of-week index.
plt.figure(figsize=(12, 6))
slot_values = df_24_7["Dist_24_7"]
for gender_label, line_color in (("male", "green"), ("female", "red")):
    plt.plot(
        slot_values,
        df_24_7[f"{gender_label}_average"],
        "-",
        color=line_color,
        label=gender_label,
    )
plt.legend(loc="upper right")
plt.xlim([-1, 168])
plt.ylim([0, 0.25])
plt.show()

### Week day vs Hour plot

In [None]:
# Count rows in every (day_of_week, hour) slot, separately per gender.
# size() counts rows directly, so this does not rely on the source frame
# having exactly one non-key column to rename.
df_hour_dist_male_count = (
    df_hour_dist_male.groupby(by=["day_of_week", "hour"]).size().reset_index(name="count")
)
df_hour_dist_female_count = (
    df_hour_dist_female.groupby(by=["day_of_week", "hour"]).size().reset_index(name="count")
)

# Shared normalizer for marker opacity. It must be the largest count over BOTH
# genders; the second maximum was previously read from the male table again,
# so a busier female slot would have produced an alpha above 1.
max_user_at_hour = df_hour_dist_male_count["count"].max()
max_user_at_hour2 = df_hour_dist_female_count["count"].max()
max_user_at_hour = max(max_user_at_hour, max_user_at_hour2)

# One marker per slot; opacity is the slot's count relative to the shared maximum.
plt.figure(figsize=(12,6))
for index, row in df_hour_dist_male_count.iterrows():
    plt.plot(row['day_of_week'], row['hour'], 'o', color='green', alpha=row["count"]/max_user_at_hour)
for index, row in df_hour_dist_female_count.iterrows():
    # Female markers are shifted right by 0.3 so they do not cover the male ones.
    plt.plot(row['day_of_week'] + 0.3, row['hour'], 'o', color='red', alpha=row["count"]/max_user_at_hour)
plt.show()


## User Age Distribution

In [None]:
# Side-by-side age histograms: the female ages and bin edges are shifted by one
# bar width so the female bars are drawn next to the male bars of each decade.
bar_width = 3
male_bin_edges = list(range(0, 100, 10))
female_bin_edges = [edge + bar_width for edge in male_bin_edges]

plt.figure(figsize=(10, 6))
plt.xlim([0, 100])

plt.hist(
    df_male_users_profile["age"],
    bins=male_bin_edges,
    width=bar_width,
    color="green",
    alpha=0.5,
    label="male",
)
plt.hist(
    df_female_users_profile["age"] + bar_width,
    bins=female_bin_edges,
    width=bar_width,
    color="red",
    alpha=0.5,
    label="female",
)
plt.legend(loc="upper right")
plt.show()


## Device Brand Distribution

### Number of Users per Phone Brand

In [None]:
# Sum the per-model counts into one total per brand, then keep the 20 brands
# with the largest totals (ascending sort, so the largest ends up last).
df_brand_count = (
    df_phone_data.groupby(["phone_brand"])["count"]
    .sum()
    .reset_index()
    .sort_values(by=["count"])
    .tail(20)
)

plt.figure(figsize=(12, 10))
brand_names = df_brand_count["phone_brand"]
brand_totals = df_brand_count["count"]
plt.barh(brand_names, brand_totals, color="blue", orientation="horizontal")
plt.show()

#### Male Users Phone Device Brand

In [None]:
# One very faint marker per male user at (age, phone brand); overlapping
# markers add up, so denser regions appear darker.
plt.figure(figsize=(12, 10))
male_ages = df_male_users_profile["age"]
male_brands = df_male_users_profile["phone_brand"]
plt.plot(male_ages, male_brands, "o", color="green", alpha=0.005, label="male")
plt.show()

#### Female Users Phone Device Brand

In [None]:
# One very faint marker per female user at (age, phone brand); overlapping
# markers add up, so denser regions appear darker.
plt.figure(figsize=(12, 10))
female_ages = df_female_users_profile["age"]
female_brands = df_female_users_profile["phone_brand"]
plt.plot(female_ages, female_brands, "o", color="red", alpha=0.005, label="female")
plt.show()

## Event Interest Vector

### Extract Vector Columns

In [None]:
# Columns that describe the event, user, device, location or time rather than
# an interest label.
list_exclude_columns = [ "event_id","is_active","device_id","gender","age","group_y","phone_brand","device_model","longitude","latitude","hour","day_of_week"]

# Every remaining column of the event table, in its original order, is treated
# as one component of the interest vector.
list_interest_vector = [
    column_name
    for column_name in df_active_app.columns
    if column_name not in list_exclude_columns
]

list_interest_vector

### Extract interest vector and time

#### Male Users

In [None]:
# Mean of every interest column per (day_of_week, hour) slot for male users;
# after reset_index the row index is the running position of each slot.
df_male_interest_vs_time = (
    df_male_users_active_app.groupby(["day_of_week", "hour"])[list_interest_vector]
    .mean()
    .reset_index()
)

# Overlay one faint line per interest column against the slot position.
plt.figure(figsize=(12, 6))
male_slot_positions = df_male_interest_vs_time.index
for interest_name in list_interest_vector:
    plt.plot(
        male_slot_positions,
        df_male_interest_vs_time[interest_name],
        "-",
        color="green",
        alpha=0.1,
    )
plt.show()

#### Female Users

In [None]:
# Mean of every interest column per (day_of_week, hour) slot for female users;
# after reset_index the row index is the running position of each slot.
df_female_interest_vs_time = (
    df_female_users_active_app.groupby(["day_of_week", "hour"])[list_interest_vector]
    .mean()
    .reset_index()
)

# Overlay one faint line per interest column against the slot position.
plt.figure(figsize=(12, 6))
female_slot_positions = df_female_interest_vs_time.index
for interest_name in list_interest_vector:
    plt.plot(
        female_slot_positions,
        df_female_interest_vs_time[interest_name],
        "-",
        color="red",
        alpha=0.1,
    )
plt.show()

#### Finding the Most Interested Labels

##### Male interest

In [None]:
# Add up each interest column over all time slots (the "count" column holds
# these sums) and keep the 20 labels with the largest totals.
male_interest_totals = df_male_interest_vs_time.drop(columns=["day_of_week", "hour"]).sum()
df_agg_male_interest = (
    male_interest_totals.to_frame("count")
    .sort_values(by=["count"], ascending=False)
    .head(20)
)

plt.figure(figsize=(12, 10))
plt.barh(df_agg_male_interest.index, df_agg_male_interest["count"], color="green")
plt.show()


##### Female Interest

In [None]:
# Add up each interest column over all time slots (the "count" column holds
# these sums) and keep the 20 labels with the largest totals.
female_interest_totals = df_female_interest_vs_time.drop(columns=["day_of_week", "hour"]).sum()
df_agg_female_interest = (
    female_interest_totals.to_frame("count")
    .sort_values(by=["count"], ascending=False)
    .head(20)
)

plt.figure(figsize=(12, 10))
plt.barh(df_agg_female_interest.index, df_agg_female_interest["count"], color="red")
plt.show()

#### Converting Interest to category

In [None]:
# top Interest [ "property", "industry"]
df_categories_of_interest = df_label_categories[(df_label_categories["property"] == 1)|(df_label_categories["industry"] == 1)]
df_categories_of_interest = df_categories_of_interest.sort_values(by=[ "property", "industry"],ascending=False)
df_categories_of_interest