In [1]:
from mpl_toolkits.basemap import Basemap
from pycm import ConfusionMatrix
from scipy import stats
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import dask.dataframe as dd
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy.stats as stats

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

sns.set()

In [2]:
app_events = dd.read_csv('talkingdata/app_events.csv')
app_labels = dd.read_csv('talkingdata/app_labels.csv')
events = dd.read_csv('talkingdata/events.csv')
train = dd.read_csv('talkingdata/gender_age_train.csv')
label_categories = dd.read_csv('talkingdata/label_categories.csv')
phone_brand_model = dd.read_csv('talkingdata/phone_brand_device_model.csv')

In [3]:
# ---------------------------------------------------------------------
# CREATE NEW DATAFRAME THAT CONTAINS TOTAL NUMBER OF APPS PER DEVICE  |
# --------------------------------------------------------------------

num_apps = events.groupby('device_id').first().reset_index().compute()
num_apps = dd.merge(num_apps, app_events[['event_id', 'app_id']], on = 'event_id').compute()

num_apps = num_apps.groupby('device_id')['app_id'].count().reset_index()
num_apps = dd.merge(num_apps, phone_brand_model, on = 'device_id', how = 'inner') 

num_apps = num_apps.rename(columns={'app_id': 'app_count'})

num_apps.head()

Unnamed: 0,device_id,app_count,phone_brand,device_model
0,-9222956879900151005,68,三星,Galaxy Note 2
1,-9222661944218806987,10,vivo,Y913
2,-9222399302879214035,43,小米,MI 3
3,-9221767098072603291,25,金立,GN151
4,-9221079146476055829,12,小米,MI 3


In [4]:
# ------------------------------------------------------------------------------
# CREATE ANOTHER DATAFRAME WITH GENDER INFO (GET FROM TRAINING SET DATAFRAME)  |
# -----------------------------------------------------------------------------

gender_info = dd.merge(train, num_apps, on = 'device_id', how = 'inner')
gender_info = gender_info.rename(columns={'index': 'device_id'}).compute()

gender_info.head()

Unnamed: 0,device_id,gender,age,group,app_count,phone_brand,device_model
0,-8260683887967679142,M,35,M32-38,53,小米,MI 2
1,7477216237379271436,F,37,F33-42,26,华为,荣耀6 plus
2,6352067998666467520,M,32,M32-38,19,华为,荣耀畅玩4X
3,8026504930081700361,M,25,M23-26,31,小米,MI 4
4,-7271319853104672050,M,27,M27-28,34,三星,Galaxy Note 3


## Machine learning

In [5]:
# ------------------------------------------------------------------------
# GETTING DATASET IN DESIRED FORMAT IN PREPARATION FOR MACHINE LEARNING  |
# -----------------------------------------------------------------------

X = gender_info.copy()
X['phone'] = X.phone_brand + ' ' + X.device_model

# ----------------------------------------
# CONVERT GENDER/AGE CATEGORY TO NUMBER  |
# ---------------------------------------

X['group'] = pd.Categorical(X.group)
X['group_code'] = X.group.cat.codes
X = X.drop(['gender', 'age', 'group', 'phone_brand', 'device_model'], axis = 1)

X.head()

Unnamed: 0,device_id,app_count,phone,group_code
0,-8260683887967679142,53,小米 MI 2,10
1,7477216237379271436,26,华为 荣耀6 plus,4
2,6352067998666467520,19,华为 荣耀畅玩4X,10
3,8026504930081700361,31,小米 MI 4,7
4,-7271319853104672050,34,三星 Galaxy Note 3,8


The goal is to predict the age and gender group of a mobile user. In this attempt, 'phone_brand' and 'device_model' are combined into a single column. This new column, 'phone', and 'app_count' are used to predict the demographic groups of users.

In [6]:
# -------------------------------------------------------------------------------------
# INITIALIZE RANDOM FOREST CLASSIFER & FEATURE HASH (TO ENCODE CATEGORICAL FEATURES)  |
# ------------------------------------------------------------------------------------

rfc = RandomForestClassifier()
h = FeatureHasher(n_features = 890, input_type = 'string')

X_predictor = X.drop(['device_id', 'group_code'], axis = 1)
X_target = X.group_code

# ------------------
# HASHING 'phone'  |
# -----------------

phones = X.phone
phones_hash = h.transform(phones.values)
phones_hash = pd.DataFrame(phones_hash.toarray())
X_predictor = pd.concat([X_predictor, phones_hash], axis=1)
X_predictor = X_predictor.drop(['phone'], axis = 1)
X_predictor.head()

Unnamed: 0,app_count,0,1,2,3,4,5,6,7,8,...,880,881,882,883,884,885,886,887,888,889
0,53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# --------------------------------------------------
# SPLIT INTO TRAINING/TEST SETS & FIT AND PREDICT  |
# -------------------------------------------------

X_train, X_test, y_train, y_test = train_test_split(X_predictor, X_target, test_size = 0.3, random_state = 42)

rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# -------------------
# PRINTING RESULTS  |
# ------------------

print('RANDOM FOREST CLASSIFIER')
print('------------------------')
print('F1 score:', f1_score(y_test, y_pred, average = 'micro'))

RANDOM FOREST CLASSIFIER
------------------------
F1 score: 0.1044042469524184


This F1 score is only 0.1, indicating the model does not predict the demographic group well.

**Note:** A feature that *should* be useful in enhancing the model's performance is the category/categories of apps. As an example, females are highly likely to have more beauty apps installed than males. However, I currently do not have experience with dealing with multi-label features or NLP, which would be helpful in handling the 900+ app categories (836 of which are unique values). This project will be revisited and updated as I gain more knowledge in dealing with text data.