### In this approach, first we are going to build a classification model to predict the gender. Then we are going to split the data in two groups, Males and Females according to the predictions. Then we are going to build seperate neural networks to predict the probabilites of Males group and Females Group

# Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics.classification import log_loss
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib as jobl
from tqdm import tqdm
from joblib import dump
from scipy import sparse
import numpy as np
from scipy.integrate import simps
from numpy import trapz
from tqdm import tqdm
%matplotlib inline



In [2]:
# Reading the data
train_data = pd.read_csv('Data/gender_age_train.csv',index_col='device_id')
test_data = pd.read_csv('Data/gender_age_test.csv',index_col='device_id')
phone_data = pd.read_csv('Data/phone_brand_device_model.csv',encoding='utf-8')
# Get rid of duplicate device ids in phone
phone_data = phone_data.drop_duplicates('device_id',keep='first').set_index('device_id') 
label_categories = pd.read_csv('Data/label_categories.csv')
app_labels = pd.read_csv('Data/app_labels.csv')
events = pd.read_csv('Data/events.csv',parse_dates=['timestamp'], index_col='event_id')
app_events = pd.read_csv('Data/app_events.csv',usecols=['event_id','app_id','is_active'],dtype={'is_active':bool})

In [3]:
train_data['trainrow'] = np.arange(train_data.shape[0])
test_data['testrow'] = np.arange(test_data.shape[0])

## One Hot Encoding of Phone Brands

In [4]:
# Encoding the brands using LabelEncoder
# We could have used OneHotEncoder Directly too but OneHotEncoder Transfrom function returns a sparse matrix 
# and adding that to the dataframe (as we had to add this column to the train data) would had un-necessarily increased the space
enc_brand = LabelEncoder().fit(phone_data['phone_brand'])
phone_data['enc_brand'] = enc_brand.transform(phone_data['phone_brand'])

In [5]:
# As both of these dataframes have the same indices, the encoded brands will get assigned to the specific device_ids
train_data['brand'] = phone_data['enc_brand']
test_data['brand'] = phone_data['enc_brand']

In [6]:
# Getting the csr matrix from the encoded labels
Xtr_brand = csr_matrix((np.ones(train_data.shape[0]), 
                       (train_data.trainrow, train_data.brand)))
Xte_brand = csr_matrix((np.ones(test_data.shape[0]), 
                       (test_data.testrow, test_data.brand)))
print('The shape of Xtr_brand_no_events is', Xtr_brand.shape, 'the shape of Xte_brand_no_events is', Xte_brand.shape)

The shape of Xtr_brand_no_events is (74645, 131) the shape of Xte_brand_no_events is (112071, 131)


## One Hot Encoding of Device Models

In [7]:
# Encoding the device models using LabelEncoder same way as done for Brands
enc_model = LabelEncoder().fit(phone_data['device_model'])
phone_data['enc_model'] = enc_model.transform(phone_data['device_model'])

In [8]:
# As both of these dataframes have the same indices, the encoded brands will get assigned to the specific device_ids
train_data['model'] = phone_data['enc_model']
test_data['model'] = phone_data['enc_model']

In [9]:
# Getting the csr matrix from the encoded labels
Xtr_model = csr_matrix((np.ones(train_data.shape[0]), 
                       (train_data.trainrow, train_data.model)))
Xte_model = csr_matrix((np.ones(test_data.shape[0]), 
                       (test_data.testrow, test_data.model)))
print('The shape of Xtr_model_data is', Xtr_model.shape, 'the shape of Xte_model_data is', Xte_model.shape)

The shape of Xtr_model_data is (74645, 1599) the shape of Xte_model_data is (112071, 1599)


## One Hot Encoding for app_id

In [10]:
# For each device, we have to mark the apps that are installed in that device.  
# First, we will encode all the app_ids to integers using LabelEncoder. 
enc_apps = LabelEncoder().fit(app_events['app_id'])
app_events['enc_app'] = enc_apps.transform(app_events['app_id'])

# Then we merge the app_events with device_id column from the events dataframe grouping by device_id and encoded app_ids.
# Finally we merge it with trainrow and testrow columns to understand at which row we should put each device in features matrix.
deviceapps = (app_events.merge(events[['device_id']], how='left',left_on='event_id',right_index=True)
                       .groupby(['device_id','enc_app'])['enc_app'].agg(['size'])
                       .merge(train_data[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(test_data[['testrow']], how='left', left_index=True, right_index=True)
                       .reset_index())
deviceapps

Unnamed: 0,device_id,enc_app,size,trainrow,testrow
0,-9222956879900151005,548,18,21594.0,
1,-9222956879900151005,1096,18,21594.0,
2,-9222956879900151005,1248,26,21594.0,
3,-9222956879900151005,1545,12,21594.0,
4,-9222956879900151005,1664,18,21594.0,
...,...,...,...,...,...
2369020,9222539910510672930,17358,1,,82667.0
2369021,9222539910510672930,17587,1,,82667.0
2369022,9222539910510672930,18039,1,,82667.0
2369023,9222539910510672930,18686,1,,82667.0


In [11]:
# Getting the number of classes
napps = len(enc_apps.classes_)

# Getting the csr matrix for Apps
d = deviceapps.dropna(subset=['trainrow'])
Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.enc_app)), 
                      shape=(train_data.shape[0],napps))
d = deviceapps.dropna(subset=['testrow'])
Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.enc_app)), 
                      shape=(test_data.shape[0],napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


## One Hot Encoding for app_labels 

In [12]:
app_labels = app_labels.loc[app_labels.app_id.isin(app_events.app_id.unique())]
app_labels['enc_app'] = enc_apps.transform(app_labels.app_id)

# Encoding the labels
enc_labels = LabelEncoder().fit(app_labels['label_id'])
app_labels['enc_label'] = enc_labels.transform(app_labels['label_id'])

In [13]:
nlabels = len(enc_labels.classes_)

devicelabels = (deviceapps[['device_id','enc_app']]
                .merge(app_labels[['enc_app','enc_label']])
                .groupby(['device_id','enc_label'])['enc_app'].agg(['size'])
                .merge(train_data[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(test_data[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())

devicelabels.head()

Unnamed: 0,device_id,enc_label,size,trainrow,testrow
0,-9222956879900151005,117,1,21594.0,
1,-9222956879900151005,120,1,21594.0,
2,-9222956879900151005,126,1,21594.0,
3,-9222956879900151005,138,2,21594.0,
4,-9222956879900151005,147,2,21594.0,


In [14]:
d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.enc_label)), 
                      shape=(train_data.shape[0],nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.enc_label)), 
                      shape=(test_data.shape[0],nlabels))
print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))

Labels data: train shape (74645, 492), test shape (112071, 492)


# Events

## HourBins Data Preparation

In [15]:
def binhour(x):
    '''
    This function returns the phase of the day at which the event occurred.
    So it will be easier to count the number of events that occurred during a specific phase of the day.
    '''
    if x < 5:
        return "midnight"
    elif x < 8:
        return "early_morning"
    elif x < 20:
        return "daytime"
    elif x< 23:
        return "night"
    else:
        return "midnight"

In [16]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timestamp.html
# Extracting the hour at which a specific event occurred to put it in phase
events['hour'] = events['timestamp'].apply(lambda x : x.hour)
# Appling the above function to get the phase
events['hour_bin'] = events['hour'].apply(binhour)

In [17]:
# Grouping the data by device_id and getting the information of all the phases at which the event of that device occurred
l_hours_cnt = events.groupby('device_id')['hour_bin'].apply(lambda x: " ".join(s for s in x)).reset_index()

midnight_counts = []
daytime_counts = []
early_morning_counts = []
night_counts = []

# Counting the events which occurred at the specific phases from l_hours_count
for i in range(len(l_hours_cnt)):
    lis = l_hours_cnt['hour_bin'][i].split(' ')
    midnight_counts.append(lis.count('midnight'))
    daytime_counts.append(lis.count('daytime'))
    early_morning_counts.append(lis.count('early_morning'))
    night_counts.append(lis.count('night'))

# Adding the counts column in the data
l_hours_cnt['midnight_counts'] = midnight_counts
l_hours_cnt['daytime_counts'] = daytime_counts
l_hours_cnt['early_morning_counts'] = early_morning_counts
l_hours_cnt['night_counts'] = night_counts

In [18]:
l_hours_cnt.head()

Unnamed: 0,device_id,hour_bin,midnight_counts,daytime_counts,early_morning_counts,night_counts
0,-9222956879900151005,daytime daytime daytime daytime daytime night ...,3,50,2,10
1,-9222661944218806987,night daytime night daytime daytime daytime mi...,1,5,0,2
2,-9222399302879214035,daytime daytime midnight night daytime midnigh...,3,6,0,1
3,-9221825537663503111,early_morning early_morning early_morning dayt...,0,73,22,4
4,-9221767098072603291,early_morning daytime daytime daytime daytime ...,0,5,3,0


In [19]:
hourevents = events.groupby("device_id")["hour"].apply(lambda x: " ".join('0'+str(s) for s in x))
hourevents = hourevents.reset_index().set_index('device_id')

## DaysOfTheWeek Data Preparation

In [20]:
def day_of_week(x):
    '''
    This function returns the day of the week at which the event occurred.
    So it will be easier to count the number of events that occurred during a specific day of the day.
    '''  
    if(x==0):
        return "Monday"
    elif(x==1):
        return "Tuesday"
    elif(x==2):
        return "Wednesday"
    elif(x==3):
        return "Thursday"
    elif(x==4):
        return "Friday"
    elif(x==5):
        return "Saturday"
    elif(x==6):
        return "Sunday"

In [21]:
# Getting the day at which a specific event occurred from the timestamp object
events['dayofweek'] = events['timestamp'].apply(lambda x : x.dayofweek)
# Applying the function to get the day in words
events['day'] = events['dayofweek'].apply(day_of_week)

In [22]:
# Grouping events data according to device_ids and getting all the days at which that particular device performed that events
l_days_cnt = events.groupby('device_id')['day'].apply(lambda x: " ".join(s for s in x)).reset_index()

monday_counts = []
tuesday_counts = []
wednesday_counts = []
thursday_counts = []
friday_counts = []
saturday_counts = []
sunday_counts = []

# Getting the counts of events that occurred on a specific data
for i in range(len(l_hours_cnt)):
    lis = l_days_cnt['day'][i].split(' ')
    monday_counts.append(lis.count('Monday'))
    tuesday_counts.append(lis.count('Tuesday'))
    wednesday_counts.append(lis.count('Wednesday'))
    thursday_counts.append(lis.count('Thursday'))
    friday_counts.append(lis.count('Friday'))
    saturday_counts.append(lis.count('Saturday'))
    sunday_counts.append(lis.count('Sunday'))

# Adding the counts column in the dataframe
l_days_cnt['monday_counts'] = monday_counts
l_days_cnt['tuesday_counts'] = tuesday_counts
l_days_cnt['wednesday_counts'] = wednesday_counts
l_days_cnt['thursday_counts'] = thursday_counts
l_days_cnt['friday_counts'] = friday_counts
l_days_cnt['saturday_counts'] = saturday_counts
l_days_cnt['sunday_counts'] = sunday_counts

In [23]:
# Setting the index as device is so it will be easier to merge the data
l_days_cnt = l_days_cnt.set_index('device_id')
l_hours_cnt = l_hours_cnt.set_index('device_id')
events = events.set_index('device_id')

In [24]:
# Merging all the data to get the counts that were extracted above
events_count = events.merge(l_days_cnt, on='device_id').\
                merge(l_hours_cnt, on='device_id').\
                drop(['timestamp','longitude','latitude','hour','dayofweek','day_x','day_y','hour_bin_x','hour_bin_y'],axis=1).\
                drop_duplicates().\
                merge(train_data[['trainrow']], how='left', left_index=True, right_index=True).\
                merge(test_data[['testrow']], how='left', left_index=True, right_index=True).\
                reset_index().set_index('device_id')

In [25]:
events_count

Unnamed: 0_level_0,monday_counts,tuesday_counts,wednesday_counts,thursday_counts,friday_counts,saturday_counts,sunday_counts,midnight_counts,daytime_counts,early_morning_counts,night_counts,trainrow,testrow
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
29182687948017175,53,20,20,21,55,25,62,76,86,68,26,58469.0,
-6401643145415154744,12,11,6,5,0,0,39,17,49,1,6,,68691.0
-4833982096941402721,42,39,46,35,28,16,42,21,185,10,32,7337.0,
-6815121365017318426,0,0,0,0,0,1,46,30,9,8,0,9287.0,
-5373797595892518570,26,8,203,118,3,65,102,96,341,47,41,41396.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
-6214565785039593168,1,1,1,0,0,1,1,0,4,0,1,,56726.0
1530011109762458400,0,0,0,0,1,2,0,1,0,0,2,,21397.0
2659296957641269347,0,0,1,0,1,0,0,1,0,1,0,21441.0,
-7567402485735900406,1,0,1,0,0,0,0,1,0,1,0,66060.0,


In [26]:
# As both of these eventsframes have the same indices, the encoded brands will get assigned to the specific device_ids
# Adding the counts columns to train and test data
train_data['monday_counts'] = events_count['monday_counts']
test_data['monday_counts'] = events_count['monday_counts']

train_data['tuesday_counts'] = events_count['tuesday_counts']
test_data['tuesday_counts'] = events_count['tuesday_counts']

train_data['wednesday_counts'] = events_count['wednesday_counts']
test_data['wednesday_counts'] = events_count['wednesday_counts']

train_data['thursday_counts'] = events_count['thursday_counts']
test_data['thursday_counts'] = events_count['thursday_counts']

train_data['friday_counts'] = events_count['friday_counts']
test_data['friday_counts'] = events_count['friday_counts']

train_data['saturday_counts'] = events_count['saturday_counts']
test_data['saturday_counts'] = events_count['saturday_counts']

train_data['sunday_counts'] = events_count['sunday_counts']
test_data['sunday_counts'] = events_count['sunday_counts']

train_data['midnight_counts'] = events_count['midnight_counts']
test_data['midnight_counts'] = events_count['midnight_counts']

train_data['daytime_counts'] = events_count['daytime_counts']
test_data['daytime_counts'] = events_count['daytime_counts']

train_data['early_morning_counts'] = events_count['early_morning_counts']
test_data['early_morning_counts'] = events_count['early_morning_counts']

train_data['night_counts'] = events_count['night_counts']
test_data['night_counts'] = events_count['night_counts']

In [27]:
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

## MinMaxScaler for Days Counts and HourBins Counts 

In [28]:
#Scaling the days count as the values vary on a very wide range

std_monday = MinMaxScaler()
std_monday.fit(train_data['monday_counts'].values.reshape(-1,1))
train_data['std_monday_counts'] = std_monday.transform(train_data['monday_counts'].values.reshape(-1,1))
test_data['std_monday_counts'] = std_monday.transform(test_data['monday_counts'].values.reshape(-1,1))

std_tuesday = MinMaxScaler()
std_tuesday.fit(train_data['tuesday_counts'].values.reshape(-1,1))
train_data['std_tuesday_counts'] = std_tuesday.transform(train_data['tuesday_counts'].values.reshape(-1,1))
test_data['std_tuesday_counts'] = std_tuesday.transform(test_data['tuesday_counts'].values.reshape(-1,1))

std_wednesday = MinMaxScaler()
std_wednesday.fit(train_data['wednesday_counts'].values.reshape(-1,1))
train_data['std_wednesday_counts'] = std_wednesday.transform(train_data['wednesday_counts'].values.reshape(-1,1))
test_data['std_wednesday_counts'] = std_wednesday.transform(test_data['wednesday_counts'].values.reshape(-1,1))

std_thursday = MinMaxScaler()
std_thursday.fit(train_data['thursday_counts'].values.reshape(-1,1))
train_data['std_thursday_counts'] = std_thursday.transform(train_data['thursday_counts'].values.reshape(-1,1))
test_data['std_thursday_counts'] = std_thursday.transform(test_data['thursday_counts'].values.reshape(-1,1))

std_friday = MinMaxScaler()
std_friday.fit(train_data['friday_counts'].values.reshape(-1,1))
train_data['std_friday_counts'] = std_friday.transform(train_data['friday_counts'].values.reshape(-1,1))
test_data['std_friday_counts'] = std_friday.transform(test_data['friday_counts'].values.reshape(-1,1))

std_saturday = MinMaxScaler()
std_saturday.fit(train_data['saturday_counts'].values.reshape(-1,1))
train_data['std_saturday_counts'] = std_saturday.transform(train_data['saturday_counts'].values.reshape(-1,1))
test_data['std_saturday_counts'] = std_saturday.transform(test_data['saturday_counts'].values.reshape(-1,1))

std_sunday = MinMaxScaler()
std_sunday.fit(train_data['sunday_counts'].values.reshape(-1,1))
train_data['std_sunday_counts'] = std_sunday.transform(train_data['sunday_counts'].values.reshape(-1,1))
test_data['std_sunday_counts'] = std_sunday.transform(test_data['sunday_counts'].values.reshape(-1,1))

std_midnight = MinMaxScaler()
std_midnight.fit(train_data['midnight_counts'].values.reshape(-1,1))
train_data['std_midnight_counts'] = std_midnight.transform(train_data['midnight_counts'].values.reshape(-1,1))
test_data['std_midnight_counts'] = std_midnight.transform(test_data['midnight_counts'].values.reshape(-1,1))

std_daytime = MinMaxScaler()
std_daytime.fit(train_data['daytime_counts'].values.reshape(-1,1))
train_data['std_daytime_counts'] = std_daytime.transform(train_data['daytime_counts'].values.reshape(-1,1))
test_data['std_daytime_counts'] = std_daytime.transform(test_data['daytime_counts'].values.reshape(-1,1))

std_early_morning = MinMaxScaler()
std_early_morning.fit(train_data['early_morning_counts'].values.reshape(-1,1))
train_data['std_early_morning_counts'] = std_early_morning.transform(train_data['early_morning_counts'].values.reshape(-1,1))
test_data['std_early_morning_counts'] = std_early_morning.transform(test_data['early_morning_counts'].values.reshape(-1,1))

std_night = MinMaxScaler()
std_night.fit(train_data['night_counts'].values.reshape(-1,1))
train_data['std_night_counts'] = std_night.transform(train_data['night_counts'].values.reshape(-1,1))
test_data['std_night_counts'] = std_night.transform(test_data['night_counts'].values.reshape(-1,1))

In [29]:
Xtr_monday_counts = train_data['std_monday_counts'].values.reshape(-1,1)
Xte_monday_counts = test_data['std_monday_counts'].values.reshape(-1,1)

Xtr_tuesday_counts = train_data['std_tuesday_counts'].values.reshape(-1,1)
Xte_tuesday_counts = test_data['std_tuesday_counts'].values.reshape(-1,1)

Xtr_wednesday_counts = train_data['std_wednesday_counts'].values.reshape(-1,1)
Xte_wednesday_counts = test_data['std_wednesday_counts'].values.reshape(-1,1)

Xtr_thursday_counts = train_data['std_thursday_counts'].values.reshape(-1,1)
Xte_thursday_counts = test_data['std_thursday_counts'].values.reshape(-1,1)

Xtr_friday_counts = train_data['std_friday_counts'].values.reshape(-1,1)
Xte_friday_counts = test_data['std_friday_counts'].values.reshape(-1,1)

Xtr_saturday_counts = train_data['std_saturday_counts'].values.reshape(-1,1)
Xte_saturday_counts = test_data['std_saturday_counts'].values.reshape(-1,1)

Xtr_sunday_counts = train_data['std_sunday_counts'].values.reshape(-1,1)
Xte_sunday_counts = test_data['std_sunday_counts'].values.reshape(-1,1)

Xtr_midnight_counts = train_data['std_midnight_counts'].values.reshape(-1,1)
Xte_midnight_counts = test_data['std_midnight_counts'].values.reshape(-1,1)

Xtr_daytime_counts = train_data['std_daytime_counts'].values.reshape(-1,1)
Xte_daytime_counts = test_data['std_daytime_counts'].values.reshape(-1,1)

Xtr_early_morning_counts = train_data['std_early_morning_counts'].values.reshape(-1,1)
Xte_early_morning_counts = test_data['std_early_morning_counts'].values.reshape(-1,1)

Xtr_night_counts = train_data['std_night_counts'].values.reshape(-1,1)
Xte_night_counts = test_data['std_night_counts'].values.reshape(-1,1)

## TFIDF Vectorizer for each hour of the day 

In [30]:
train_data['event_hours'] = hourevents['hour']
test_data['event_hours'] = hourevents['hour']

In [31]:
train_data = train_data.fillna('0')
test_data = test_data.fillna('0')

In [32]:
vectorizer_hours=TfidfVectorizer()
vectorizer_hours.fit(train_data['event_hours'].values)

Xtr_hours = vectorizer_hours.transform(train_data['event_hours'].values)
Xte_hours = vectorizer_hours.transform(test_data['event_hours'].values)

print("Train shape : ",Xtr_hours.shape)
print("Test shape : ",Xte_hours.shape)

Train shape :  (74645, 24)
Test shape :  (112071, 24)


## TFIDF Vectorizer for Apps Active 

In [33]:
events = pd.read_csv('Data/events.csv',parse_dates=['timestamp'], index_col='event_id')

apps_active = app_events.groupby(['event_id'])['is_active'].apply(lambda x: " ".join(str(s) for s in x))
events['apps_active'] = events.index.map(apps_active)
events_apps_active = events.groupby("device_id")["apps_active"].apply(lambda x: " ".join(str(s) for s in x if str(s)!='nan'))
events_apps_active = events_apps_active.reset_index().set_index('device_id')
events_apps_active.head()

Unnamed: 0_level_0,apps_active
device_id,Unnamed: 1_level_1
-9222956879900151005,False False False False False False False True...
-9222661944218806987,True False True True True True True False Fals...
-9222399302879214035,False False False False False False False Fals...
-9221825537663503111,False False True False False True True False F...
-9221767098072603291,True False False False False True False True F...


In [34]:
train_data['apps_active'] = events_apps_active['apps_active']
test_data['apps_active'] = events_apps_active['apps_active']

In [35]:
train_data = train_data.fillna('0')
test_data = test_data.fillna('0')

In [36]:
vectorizer_apps_active=TfidfVectorizer()
vectorizer_apps_active.fit(train_data['apps_active'].values)

Xtr_apps_active = vectorizer_apps_active.transform(train_data['apps_active'].values)
Xte_apps_active = vectorizer_apps_active.transform(test_data['apps_active'].values)
print("Train Shape ",Xtr_apps_active.shape," Test Shape ",Xte_apps_active.shape)

Train Shape  (74645, 2)  Test Shape  (112071, 2)


## Data Stacking  

In [37]:
X_train = hstack((Xtr_brand,
                  Xtr_model,
                  Xtr_app,
                  Xtr_label,
                  Xtr_hours,
                  Xtr_monday_counts,
                  Xtr_tuesday_counts,
                  Xtr_wednesday_counts,
                  Xtr_thursday_counts,
                  Xtr_friday_counts,
                  Xtr_saturday_counts,
                  Xtr_sunday_counts,
                  Xtr_midnight_counts,
                  Xtr_early_morning_counts,
                  Xtr_daytime_counts,
                  Xtr_night_counts,
                  Xtr_apps_active)).tocsr()

X_test = hstack((Xte_brand,
                  Xte_model,
                  Xte_app,
                  Xte_label,
                  Xte_hours,
                  Xte_monday_counts,
                  Xte_tuesday_counts,
                  Xte_wednesday_counts,
                  Xte_thursday_counts,
                  Xte_friday_counts,
                  Xte_saturday_counts,
                  Xte_sunday_counts,
                  Xte_midnight_counts,
                  Xte_early_morning_counts,
                  Xte_daytime_counts,
                  Xte_night_counts,
                  Xte_apps_active)).tocsr()

print("X_train shape ",X_train.shape)
print("X_test shape ",X_test.shape)

X_train shape  (74645, 21496)
X_test shape  (112071, 21496)


In [38]:
targetencoder = LabelEncoder().fit(train_data.gender)
y = targetencoder.transform(train_data.gender)
nclasses = len(targetencoder.classes_)
print("y shape ", y.shape)

y shape  (74645,)


In [39]:
from scipy.sparse import save_npz

save_npz('X_train.npz',X_train)
save_npz('X_test.npz',X_test)
np.save('y_gender',y)

In [4]:
def plot_confusion_matrix(test_y, predict_y):
    '''
    This function buils a confusion matrix along with precision and recall matrices
    '''
    C = confusion_matrix(test_y, predict_y)
    A =(((C.T)/(C.sum(axis=1))).T)
    B =(C/C.sum(axis=0))
   
    labels = ['F','M']
    print("-"*20, "Confusion matrix", "-"*20)
    plt.figure()
    sns.heatmap(C, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()

    print("-"*20, "Precision matrix (Columm Sum=1)", "-"*20)
    plt.figure()
    sns.heatmap(B, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    
    print("-"*20, "Recall matrix (Row sum=1)", "-"*20)
    plt.figure()
    sns.heatmap(A, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()

In [4]:
# Loading the csr matrix
from scipy.sparse import load_npz

X = load_npz('X_train.npz')
X_test = load_npz('X_test.npz')
y = np.load('y_gender.npy')

In [5]:
# Splitting the data
X_train, X_val, y_train, y_val = \
            train_test_split(X, y, random_state=1026, test_size=0.2, stratify = y)
print('All features Events: train shape {}, test shape {}'.format(X_train.shape, X_val.shape))

All features Events: train shape (59716, 21496), test shape (14929, 21496)


## Model Building 

In [6]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, BatchNormalization,Input,PReLU
from keras.optimizers import Adam, Adagrad
from keras.utils import np_utils
from tensorflow.keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [8]:
def plot_model(history):
    # Plot training & validation accuracy values
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

    # Plot training & validation loss values
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

In [7]:
y_train = np_utils.to_categorical(y_train)
y_val = np_utils.to_categorical(y_val)

In [10]:
# Reference -> https://www.kaggle.com/c/talkingdata-mobile-user-demographics/discussion/23424
model_list = []
avg_val_loss = 0
filepath="best_models/NN2/best_model_gender.hdf5"
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
input_dim = X.shape[1]
output_dim = 2
model = Sequential()
model.add(Dropout(0.3, input_shape=(input_dim,)))
model.add(Dense(80))
model.add(PReLU())
model.add(Dropout(0.2))
model.add(Dense(50, init='normal', activation='relu'))
model.add(PReLU())
model.add(Dropout(0.1))
model.add(Dense(output_dim, init='normal', activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, batch_size=350, epochs=5, verbose=0, validation_data=(X_val,y_val), callbacks = [checkpoint])


Epoch 00001: val_loss improved from inf to 0.59772, saving model to best_models/NN2/best_model_gender.hdf5

Epoch 00002: val_loss improved from 0.59772 to 0.59058, saving model to best_models/NN2/best_model_gender.hdf5

Epoch 00003: val_loss improved from 0.59058 to 0.58924, saving model to best_models/NN2/best_model_gender.hdf5

Epoch 00004: val_loss did not improve from 0.58924

Epoch 00005: val_loss did not improve from 0.58924


In [8]:
input_dim = X.shape[1]
output_dim = 2
model = Sequential()
model.add(Dropout(0.3, input_shape=(input_dim,)))
model.add(Dense(80))
model.add(PReLU())
model.add(Dropout(0.2))
model.add(Dense(50, init='normal', activation='relu'))
model.add(PReLU())
model.add(Dropout(0.1))
model.add(Dense(output_dim, init='normal', activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.load_weights("best_models/NN2/best_model_gender.hdf5")
X_train_gender = model.predict_proba(X_train)
X_val_gender = model.predict_proba(X_val)
X_test_gender = model.predict_proba(X_test)

In [26]:
X_train_gender_class = np.argmax(X_train_gender,axis=1)
X_val_gender_class = np.argmax(X_val_gender,axis=1)
X_test_gender_class = np.argmax(X_test_gender,axis=1)

In [32]:
targetencoder = LabelEncoder().fit(train_data.group)
y = targetencoder.transform(train_data.group)
nclasses = len(targetencoder.classes_)
print("y shape ", y.shape)

y shape  (74645,)


In [33]:
_, _, y_train, y_val = train_test_split(X, y, random_state=1026, test_size=0.2, stratify = y)

In [35]:
X_train_male = []
X_train_female = []
y_train_male = []
y_train_female = []

for idx,i in enumerate(X_train_gender_class):
    if(i==1):
        X_train_male.append(X_train[idx])
        y_train_male.append(y_train[idx])
    elif(i==0):
        X_train_female.append(X_train[idx])
        y_train_female.append(y_train[idx])

In [36]:
X_val_male = []
X_val_female = []
y_val_male = []
y_val_female = []

for idx,i in enumerate(X_val_gender_class):
    if(i==1):
        X_val_male.append(X_val[idx])
        y_val_male.append(y_val[idx])
    elif(i==0):
        X_val_female.append(X_val[idx])
        y_val_female.append(y_val[idx])

In [43]:
y_train_male = np_utils.to_categorical(y_train_male)
y_train_female = np_utils.to_categorical(y_train_female)
y_val_male = np_utils.to_categorical(y_val_male)
y_val_female = np_utils.to_categorical(y_val_female)

In [44]:
X_train_male = sparse.vstack((i for i in X_train_male), format='csr')
X_train_female = sparse.vstack((i for i in X_train_female), format='csr')

X_val_male = sparse.vstack((i for i in X_val_male), format='csr')
X_val_female = sparse.vstack((i for i in X_val_female), format='csr')

## Females Neural Network 

In [46]:
# Reference -> https://www.kaggle.com/c/talkingdata-mobile-user-demographics/discussion/23424
model_list = []
avg_val_loss = 0
filepath="best_models/NN2/best_model_female.hdf5"
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
input_dim = X_train_female.shape[1]
output_dim = 12
model = Sequential()
model.add(Dropout(0.3, input_shape=(input_dim,)))
model.add(Dense(80))
model.add(PReLU())
model.add(Dropout(0.2))
model.add(Dense(50, init='normal', activation='relu'))
model.add(PReLU())
model.add(Dropout(0.1))
model.add(Dense(output_dim, init='normal', activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train_female, y_train_female, batch_size=350, epochs=5, verbose=0, validation_data=(X_val_female,y_val_female), callbacks = [checkpoint])


Epoch 00001: val_loss improved from inf to 2.42576, saving model to best_models/NN2/best_model_female.hdf5

Epoch 00002: val_loss did not improve from 2.42576

Epoch 00003: val_loss did not improve from 2.42576

Epoch 00004: val_loss did not improve from 2.42576

Epoch 00005: val_loss did not improve from 2.42576


In [48]:
model.load_weights("best_models/NN2/best_model_female.hdf5")
X_test_pred = model.predict_proba(X_test)

## Males Neural Network 

In [50]:
# Reference -> https://www.kaggle.com/c/talkingdata-mobile-user-demographics/discussion/23424
model_list = []
avg_val_loss = 0
filepath="best_models/NN2/best_model_male.hdf5"
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
input_dim = X_train_male.shape[1]
output_dim = 12
model = Sequential()
model.add(Dropout(0.3, input_shape=(input_dim,)))
model.add(Dense(80))
model.add(PReLU())
model.add(Dropout(0.2))
model.add(Dense(50, init='normal', activation='relu'))
model.add(PReLU())
model.add(Dropout(0.1))
model.add(Dense(output_dim, init='normal', activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train_male, y_train_male, batch_size=350, epochs=5, verbose=0, validation_data=(X_val_male,y_val_male), callbacks = [checkpoint])


Epoch 00001: val_loss improved from inf to 2.42916, saving model to best_models/NN2/best_model_male.hdf5

Epoch 00002: val_loss did not improve from 2.42916

Epoch 00003: val_loss did not improve from 2.42916

Epoch 00004: val_loss did not improve from 2.42916

Epoch 00005: val_loss did not improve from 2.42916


In [51]:
model.load_weights("best_models/NN2/best_model_male.hdf5")
X_test_pred_2 = model.predict_proba(X_test)

In [55]:
test_predictions = []

for idx,i in enumerate(X_test_gender_class):
    if(i==1):
        test_predictions.append(X_test_pred_2[idx])
    elif(i==0):
        test_predictions.append(X_test_pred[idx])

In [56]:
predict_data = pd.DataFrame(test_predictions).set_index(test_data.index)
predict_data.columns = np.unique(train_data.group)
predict_data.to_csv('submissions_final.csv')