In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics.classification import log_loss
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib as jobl
from sklearn.manifold import TSNE
from tqdm import tqdm
from joblib import dump
from scipy import sparse
import numpy as np
from tqdm import tqdm
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, BatchNormalization,Input,PReLU
from keras.optimizers import Adam, Adagrad
from keras.utils import np_utils
from tensorflow.keras.callbacks import ModelCheckpoint
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Load the raw tables from the local Data/ directory.

# Demographic tables, both indexed by device_id.
train_data = pd.read_csv('Data/gender_age_train.csv', index_col='device_id')
test_data = pd.read_csv('Data/gender_age_test.csv', index_col='device_id')

# Phone metadata: keep only the first row seen for each device_id, then index by it.
phone_data = (
    pd.read_csv('Data/phone_brand_device_model.csv', encoding='utf-8')
    .drop_duplicates('device_id', keep='first')
    .set_index('device_id')
)

# App label lookup tables.
label_categories = pd.read_csv('Data/label_categories.csv')
app_labels = pd.read_csv('Data/app_labels.csv')

# Event log (indexed by event_id, timestamps parsed) and the per-event app rows.
events = pd.read_csv(
    'Data/events.csv',
    index_col='event_id',
    parse_dates=['timestamp'],
)
app_events = pd.read_csv(
    'Data/app_events.csv',
    usecols=['event_id', 'app_id', 'is_active'],
    dtype={'is_active': bool},
)

In [3]:
# Store each frame's 0-based row position; PipeLine uses these columns as the
# row coordinates when it builds its sparse feature matrices.
train_data['trainrow'] = np.arange(len(train_data))
test_data['testrow'] = np.arange(len(test_data))

In [4]:
def _indicator_matrix(rows, cols, shape):
    """Return a sparse matrix holding 1.0 at every ``(rows[i], cols[i])`` position.

    ``shape`` is always passed explicitly. Without it the matrix width is
    inferred from the largest column index present, so a train matrix and a
    test matrix of the same feature family could silently end up with
    different widths.
    """
    return csr_matrix((np.ones(len(rows)), (rows, cols)), shape=shape)


def _prefixed_tokens(values):
    """Join ``values`` into one space-separated string, each value prefixed with '0'.

    NOTE(review): the prefix presumably exists so that single-digit values form
    two-character tokens, which TfidfVectorizer's default token pattern keeps.
    """
    return " ".join('0' + str(value) for value in values)


def _tfidf_feature(train_data, test_data, column, tokens_by_device):
    """Attach a per-device token string as ``column`` and TF-IDF encode it.

    The column is added to the frames that are passed in (index-aligned on
    device_id). Missing values are then replaced with the string '0' across the
    whole frame, which produces new frames. The vectorizer is fitted on the
    train column only and applied to both.

    Returns:
        (filled_train_data, filled_test_data, train_matrix, test_matrix)
    """
    train_data[column] = tokens_by_device
    test_data[column] = tokens_by_device

    train_data = train_data.fillna('0')
    test_data = test_data.fillna('0')

    vectorizer = TfidfVectorizer().fit(train_data[column].values)
    train_matrix = vectorizer.transform(train_data[column].values)
    test_matrix = vectorizer.transform(test_data[column].values)
    return train_data, test_data, train_matrix, test_matrix


def _build_dropout_net(input_dim, output_dim, loss):
    """Build the Dropout -> Dense(80) -> Dense(50) -> softmax network.

    The layer sequence must stay exactly as it is: the saved weight files are
    loaded into it layer by layer.
    """
    model = Sequential()
    model.add(Dropout(0.3, input_shape=(input_dim,)))
    model.add(Dense(80))
    model.add(PReLU())
    model.add(Dropout(0.2))
    # `kernel_initializer` is the Keras 2 name of the legacy `init` keyword.
    model.add(Dense(50, kernel_initializer='normal', activation='relu'))
    model.add(PReLU())
    model.add(Dropout(0.1))
    model.add(Dense(output_dim, kernel_initializer='normal', activation='softmax'))
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
    return model


def _build_batchnorm_net(input_dim, output_dim):
    """Build the Dense(256) -> Dense(64) -> softmax network with batch normalisation.

    The layer sequence must stay exactly as it is: the saved weight files are
    loaded into it layer by layer.
    """
    model = Sequential()
    model.add(Dense(256, input_dim=input_dim))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(64))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(output_dim))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


def _seed_ensemble(model, weights_dir, seeds, X, y, X_test):
    """Average the predictions of one architecture over several saved weight files.

    For every seed the file ``<weights_dir>/best_model_<seed>.hdf5`` is loaded
    into ``model``. The log loss is measured on the 20% hold-out produced by a
    stratified split with that seed (presumably the split the weights were
    trained against - verify against the training notebook), and the test
    predictions are accumulated.

    Returns:
        (mean_test_predictions, mean_validation_log_loss)
    """
    test_pred_sum = np.zeros((X_test.shape[0], y.shape[1]))
    val_loss_sum = 0

    for seed in seeds:
        # Only the validation part of the split is needed here.
        _, X_val, _, y_val = train_test_split(X, y, random_state=seed, test_size=0.2, stratify=y)
        model.load_weights(weights_dir + "/best_model_" + str(seed) + ".hdf5")

        val_loss_sum += log_loss(y_val, model.predict(X_val))
        test_pred_sum += model.predict(X_test)

    return test_pred_sum / len(seeds), val_loss_sum / len(seeds)


def PipeLine(train_data, test_data, phone_data, label_categories, app_labels, events, app_events):
    """Run feature extraction and the pre-trained model stack; return group probabilities.

    Stages:
      1. Sparse features: one-hot phone brand, device model, installed app ids
         and app labels; TF-IDF of event hours, event weekdays and app
         ``is_active`` flags.
      2. A gender network (weights loaded from disk) whose arg-max prediction
         becomes one feature column.
      3. A linear regression on age, fitted here, whose min-max scaled
         prediction becomes one feature column.
      4. Two group classifiers (weights loaded from disk for 20 seeds each),
         averaged per architecture and blended 0.1 / 0.9.

    Args:
        train_data: frame indexed by device_id with ``gender``, ``age``,
            ``group`` and ``trainrow`` columns.
        test_data: frame indexed by device_id with a ``testrow`` column.
        phone_data: frame indexed by device_id with ``phone_brand`` and
            ``device_model`` columns.
        label_categories: unused; kept so existing calls keep working.
        app_labels: frame with ``app_id`` and ``label_id`` columns.
        events: frame indexed by event_id with ``device_id`` and a datetime
            ``timestamp`` column.
        app_events: frame with ``event_id``, ``app_id`` and ``is_active``.

    Returns:
        DataFrame indexed like ``test_data`` with one probability column per
        group label.

    Side effects:
        * Adds columns to the frames passed in: ``enc_brand``/``enc_model`` on
          ``phone_data``; ``enc_app`` on ``app_events``; ``hour``,
          ``dayofweek``, ``apps_active`` on ``events``; ``brand``, ``model``,
          ``event_hours`` on ``train_data`` and ``test_data``.
        * Reads weights from ``best_models/`` and saves the per-architecture
          averaged test predictions with ``np.save`` under ``TestPredictions/``.
    """
    n_train = train_data.shape[0]
    n_test = test_data.shape[0]

    print("Extracting features")

    print("\tPhone Brands")

    enc_brand = LabelEncoder().fit(phone_data['phone_brand'])
    phone_data['enc_brand'] = enc_brand.transform(phone_data['phone_brand'])
    nbrands = len(enc_brand.classes_)

    # Index-aligned on device_id.
    train_data['brand'] = phone_data['enc_brand']
    test_data['brand'] = phone_data['enc_brand']

    Xtr_brand = _indicator_matrix(train_data['trainrow'], train_data['brand'], (n_train, nbrands))
    Xte_brand = _indicator_matrix(test_data['testrow'], test_data['brand'], (n_test, nbrands))

    print("\tDevice models")

    enc_model = LabelEncoder().fit(phone_data['device_model'])
    phone_data['enc_model'] = enc_model.transform(phone_data['device_model'])
    nmodels = len(enc_model.classes_)

    train_data['model'] = phone_data['enc_model']
    test_data['model'] = phone_data['enc_model']

    Xtr_model = _indicator_matrix(train_data['trainrow'], train_data['model'], (n_train, nmodels))
    Xte_model = _indicator_matrix(test_data['testrow'], test_data['model'], (n_test, nmodels))

    print("\tApp_id")

    enc_apps = LabelEncoder().fit(app_events['app_id'])
    app_events['enc_app'] = enc_apps.transform(app_events['app_id'])
    napps = len(enc_apps.classes_)

    # One row per (device, app) pair, annotated with the train/test row number
    # of the device (NaN where the device is not in that set).
    deviceapps = (app_events.merge(events[['device_id']], how='left', left_on='event_id', right_index=True)
                            .groupby(['device_id', 'enc_app'])['enc_app'].agg(['size'])
                            .merge(train_data[['trainrow']], how='left', left_index=True, right_index=True)
                            .merge(test_data[['testrow']], how='left', left_index=True, right_index=True)
                            .reset_index())

    in_train = deviceapps.dropna(subset=['trainrow'])
    Xtr_app = _indicator_matrix(in_train.trainrow, in_train.enc_app, (n_train, napps))
    in_test = deviceapps.dropna(subset=['testrow'])
    Xte_app = _indicator_matrix(in_test.testrow, in_test.enc_app, (n_test, napps))

    print("\tApp Labels")

    # .copy() so the column assignments below target an independent frame
    # rather than a slice of the caller's table.
    app_labels = app_labels.loc[app_labels.app_id.isin(app_events.app_id.unique())].copy()
    app_labels['enc_app'] = enc_apps.transform(app_labels.app_id)

    enc_labels = LabelEncoder().fit(app_labels['label_id'])
    app_labels['enc_label'] = enc_labels.transform(app_labels['label_id'])
    nlabels = len(enc_labels.classes_)

    devicelabels = (deviceapps[['device_id', 'enc_app']]
                    .merge(app_labels[['enc_app', 'enc_label']])
                    .groupby(['device_id', 'enc_label'])['enc_app'].agg(['size'])
                    .merge(train_data[['trainrow']], how='left', left_index=True, right_index=True)
                    .merge(test_data[['testrow']], how='left', left_index=True, right_index=True)
                    .reset_index())

    in_train = devicelabels.dropna(subset=['trainrow'])
    Xtr_label = _indicator_matrix(in_train.trainrow, in_train.enc_label, (n_train, nlabels))
    in_test = devicelabels.dropna(subset=['testrow'])
    Xte_label = _indicator_matrix(in_test.testrow, in_test.enc_label, (n_test, nlabels))

    print("\tEvent hour")

    events['hour'] = events['timestamp'].dt.hour
    hour_tokens = events.groupby("device_id")["hour"].apply(_prefixed_tokens)
    train_data, test_data, Xtr_hours, Xte_hours = _tfidf_feature(
        train_data, test_data, 'event_hours', hour_tokens)

    print("\tEvent day")

    events['dayofweek'] = events['timestamp'].dt.dayofweek
    day_tokens = events.groupby("device_id")["dayofweek"].apply(_prefixed_tokens)
    train_data, test_data, Xtr_day, Xte_day = _tfidf_feature(
        train_data, test_data, 'event_day', day_tokens)

    print("\tApps Active")

    # Per event: the is_active flags of its apps as one string; events with no
    # app rows get NaN after index alignment and are skipped in the join below.
    active_by_event = app_events.groupby('event_id')['is_active'].apply(
        lambda flags: " ".join(str(flag) for flag in flags))
    events['apps_active'] = active_by_event
    active_tokens = events.groupby("device_id")["apps_active"].apply(
        lambda parts: " ".join(str(part) for part in parts if str(part) != 'nan'))
    train_data, test_data, Xtr_apps_active, Xte_apps_active = _tfidf_feature(
        train_data, test_data, 'apps_active', active_tokens)

    print("Done with Extracting features")

    print("Building classification model for Gender")

    print("\tData Stacking")

    # Block order defines the column layout the saved weights expect.
    X_train_gender = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label, Xtr_apps_active)).tocsr()
    X_test_gender = hstack((Xte_brand, Xte_model, Xte_app, Xte_label, Xte_apps_active)).tocsr()

    gender_encoder = LabelEncoder().fit(train_data.gender)
    model = _build_dropout_net(input_dim=X_train_gender.shape[1],
                               output_dim=len(gender_encoder.classes_),
                               loss='binary_crossentropy')

    print("\tLoading weights")
    model.load_weights("best_models/best_model_gender.hdf5")

    print("\tPredicting Gender probabilites")
    gender_proba_train = model.predict(X_train_gender)
    gender_proba_test = model.predict(X_test_gender)

    print("\tGetting Gender feature")
    # The predicted class index, as a single column.
    Xtr_gender = np.argmax(gender_proba_train, axis=1).reshape(-1, 1)
    Xte_gender = np.argmax(gender_proba_test, axis=1).reshape(-1, 1)

    print("Building Regression model for age")

    print("\tData Stacking")
    X_train_age = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label, Xtr_apps_active, Xtr_hours)).tocsr()
    X_test_age = hstack((Xte_brand, Xte_model, Xte_app, Xte_label, Xte_apps_active, Xte_hours)).tocsr()

    print("\tBuilding Linear Regression Model")
    reg = LinearRegression().fit(X_train_age, train_data.age)
    age_pred_train = reg.predict(X_train_age).reshape(-1, 1)
    age_pred_test = reg.predict(X_test_age).reshape(-1, 1)

    print("\tGetting Age Feature")
    # Scaling is fitted on the train predictions only.
    age_scaler = MinMaxScaler().fit(age_pred_train)
    Xtr_age = age_scaler.transform(age_pred_train)
    Xte_age = age_scaler.transform(age_pred_test)

    print("Final Classification model for Groups")

    print("\tData Stacking")

    X = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label, Xtr_hours, Xtr_day,
                Xtr_apps_active, Xtr_gender, Xtr_age)).tocsr()
    X_test = hstack((Xte_brand, Xte_model, Xte_app, Xte_label, Xte_hours, Xte_day,
                     Xte_apps_active, Xte_gender, Xte_age)).tocsr()

    group_encoder = LabelEncoder().fit(train_data.group)
    nclasses = len(group_encoder.classes_)
    y = np_utils.to_categorical(group_encoder.transform(train_data.group))

    print("\tModel Building")

    print("\t\tNeural Network 1")
    model = _build_batchnorm_net(input_dim=X.shape[1], output_dim=nclasses)
    NN1, avg_val_loss = _seed_ensemble(
        model, "best_models/NN1",
        [36, 21, 8, 12, 58, 97, 79, 59, 84, 62, 68, 7, 46, 14, 56, 85, 41, 22, 54, 75],
        X, y, X_test)
    print("\t\tAverage Validation Log Loss for Neural Network 1 = ", avg_val_loss)

    np.save('TestPredictions/NN_1_Avg_test_prediction', NN1)
    print("\t\tNeural Network 2")

    model = _build_dropout_net(input_dim=X.shape[1], output_dim=nclasses,
                               loss='categorical_crossentropy')
    NN2, avg_val_loss = _seed_ensemble(
        model, "best_models/NN2",
        [66, 23, 27, 76, 60, 75, 95, 50, 67, 77, 46, 78, 58, 97, 57, 29, 54, 93, 92, 10],
        X, y, X_test)
    print("\t\tAverage Validation Log Loss for Neural Network 2 = ", avg_val_loss)

    np.save('TestPredictions/NN_2_Avg_test_prediction', NN2)

    print("Model Ensembling")

    # Fixed blend of the two averaged predictions (already in memory, so the
    # files written above do not need to be read back).
    test_pred = 0.1*NN1 + 0.9*NN2
    predict_data = pd.DataFrame(test_pred).set_index(test_data.index)
    # classes_ is the sorted set of group labels, i.e. the order of the
    # one-hot target columns the networks were trained on.
    predict_data.columns = group_encoder.classes_

    print("Done!")
    return predict_data

In [5]:
# Run the whole pipeline on the loaded tables; arguments are passed by name
# so each table is visibly bound to the parameter it is meant for.
predict_data = PipeLine(
    train_data=train_data,
    test_data=test_data,
    phone_data=phone_data,
    label_categories=label_categories,
    app_labels=app_labels,
    events=events,
    app_events=app_events,
)

Extracting features
	Phone Brands
	Device models
	App_id
	App Labels
	Event hour
	Event day
	Apps Active
Done with Extracting features
Building classification model for Gender
	Data Stacking
	Loading weights
	Predicting Gender probabilites
	Getting Gender feature
Building Regression model for age
	Data Stacking
	Building Linear Regression Model
	Getting Age Feature
Final Classification model for Groups
	Data Stacking
	Model Building
		Neural Network 1
		Average Validation Log Loss for Neural Network 1 =  2.2394773826653918
		Neural Network 2
		Average Validation Log Loss for Neural Network 2 =  2.2444114916397964
Model Ensembling
Done!


In [6]:
# Show the prediction table as the cell output: one row per test device_id,
# one probability column per group label.
predict_data

Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1002079943728939269,0.000455,0.001848,0.003356,0.010226,0.036115,0.053298,0.003276,0.029331,0.043226,0.124902,0.292434,0.401533
-1547860181818787117,0.002094,0.006524,0.009546,0.027018,0.066007,0.071683,0.005581,0.043837,0.051415,0.124199,0.275382,0.316713
7374582448058474277,0.032108,0.058115,0.061915,0.137424,0.148980,0.065997,0.017800,0.058147,0.055632,0.100273,0.168393,0.095214
-6220210354783429585,0.004061,0.007716,0.006949,0.013202,0.026495,0.039261,0.045481,0.167590,0.091289,0.175066,0.219068,0.203824
-5893464122623104785,0.047229,0.056213,0.043569,0.061112,0.058987,0.044063,0.093169,0.157578,0.097888,0.120708,0.132048,0.087435
...,...,...,...,...,...,...,...,...,...,...,...,...
4280900819321920929,0.059726,0.050846,0.037571,0.059532,0.074540,0.059818,0.098772,0.130646,0.069814,0.100470,0.131018,0.127247
818534825520551359,0.064219,0.058046,0.044965,0.065555,0.075392,0.059518,0.102239,0.126236,0.076126,0.099054,0.122903,0.105748
-8956851351560395765,0.056394,0.055516,0.040570,0.053968,0.049541,0.038218,0.123080,0.178429,0.097649,0.118223,0.113556,0.074855
6097318236795836256,0.056394,0.055516,0.040570,0.053968,0.049541,0.038218,0.123080,0.178429,0.097649,0.118223,0.113556,0.074855


In [7]:
# Write the prediction table (device_id index plus one column per group) to CSV.
predict_data.to_csv('submissions_final.csv')