In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

np.random.seed(0)

In [2]:
#Loading data
df_train = pd.read_csv('../input/train_users_2.csv')
df_test = pd.read_csv('../input/test_users.csv')
labels = df_train['country_destination'].values
df_train = df_train.drop(['country_destination'], axis=1)
id_test = df_test['id']
piv_train = df_train.shape[0]

#Creating a DataFrame with train+test data
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
#Removing id and date_first_booking
df_all = df_all.drop(['id', 'date_first_booking'], axis=1)
#Filling nan
df_all = df_all.fillna(-1)

#####Feature engineering#######
#date_account_created
dac = np.vstack(df_all.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
df_all['dac_year'] = dac[:,0]
df_all['dac_month'] = dac[:,1]
df_all['dac_day'] = dac[:,2]
df_all = df_all.drop(['date_account_created'], axis=1)

#timestamp_first_active
tfa = np.vstack(df_all.timestamp_first_active.astype(str).apply(lambda x: list(map(int, [x[:4],x[4:6],x[6:8],x[8:10],x[10:12],x[12:14]]))).values)
df_all['tfa_year'] = tfa[:,0]
df_all['tfa_month'] = tfa[:,1]
df_all['tfa_day'] = tfa[:,2]
df_all = df_all.drop(['timestamp_first_active'], axis=1)

In [3]:
#Age
av = df_all.age.values
df_all['age'] = np.where(np.logical_or(av<14, av>100), -1, av)

#One-hot-encoding features
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 
             'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 
             'first_device_type', 'first_browser']
for f in ohe_feats:
    df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
    df_all = df_all.drop([f], axis=1)
    df_all = pd.concat((df_all, df_all_dummy), axis=1)

#Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)   
X_test = vals[piv_train:]

In [7]:
#Classifier
xgb = XGBClassifier(max_depth= 6, learning_rate=0.1, n_estimators=100,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
xgb.fit(X, y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.5)

In [19]:
y_pred = xgb.predict_proba(X_test)  

#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub_xgb = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub_xgb.to_csv('../output/sub_xgb.csv',index=False)

### XGB Feature Importance

# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier as RF
rfclf = RF(n_estimators= 500, n_jobs = 8, class_weight = 'balanced', 
           min_samples_split = 20, min_weight_fraction_leaf =0.0001,
           verbose = 0)

# train the model on the data
rfclf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0001, n_estimators=500, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
y_pred = rfclf.predict_proba(X_test)  

#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub_rf = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub_rf.to_csv('../output/sub_rf.csv',index=False)

# Logistic  Regression 

In [11]:
from sklearn import linear_model
logclf = linear_model.LogisticRegression(dual = False,
                                       C = 10.0, class_weight = 'balanced', 
                                       solver = 'sag')
logclf.fit(X, y)



LogisticRegression(C=10.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='sag', tol=0.0001, verbose=0, warm_start=False)

In [17]:
y_pred = logclf.predict_proba(X_test)  

#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 1
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:1].tolist()

#Generate submission
sub_lg = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub_lg.to_csv('../output/sub_lr.csv',index=False)

In [18]:
sub_lg

Unnamed: 0,id,country
0,5uwns89zht,PT
1,jtl0dijy2j,PT
2,xx0ulgorjt,PT
3,6c6puo6ix0,PT
4,czqhjk3yfe,PT
5,szx28ujmhf,PT
6,guenkfjcbq,PT
7,tkpq0mlugk,PT
8,3xtgd5p9dn,PT
9,md9aj22l5a,PT


# Linear Regression

In [13]:
from sklearn import linear_model

linearclf = linear_model.LinearRegression()
linearclf.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [14]:
y_pred = linearclf.predict_proba(X_test)  

#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('../output/sub_lr.csv',index=False)

AttributeError: 'LinearRegression' object has no attribute 'predict_proba'