In [62]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

DATA_ROOT = "./ml100marathon-02-01/"

In [63]:
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
print(dfoff.shape)
print(dftest.shape)
dfoff.head(20)

(1160742, 7)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,
5,2223968,3381,9776.0,10:5,2.0,20160129.0,
6,73611,2099,12034.0,100:10,,20160207.0,
7,163606,1569,5054.0,200:30,10.0,20160421.0,
8,3273056,4833,7802.0,200:20,10.0,20160130.0,
9,94107,3381,7610.0,200:20,2.0,20160412.0,


In [3]:
# 檢查 DataFrame 空缺值的狀態
df = pd.concat([dfoff, dftest])
def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data.head(10))
na_check(df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,Missing Ratio
Date,68.868993
Discount_rate,28.204328
Date_received,28.204328
Coupon_id,28.204328
Distance,7.225564


In [64]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = dfoff.apply(label, axis=1)
dfoff["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

2.0383885110980704

In [5]:
## Creat coupon use label 
"""
According to the definition, 
1) Date is null & Coupon_id is "not" null, had coupon, but didn't use it ==> 0
2) Date is "not" null & Coupon_id is null, normal transaction  ==> -1
3) Date is null & Coupon_id is null, coupon used ==> 1 
4) others ==> 2
"""
def transaction_type(row):
    if np.isnan(row['Date']) and np.isfinite(row['Coupon_id']):
        return 0
    if np.isfinite(row['Date']) and np.isnan(row['Coupon_id']):
        return -1
    if np.isfinite(row['Date']) and np.isfinite(row['Coupon_id']):
        return 1
    return 2
#        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
#        if td <= pd.Timedelta(15, 'D'):
#            return 1
#    return 0

dfoff["transaction_type"] = dfoff.apply(transaction_type, axis=1)
dftest["transaction_type"] = dfoff.apply(transaction_type, axis=1)
dfoff["transaction_type"].value_counts()

 0    704033
-1    413773
 1     42936
Name: transaction_type, dtype: int64

In [65]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
dfoff['weekday_type'] = dfoff['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to trainset
dftest['weekday_type'] = dftest['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to testset

In [66]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [67]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    # pmpmpm modify null from 99 to 2.7(mean)
    df.loc[df.Distance.isna(), "Distance"] = dfoff["Distance"].median()
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)

In [92]:


#User_id_count
dfoffuser = dfoff.groupby(['User_id'])['Merchant_id'].agg({'User_id_count':'size'}).reset_index()
validuser = dftest.groupby(['User_id'])['Merchant_id'].agg({'User_id_count':'size'}).reset_index()
dfoff = pd.merge(dfoff, dfoffuser, on=['User_id'], how = 'left')
dftest = pd.merge(dftest, validuser, on=['User_id'], how = 'left')

dftest

is deprecated and will be removed in a future version
  after removing the cwd from sys.path.
is deprecated and will be removed in a future version
  """


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,weekday,weekday_type,weekday_1,weekday_2,...,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type,Coupon_id_count_x,Merchant_id_count,Coupon_id_count_y,User_id_count
0,1439408,4663,11002.0,150:20,1.0,20160528.0,6,0,0,0,...,1,0,0.866667,150,20,1,7730,11312,7730,3
1,1439408,2632,8591.0,20:1,0.0,20160613.0,1,0,1,0,...,0,0,0.950000,20,1,1,7,11,7,3
2,1439408,2632,8591.0,20:1,0.0,20160516.0,1,0,1,0,...,0,0,0.950000,20,1,1,7,11,7,3
3,2029232,450,1532.0,30:5,0.0,20160530.0,1,0,1,0,...,0,0,0.833333,30,5,1,13299,22210,13299,2
4,2029232,6459,12737.0,20:1,0.0,20160519.0,4,0,0,0,...,0,0,0.950000,20,1,1,16,16,16,2
5,2747744,6901,1097.0,50:10,1.0,20160606.0,1,0,1,0,...,0,0,0.800000,50,10,1,483,11863,483,1
6,196342,1579,10698.0,20:1,1.0,20160606.0,1,0,1,0,...,0,0,0.950000,20,1,1,9,13,9,1
7,253750,6901,2366.0,30:5,0.0,20160518.0,3,0,0,0,...,0,0,0.833333,30,5,1,10276,11863,10276,1
8,343660,4663,11002.0,150:20,1.0,20160528.0,6,0,0,0,...,1,0,0.866667,150,20,1,7730,11312,7730,1
9,1113008,3621,2705.0,20:5,0.0,20160524.0,2,0,0,1,...,0,0,0.750000,20,5,1,7633,19004,7633,4


In [97]:
#Merchant_id_count


#dfoff['Merchant_id'].value_counts()
dfofftemp = dfoff.groupby(['Merchant_id'])['User_id'].agg({'Merchant_id_count':'size'}).reset_index()
#dfofftemp
#dftest['Merchant_id'].value_counts()
validtemp = dftest.groupby(['Merchant_id'])['User_id'].agg({'Merchant_id_count':'size'}).reset_index()

#count_df = dfoff.groupby(['User_id'])['label'].agg({'User_id_count':'size'}).reset_index()
#dfofftemp.sort_values(by = ['Merchant_id_count'],ascending = False)
dfoff = pd.merge(dfoff, dfofftemp, on=['Merchant_id'], how = 'left')
#train1
#dfoff
dftest = pd.merge(dftest, validtemp, on=['Merchant_id'], how = 'left')



is deprecated and will be removed in a future version
  """
is deprecated and will be removed in a future version
  


In [98]:

#Coupon_id_count
dfofftemp1 = dfoff.groupby(['Coupon_id'])['User_id'].agg({'Coupon_id_count':'size'}).reset_index()
validtemp1 = dftest.groupby(['Coupon_id'])['User_id'].agg({'Coupon_id_count':'size'}).reset_index()
dfoff = pd.merge(dfoff, dfofftemp1, on=['Coupon_id'], how = 'left')
dftest = pd.merge(dftest, validtemp1, on=['Coupon_id'], how = 'left')

dftest

is deprecated and will be removed in a future version
  This is separate from the ipykernel package so we can avoid doing imports until
is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,weekday,weekday_type,weekday_1,weekday_2,...,discount_rate,discount_man,discount_jian,discount_type,Coupon_id_count_x,Merchant_id_count_x,Coupon_id_count_y,User_id_count,Merchant_id_count_y,Coupon_id_count
0,1439408,4663,11002.0,150:20,1.0,20160528.0,6,0,0,0,...,0.866667,150,20,1,7730,11312,7730,3,11312,7730
1,1439408,2632,8591.0,20:1,0.0,20160613.0,1,0,1,0,...,0.950000,20,1,1,7,11,7,3,11,7
2,1439408,2632,8591.0,20:1,0.0,20160516.0,1,0,1,0,...,0.950000,20,1,1,7,11,7,3,11,7
3,2029232,450,1532.0,30:5,0.0,20160530.0,1,0,1,0,...,0.833333,30,5,1,13299,22210,13299,2,22210,13299
4,2029232,6459,12737.0,20:1,0.0,20160519.0,4,0,0,0,...,0.950000,20,1,1,16,16,16,2,16,16
5,2747744,6901,1097.0,50:10,1.0,20160606.0,1,0,1,0,...,0.800000,50,10,1,483,11863,483,1,11863,483
6,196342,1579,10698.0,20:1,1.0,20160606.0,1,0,1,0,...,0.950000,20,1,1,9,13,9,1,13,9
7,253750,6901,2366.0,30:5,0.0,20160518.0,3,0,0,0,...,0.833333,30,5,1,10276,11863,10276,1,11863,10276
8,343660,4663,11002.0,150:20,1.0,20160528.0,6,0,0,0,...,0.866667,150,20,1,7730,11312,7730,1,11312,7730
9,1113008,3621,2705.0,20:5,0.0,20160524.0,2,0,0,1,...,0.750000,20,5,1,7633,19004,7633,4,19004,7633


In [99]:
## Naive model
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = dfoff[dfoff['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 667753, #positive: 32472
Valid size: 79216, #positive: 3832


In [112]:
train['Date'].value_counts()

20160330.0    1421
20160328.0    1387
20160329.0    1362
20160327.0    1318
20160331.0    1291
20160325.0    1164
20160401.0    1127
20160326.0    1076
20160131.0     881
20160324.0     874
20160402.0     825
20160205.0     775
20160130.0     754
20160206.0     694
20160403.0     690
20160323.0     665
20160204.0     652
20160202.0     635
20160203.0     625
20160404.0     599
20160129.0     595
20160201.0     592
20160207.0     552
20160228.0     533
20160208.0     530
20160209.0     527
20160229.0     485
20160214.0     476
20160220.0     447
20160210.0     441
              ... 
20160509.0      22
20160504.0      21
20160502.0      19
20160506.0      17
20160505.0      17
20160510.0      17
20160507.0      14
20160503.0      13
20160512.0       6
20160511.0       6
20160514.0       4
20160513.0       4
20160515.0       3
20160518.0       2
20160523.0       2
20160623.0       2
20160526.0       2
20160525.0       2
20160522.0       1
20160517.0       1
20160624.0       1
20160603.0  

ValueError: could not convert string to float: '20:1'

In [None]:
#Merchant_id_count


#dfoff['Merchant_id'].value_counts()
#dfofftemp = dfoff.groupby(['Merchant_id'])['label'].agg({'Merchant_id_count':'size'}).reset_index()
#valid['Merchant_id'].value_counts()
#validtemp = valid.groupby(['Merchant_id'])['label'].agg({'Merchant_id_count':'size'}).reset_index()

#count_df = dfoff.groupby(['User_id'])['label'].agg({'User_id_count':'size'}).reset_index()
#dfofftemp.sort_values(by = ['Merchant_id_count'],ascending = False)
#train1 = pd.merge(train, dfofftemp, on=['Merchant_id'], how = 'left')
#train1

#valid1 = pd.merge(valid, validtemp, on=['Merchant_id'], how = 'left')
#train1["transaction_type"].value_counts()

In [104]:
original_feature = ['discount_rate',
                    'User_id',
                    'User_id_count',
                    'Merchant_id',
                    'Merchant_id_count_x',
                    'Coupon_id',
                    'Coupon_id_count_x',
#                    'transaction_type',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday']
#+ weekdaycols
predictors = original_feature
print(len(original_feature),original_feature)


12 ['discount_rate', 'User_id', 'User_id_count', 'Merchant_id', 'Merchant_id_count_x', 'Coupon_id', 'Coupon_id_count_x', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday']


In [105]:

#print(predictors)

def check_model(data, predictors):
    
    classifier = lambda: SGDClassifier(
        loss='modified_huber', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=500, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [ 0.001],
        'en__l1_ratio': [ 0.001]
        #'loss': [ 'log', 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search

In [106]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestRegressor



def check_model_gdbt(data, predictors):
    
    classifier = GradientBoostingClassifier(tol=0.1, subsample=0.37, max_features=7)

    #model = Pipeline(steps=[
    #    ('ss', StandardScaler()),
    #    ('en', classifier())
    #])

    parameters = {
        'learning_rate': [ 0.01, 0.1],
        'max_depth': [6,10],
        'n_estimators': [30,50,80],
        'min_samples_leaf':[10, 20]
        #'loss': [ 'log', 0.01, 0.1]
    }
    folder = StratifiedKFold(n_splits=3, shuffle=True)
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search

In [None]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
model = DT.fit(train[predictors], train['label'])
print(predictors)
print("Feature importance: ", model.feature_importances_)

In [113]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
forest.fit(train[predictors], train['label'])
model = forest.fit(train[predictors], train['label'])
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(train[predictors].shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, predictors[indices[f]], importances[indices[f]]))

 1) User_id                        0.478425
 2) User_id_count                  0.154567
 3) weekday                        0.090699
 4) Coupon_id_count_x              0.059278
 5) Distance                       0.049374
 6) Coupon_id                      0.042984
 7) Merchant_id_count_x            0.038137
 8) Merchant_id                    0.034897
 9) discount_man                   0.021405
10) discount_rate                  0.015579
11) discount_jian                  0.013511
12) discount_type                  0.001144


In [107]:
gdbt = GradientBoostingClassifier(tol=0.1, subsample=0.37, n_estimators=80, max_features=7, 
                                 max_depth=6, learning_rate=0.01, min_samples_leaf = 10)
#model = gdbt.fit(train1[predictors][400000:600000], train1['label'][400000:600000])
#from sklearn import preprocessing
#train[predictors] = preprocessing.scale(train[predictors])
#valid[predictors] = preprocessing.scale(valid[predictors])

model = gdbt.fit(train[predictors], train['label'])

importances = gdbt.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(train[predictors].shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, predictors[indices[f]], importances[indices[f]]))
    
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))
print("tol=0.1, subsample=0.37, n_estimators=80, max_features=7,max_depth=6, learning_rate=0.01, min_samples_leaf = 10")

 1) Coupon_id_count_x              0.226376
 2) User_id_count                  0.225429
 3) Distance                       0.186579
 4) discount_man                   0.104187
 5) discount_rate                  0.076061
 6) Coupon_id                      0.053939
 7) Merchant_id_count_x            0.047145
 8) Merchant_id                    0.044607
 9) discount_jian                  0.027571
10) weekday                        0.005067
11) User_id                        0.002833
12) discount_type                  0.000205
Validation AUC: 0.851, Accuracy: 0.952
tol=0.1, subsample=0.37, n_estimators=80, max_features=7,max_depth=6, learning_rate=0.01, min_samples_leaf = 10


In [38]:

#

model = check_model_gdbt(train[:100000], predictors)
model.best_params_

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: Invalid parameter learning_rate for estimator GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=6,
              max_features=7, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=10, min_sample...        subsample=0.37, tol=0.1, validation_fraction=0.1, verbose=0,
              warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.1], 'max_depth': [10], 'n_estimators': [10, 30, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1). Check the list of available parameters with `estimator.get_params().keys()`.

In [34]:
#model.predict(valid[predictors])
#acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
#print("Acuuracy: ", acc)
print(model.best_params_)

{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 30}


In [114]:
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

In [115]:
from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.833, Accuracy: 0.950


In [None]:
means = model.cv_results_['mean_test_score']
stds = model.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, model.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [116]:
targetset = dftest.copy()
print(targetset.shape)
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

y_test_pred = model.predict_proba(testset[predictors])
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
print(test1.shape)

(306313, 25)
(306313, 13)


In [117]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [118]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("baseline_example_rfClassifier.csv", header=["uid", "label"], index=False) # submission format
out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.04
1,1000020_8192_20160513,0.17
2,1000065_1455_20160527,0.106667
3,1000085_8067_20160513,0.18
4,1000086_2418_20160613,0.03
