# Importing , Reading CSV

In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
from pathlib import Path
import gc

In [2]:
######################################
# Setting paths, columns and dtypes. #
######################################
current_dir = Path('.')
path_dir = current_dir / 'datasets'
path_log = path_dir / 'log.tsv'
path_events = path_dir / 'events.tsv'
path_users = path_dir / 'users.tsv'



log_cols = [
    'user_id', 'event_id', 'time_stamp', 'action_type',
    'num_of_people', 'payment_methond', 'total_price'
]

dtypes = {
        'user_id'          : 'int',
        'event_id'         : 'int',
        'time_stamp'       : 'str',
        'action_type'      : 'int',
        'num_of_people'    : 'float',
        'payment_methond'  : 'str',
        'total_price'      : 'float',
        'female_age_lower' : 'int',
        'female_age_upper' : 'int',
        'male_age_lower'   : 'int',
        'male_age_upper'   : 'int',
        'event_start_at'   : 'str',
        'prefecture'       : 'str',
        'first_published_at': 'str',
        'female_price'     : 'int',
        'male_price'       : 'int',
        'interest'         : 'str',
        'age'              : 'int',
        'gender'           : 'str',
        'createed_at'      : 'str'
}


prefecture_dict = {
    '北海道'  :   0,
    '青森県'  :   1,
    '岩手県'  :   2,
    '宮城県'  :   3,
    '秋田県'  :   4,
    '山形県'  :   5,
    '福島県'  :   6,
    '茨城県'  :   7,
    '栃木県'  :   8,
    '群馬県'  :   9,
    '埼玉県'  :  10,
    '千葉県'  :  11,
    '東京都'  :  12,
    '神奈川県':  13,
    '新潟県'  :  14,
    '富山県'  :  15,
    '石川県'  :  16,
    '福井県'  :  17,
    '山梨県'  :  18,
    '長野県'  :  19,
    '岐阜県'  :  20,
    '静岡県'  :  21,
    '愛知県'  :  22,
    '三重県'  :  23,
    '滋賀県'  :  24,
    '京都府'  :  25,
    '大阪府'  :  26,
    '兵庫県'  :  27,
    '奈良県'  :  28,
    '和歌山県':  29,
    '鳥取県'  :  30,
    '島根県'  :  31,
    '岡山県'  :  32,
    '広島県'  :  33,
    '山口県'  :  34,
    '徳島県'  :  35,
    '香川県'  :  36,
    '愛媛県'  :  37,
    '高知県'  :  38,
    '福岡県'  :  39,
    '佐賀県'  :  40,
    '長崎県'  :  41,
    '熊本県'  :  42,
    '大分県'  :  43,
    '宮崎県'  :  44,
    '鹿児島県':  45,
    '沖縄県'  :  46,
    'その他（海外等）': 47
}


gender_dict = {'男性':0, '女性':1}

payment_method_dict = {
    'クレカ'   :  0,
    'コンビニ' :  1,
    'eマネー'  :  2,
    '銀振'     :  3
}

interest_dict = {
    'アニメコン'                           :   0,
    '謎解きコン'                           :   1,
    'その他（スポーツコン）'                 :   2,
    '料理合コン（料理コン）'                 :   3,
    'スポーツコン'                         :   4,
    'フットサルコン'                       :   5,
    'ボルダリングコン'                      :   6,
    'テニスコン'                           :   7,
    '旅コン'                              :   8,
    'サバコン'                            :   9,
    '婚活バスツアー（お見合いバスツアー）'     :  10,
    '釣りコン'                            :  11,
    'その他'                              :  12
}



column_list = ['prefecture_event', 'prefecture_user', 'gender', 'payment_method' 'interest_dict', 'action_type']
dict_list = [prefecture_dict, prefecture_dict, gender_dict, payment_method_dict, interest_dict, action_type_dict]

In [3]:
###########################################################
# Reading tsv and merge them on 'user_id' and 'event_id'. #
###########################################################

print ('Loading and merging csv')

df_log = pd.read_table(path_log, header=0, parse_dates=True, na_values='n/a')
df_events = pd.read_table(path_events, header=0, parse_dates=True, na_values='n/a')
df_users = pd.read_table(path_users, header=0, parse_dates=True, na_values='n/a')


Loading and merging csv


In [4]:
df_users.head()

Unnamed: 0,user_id,age,gender,prefecture,created_on
0,1,34,女性,静岡県,2012-06-26
1,2,31,男性,鳥取県,2012-07-12
2,3,32,男性,東京都,2012-07-15
3,4,30,男性,東京都,2012-07-17
4,5,29,女性,埼玉県,2012-07-17


In [5]:
df_events.head()

Unnamed: 0,event_id,female_age_lower,female_age_upper,male_age_lower,male_age_upper,event_start_at,prefecture,first_published_at,female_price,male_price,interest
0,1,20,99.0,25,44.0,2015-03-08 18:00:00,宮城県,,1500.0,5000.0,
1,2,30,99.0,30,49.0,2015-03-01 15:00:00,富山県,,1000.0,6500.0,
2,3,22,99.0,24,39.0,2015-03-07 19:00:00,富山県,,1000.0,6000.0,
3,4,20,99.0,25,44.0,2015-03-08 18:00:00,新潟県,,1000.0,6000.0,
4,5,20,99.0,20,39.0,2015-03-14 19:00:00,新潟県,,1000.0,6000.0,


In [6]:
df_log.head()

Unnamed: 0,user_id,event_id,time_stamp,action_type,num_of_people,payment_method,total_price
0,1,6261,2015-03-19 09:15:50,3,1.0,クレカ,4000.0
1,1,127600,2017-06-05 12:33:17,1,,,
2,1,127600,2017-06-05 12:37:09,1,,,
3,1,125296,2017-06-06 14:01:27,1,,,
4,1,125128,2017-06-06 14:03:57,1,,,


# Replacement (str -> int, NaN -> 9999)

In [8]:
# Replacing str for int and fillin NaN with 9999

df_events['prefecture'] = df_events.prefecture.map(prefecture_dict)
df_users['prefecture'] = df_users.prefecture.map(prefecture_dict)
df_users['gender'] = df_users.gender.map(gender_dict)
df_log['payment_method'] = df_log.payment_method.map(payment_method_dict)
df_events['interest'] = df_events.interest.map(interest_dict)


In [9]:
df_log = df_log.fillna(9999)
df_users = df_users.fillna(9999)
df_events = df_events.fillna(9999)

# About Gender

In [14]:
df_users.gender.value_counts()

1    37277
0    32390
Name: gender, dtype: int64

In [26]:
# How many usres in df_users
df_users.user_id.nunique()

69667

In [27]:
# How many usrs in df_log
df_log.user_id.nunique()

69773

In [32]:
# Extract unkown users' log
unknown_users_log = df_log[~df_log['user_id'].isin(df_users['user_id'])]

In [33]:
unknown_users

Unnamed: 0,user_id,event_id,time_stamp,action_type,num_of_people,payment_method,total_price
1450405,69668,154479,2017-09-23 16:02:45,2,9999.0,9999.0,9999.0
1450406,69669,153106,2017-09-23 15:31:05,3,2.0,0.0,11500.0
1450407,69670,147080,2017-09-23 15:47:47,3,1.0,0.0,1000.0
1450408,69671,153538,2017-09-23 15:47:37,3,1.0,1.0,3000.0
1450409,69673,151812,2017-09-23 17:17:56,3,1.0,0.0,4500.0
1450410,69674,139288,2017-06-25 12:13:27,1,9999.0,9999.0,9999.0
1450411,69674,120320,2017-06-25 12:13:41,1,9999.0,9999.0,9999.0
1450412,69674,137125,2017-06-25 12:14:19,1,9999.0,9999.0,9999.0
1450413,69674,131674,2017-06-25 12:15:11,1,9999.0,9999.0,9999.0
1450414,69675,144005,2017-08-03 19:04:36,1,9999.0,9999.0,9999.0


In [None]:
# Splitting into each gender's dataframes
df_male = df_users[df_users['gender']==0]
df_female = df_users[df_users['gender']==1]
# unknown_users holds logs of people we don't know about genders

## Able to infer genders of action_type == 2 users

### Should we make an algorithm to infer gender of users whose gender == 9999 ?
### Not sure how to handle info from gender == 9999 users. 

In [35]:
# Guessing ['gender'] == 9999 users' gender

# 1450406 seems like 2 men entried with a \500 off coupon
# 1450407 ('user_id' == 69670) is female
# 1450408 ('user_id' == 69671) is female
# 1450409 ('user_id' == 69673) is female
# 1450527 ('user_id' == 69676) is male
# 1450632 ('user_id' == 69677) seems like male, entrying with a 50% off coupon

unknown_users[unknown_users['total_price']!=9999]

Unnamed: 0,user_id,event_id,time_stamp,action_type,num_of_people,payment_method,total_price
1450406,69669,153106,2017-09-23 15:31:05,3,2.0,0.0,11500.0
1450407,69670,147080,2017-09-23 15:47:47,3,1.0,0.0,1000.0
1450408,69671,153538,2017-09-23 15:47:37,3,1.0,1.0,3000.0
1450409,69673,151812,2017-09-23 17:17:56,3,1.0,0.0,4500.0
1450527,69676,152514,2017-09-23 21:12:25,3,1.0,0.0,7000.0
1450632,69677,153699,2017-09-23 21:20:49,3,1.0,0.0,4500.0


## Wait, how many people entried for each entry?

### Not sure how to handle multi-people entries. 

In [36]:
df_log.num_of_people.value_counts()

9999.0    1198034
1.0        160847
2.0         88665
3.0          3235
4.0          1203
5.0            55
6.0            45
7.0             2
0.0             2
9.0             1
Name: num_of_people, dtype: int64

# Data Augmentation

## Converting str date data to Datetime

In [40]:
df_users.iloc[:,4] = pd.to_datetime(df_users.iloc[:,4], format='%Y-%m-%d %H:%M:%S')
df_events.iloc[:,5] = pd.to_datetime(df_events.iloc[:,5], format='%Y-%m-%d %H:%M:%S')
df_events.iloc[:,7] = pd.to_datetime(df_events.iloc[:,7], format='%Y-%m-%d %H:%M:%S')
df_log.iloc[:,2] =  pd.to_datetime(df_log.iloc[:,2], format='%Y-%m-%d %H:%M:%S')
unknown_users.iloc[:,2] = pd.to_datetime(unknown_users.iloc[:,2], format='%Y-%m-%d %H:%M:%S')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [43]:
# Is is okay tha NaN in first_published_at converted to 1970-01-01 00:00:00.000009999 ?

df_events.iloc[:,7].value_counts()

1970-01-01 00:00:00.000009999    71847
2017-03-10 11:01:37.000000000        3
2017-04-25 13:17:16.000000000        3
2017-06-07 10:00:16.000000000        3
2017-08-03 12:15:02.000000000        3
2017-08-03 09:22:29.000000000        3
2017-01-17 11:05:48.000000000        3
2017-03-16 10:14:22.000000000        3
2017-07-19 10:03:33.000000000        3
2017-06-06 09:09:06.000000000        2
2017-06-06 10:33:15.000000000        2
2017-06-15 10:49:50.000000000        2
2017-01-24 12:37:22.000000000        2
2017-06-30 09:29:07.000000000        2
2017-03-09 15:06:38.000000000        2
2017-07-24 16:38:53.000000000        2
2017-06-02 11:14:46.000000000        2
2017-05-12 09:28:54.000000000        2
2017-05-19 09:38:32.000000000        2
2017-02-23 10:09:50.000000000        2
2017-04-25 15:35:18.000000000        2
2017-05-09 11:27:19.000000000        2
2017-04-17 13:05:03.000000000        2
2016-11-02 10:08:02.000000000        2
2016-11-08 17:40:32.000000000        2
2017-05-22 12:09:00.00000

In [46]:
df_first_published_at = df_events.copy()

In [50]:
df_first_published_at.first_published_at = df_first_published_at.first_published_at.replace({'1970-01-01 00:00:00.000009999': 'NaN'})

  result = op(a, b)


TypeError: Cannot compare types 'ndarray(dtype=datetime64[ns])' and 'str'

### AHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH!

### CANNOT REVOME ROWS WITH 1970-01-01 00:00:00.000009999
### WHAT I WANTED TO DO IS...
### 1. REMOVE ROWS WITH 1970-01-01 00:00:00.000009999 
### 2. GET THE MEAN OF DIFFERENCE BETWEEN EVENT_START_AT AND FIRST_PUBLISHED_AT
### 3. FILL NAN IN FIRST_PUBLISHED_AT BY ADDING THE MEAN TO EVENT_START_AT

### AHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH!

-> where, mask might be helpful

## Makng DataFrame for each user and each event

In [51]:
# Extracting user 1 for testing
df_test_user = df_users.iloc[0, :]
df_test_user

user_id                         1
age                            34
gender                          1
prefecture                     21
created_on    2012-06-26 00:00:00
Name: 0, dtype: object

In [53]:
# Making event_list
event_list = pd.DataFrame()
event_list['event_id'] = df_events.event_id
event_list['event_start_at'] = df_events.event_start_at
event_list

Unnamed: 0,event_id,event_start_at
0,1,2015-03-08 18:00:00
1,2,2015-03-01 15:00:00
2,3,2015-03-07 19:00:00
3,4,2015-03-08 18:00:00
4,5,2015-03-14 19:00:00
5,6,2015-03-15 18:00:00
6,7,2015-01-23 14:00:00
7,8,2015-05-24 14:00:00
8,9,2015-03-15 18:00:00
9,10,2016-01-31 14:00:00


In [54]:
# Making df_test_index
df_test_index = pd.DataFrame()


In [55]:
# Getting logs of usre 1
log_user1 = df_log[df_log['user_id'].isin([df_test_user['user_id']])]
print(log_user1)

      user_id  event_id          time_stamp  action_type  num_of_people  \
0           1      6261 2015-03-19 09:15:50            3            1.0   
1           1    127600 2017-06-05 12:33:17            1         9999.0   
2           1    127600 2017-06-05 12:37:09            1         9999.0   
3           1    125296 2017-06-06 14:01:27            1         9999.0   
4           1    125128 2017-06-06 14:03:57            1         9999.0   
5           1    124841 2017-06-06 14:16:39            1         9999.0   
6           1    138061 2017-06-06 17:35:18            1         9999.0   
7           1    127103 2017-06-06 09:32:50            1         9999.0   
8           1    127600 2017-06-05 12:32:04            1         9999.0   
9           1    125305 2017-06-06 09:35:40            1         9999.0   
10          1    125305 2017-06-06 09:39:47            1         9999.0   
11          1    142900 2017-06-06 13:11:55            1         9999.0   
12          1    142900 2

In [63]:
df_test_index = event_list

df_test_index


Unnamed: 0,event_id,event_start_at
0,1,2015-03-08 18:00:00
1,2,2015-03-01 15:00:00
2,3,2015-03-07 19:00:00
3,4,2015-03-08 18:00:00
4,5,2015-03-14 19:00:00
5,6,2015-03-15 18:00:00
6,7,2015-01-23 14:00:00
7,8,2015-05-24 14:00:00
8,9,2015-03-15 18:00:00
9,10,2016-01-31 14:00:00


In [65]:
# Merging event_list and df_test_user whrere event_start_at > created_on

df_test_index = pd.merge(df_test_index[event_list['event_start_at'] > df_test_user['created_on']],
                            log_user1[event_list['event_start_at'] > df_test_user['created_on']], on='event_id', how='left')

df_test_index

  


Unnamed: 0,event_id,event_start_at,user_id,time_stamp,action_type,num_of_people,payment_method,total_price
0,1,2015-03-08 18:00:00,,NaT,,,,
1,2,2015-03-01 15:00:00,,NaT,,,,
2,3,2015-03-07 19:00:00,,NaT,,,,
3,4,2015-03-08 18:00:00,,NaT,,,,
4,5,2015-03-14 19:00:00,,NaT,,,,
5,6,2015-03-15 18:00:00,,NaT,,,,
6,7,2015-01-23 14:00:00,,NaT,,,,
7,8,2015-05-24 14:00:00,,NaT,,,,
8,9,2015-03-15 18:00:00,,NaT,,,,
9,10,2016-01-31 14:00:00,,NaT,,,,


In [66]:
# Checking if it works
df_test_index.iloc[6260, :]

event_id                         6261
event_start_at    2015-03-22 14:00:00
user_id                             1
time_stamp        2015-03-19 09:15:50
action_type                         3
num_of_people                       1
payment_method                      0
total_price                      4000
Name: 6260, dtype: object

In [67]:
# Checking whti a fake user whose craeted_on date is later
df_test_user_2 = df_test_user.copy()
df_test_user_2['created_on'] = '2017-09-20 00:00:00'

In [69]:
# Getting logs of the fake user
log_user2 = df_log[df_log['user_id'].isin([df_test_user_2['user_id']])]
print(log_user2)

      user_id  event_id          time_stamp  action_type  num_of_people  \
0           1      6261 2015-03-19 09:15:50            3            1.0   
1           1    127600 2017-06-05 12:33:17            1         9999.0   
2           1    127600 2017-06-05 12:37:09            1         9999.0   
3           1    125296 2017-06-06 14:01:27            1         9999.0   
4           1    125128 2017-06-06 14:03:57            1         9999.0   
5           1    124841 2017-06-06 14:16:39            1         9999.0   
6           1    138061 2017-06-06 17:35:18            1         9999.0   
7           1    127103 2017-06-06 09:32:50            1         9999.0   
8           1    127600 2017-06-05 12:32:04            1         9999.0   
9           1    125305 2017-06-06 09:35:40            1         9999.0   
10          1    125305 2017-06-06 09:39:47            1         9999.0   
11          1    142900 2017-06-06 13:11:55            1         9999.0   
12          1    142900 2

In [72]:
# Merging test to find it's valid
df_test_index_2 = event_list
df_test_index_2 = pd.merge(df_test_index_2[event_list['event_start_at'] > df_test_user_2['created_on']],
                            log_user2[event_list['event_start_at'] > df_test_user_2['created_on']], on='event_id', how='left')
df_test_index_2

  after removing the cwd from sys.path.


Unnamed: 0,event_id,event_start_at,user_id,time_stamp,action_type,num_of_people,payment_method,total_price
0,143632,2017-09-20 17:30:00,,NaT,,,,
1,143677,2017-09-23 11:00:00,,NaT,,,,
2,143678,2017-09-24 11:00:00,,NaT,,,,
3,143679,2017-09-30 11:00:00,,NaT,,,,
4,143687,2017-09-23 13:30:00,,NaT,,,,
5,143688,2017-09-24 13:30:00,,NaT,,,,
6,143689,2017-09-30 13:30:00,,NaT,,,,
7,143697,2017-09-23 16:00:00,,NaT,,,,
8,143698,2017-09-24 16:00:00,,NaT,,,,
9,143699,2017-09-30 16:00:00,,NaT,,,,


In [73]:
df_test_index = df_test_index.drop(['event_start_at','time_stamp', 'num_of_people', 'payment_method', 'total_price'], axis=1)
df_test_index = df_test_index.iloc[:,[1, 0, 2]]
df_test_index['user_id'] = df_test_user['user_id']
df_test_index = df_test_index.fillna(0)
df_test_index                                   

Unnamed: 0,user_id,event_id,action_type
0,1,1,0.0
1,1,2,0.0
2,1,3,0.0
3,1,4,0.0
4,1,5,0.0
5,1,6,0.0
6,1,7,0.0
7,1,8,0.0
8,1,9,0.0
9,1,10,0.0


In [74]:
df_test_index.action_type.value_counts()

0.0    157894
1.0      1282
3.0         6
2.0         1
Name: action_type, dtype: int64

# Making an iterable algorithm working like above for each users

In [106]:
# I DON'T KNOW WHY THIS ERROR IS HAPPENING
df_users_test = df_users.iloc[:2, :]
df_users_test.iloc[1, -1] = '2017-09-20'
df_users_test

TypeError: 

In [107]:
# Making a test user dataframe and change [user_id = 2]'s created-on date
df_users_test = df_users.iloc[:2, :]
df_users_test.created_on[1] = '2017-09-20'
df_users_test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,user_id,age,gender,prefecture,created_on
0,1,34,1,21,2012-06-26
1,2,31,0,30,2017-09-20


In [178]:
for i, v in enumerate(df_users_test['user_id']):
    # Merging event_list and df_test_user whrere event_start_at > created_on 
    this_users_created_on = df_users_test.created_on[i]
    df_index = pd.DataFrame()
    df_index = event_list
    log_user = df_log[df_log['user_id'] == v]
    df_index = pd.merge(df_index[event_list.event_start_at > this_users_created_on],
                            log_user[event_list.event_start_at > this_users_created_on], on='event_id', how='left')

    # Droping log.tsv's info and fill action_type=NaN with 0
    df_index = df_index.drop(['event_start_at','time_stamp', 'num_of_people', 'payment_method', 'total_price'], axis=1)
    df_index = df_index.iloc[:,[1, 0, 2]]
    df_index['user_id'] = v
    df_index = df_index.fillna(0)

    # Adding info of user.tsv and event.tsv to df_index 
    df_index = pd.merge(df_index, df_users, on='user_id', how='left')
    df_index = pd.merge(df_index, df_events, on='event_id', how='left', suffixes=('_user', '_event'))
    
    df_index.to_csv('log_{}.csv'.format(v), index=False)


  


In [179]:
log_1 = pd.read_csv('log_1.csv')
log_1.head()

Unnamed: 0,user_id,event_id,action_type,age,gender,prefecture_user,created_on,female_age_lower,female_age_upper,male_age_lower,male_age_upper,event_start_at,prefecture_event,first_published_at,female_price,male_price,interest
0,1,1,0.0,34,1,21,2012-06-26,20,99.0,25,44.0,2015-03-08 18:00:00,3,1970-01-01 00:00:00.000009999,1500.0,5000.0,9999.0
1,1,2,0.0,34,1,21,2012-06-26,30,99.0,30,49.0,2015-03-01 15:00:00,15,1970-01-01 00:00:00.000009999,1000.0,6500.0,9999.0
2,1,3,0.0,34,1,21,2012-06-26,22,99.0,24,39.0,2015-03-07 19:00:00,15,1970-01-01 00:00:00.000009999,1000.0,6000.0,9999.0
3,1,4,0.0,34,1,21,2012-06-26,20,99.0,25,44.0,2015-03-08 18:00:00,14,1970-01-01 00:00:00.000009999,1000.0,6000.0,9999.0
4,1,5,0.0,34,1,21,2012-06-26,20,99.0,20,39.0,2015-03-14 19:00:00,14,1970-01-01 00:00:00.000009999,1000.0,6000.0,9999.0


In [180]:
log_1.columns

Index(['user_id', 'event_id', 'action_type', 'age', 'gender',
       'prefecture_user', 'created_on', 'female_age_lower', 'female_age_upper',
       'male_age_lower', 'male_age_upper', 'event_start_at',
       'prefecture_event', 'first_published_at', 'female_price', 'male_price',
       'interest'],
      dtype='object')

In [181]:
log_2 = pd.read_csv('log_2.csv')
log_2.head()

Unnamed: 0,user_id,event_id,action_type,age,gender,prefecture_user,created_on,female_age_lower,female_age_upper,male_age_lower,male_age_upper,event_start_at,prefecture_event,first_published_at,female_price,male_price,interest
0,2,143632,0.0,31,0,30,2017-09-20,22,38.0,25,39.0,2017-09-20 17:30:00,12,2017-08-10 11:18:13,1000.0,5500.0,9999.0
1,2,143677,0.0,31,0,30,2017-09-20,30,42.0,34,47.0,2017-09-23 11:00:00,39,2017-08-07 12:02:41,2000.0,3000.0,9999.0
2,2,143678,0.0,31,0,30,2017-09-20,32,39.0,36,43.0,2017-09-24 11:00:00,39,2017-08-07 12:29:31,2000.0,3000.0,9999.0
3,2,143679,0.0,31,0,30,2017-09-20,39,46.0,43,50.0,2017-09-30 11:00:00,39,2017-08-07 12:48:29,2000.0,3000.0,9999.0
4,2,143687,0.0,31,0,30,2017-09-20,32,39.0,36,43.0,2017-09-23 13:30:00,39,2017-08-07 12:09:54,2000.0,3000.0,9999.0


In [183]:
log_2.columns

Index(['user_id', 'event_id', 'action_type', 'age', 'gender',
       'prefecture_user', 'created_on', 'female_age_lower', 'female_age_upper',
       'male_age_lower', 'male_age_upper', 'event_start_at',
       'prefecture_event', 'first_published_at', 'female_price', 'male_price',
       'interest'],
      dtype='object')