In [41]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
import random
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error

np.set_printoptions(suppress=True)

In [42]:
### Column names for the tab-separated log files, in file order.
### NOTE: "User‐Agent" is spelled with a non-ASCII hyphen (it prints as
### \xe2\x80\x90), so later lookups must use exactly the same character.
header = [
    "Click",
    "Weekday",
    "Hour",
    "Timestamp",
    "Log Type",
    "User ID",
    "User‐Agent",
    "IP",
    "Region",
    "City",
    "Ad Exchange",
    "Domain",
    "URL",
    "Anonymous URL ID",
    "Ad slot ID",
    "Ad slot width",
    "Ad slot height",
    "Ad slot visibility",
    "Ad slot format",
    "Ad slot floor price (RMB/CPM)",
    "Creative ID",
    "Key Page URL",
    "Advertiser ID",
    "User Tags",
]
print(header)

['Click', 'Weekday', 'Hour', 'Timestamp', 'Log Type', 'User ID', 'User\xe2\x80\x90Agent', 'IP', 'Region', 'City', 'Ad Exchange', 'Domain', 'URL', 'Anonymous URL ID', 'Ad slot ID', 'Ad slot width', 'Ad slot height', 'Ad slot visibility', 'Ad slot format', 'Ad slot floor price (RMB/CPM)', 'Creative ID', 'Key Page URL', 'Advertiser ID', 'User Tags']


In [43]:
### Input files
trainfile, testfile = 'train_sample.txt', 'test.txt'

# Both files are read as tab-separated text without a header row. The test
# frame is given every column name except the first one ("Click").
train = pd.read_csv(trainfile, sep='\t', header=None, names=header)
test = pd.read_csv(testfile, sep='\t', header=None, names=header[1:])

# Row counts of the two frames
trainlen, testlen = len(train), len(test)

### Show head lines of files
print(train.head())

   Click  Weekday  Hour          Timestamp  Log Type          User ID  \
0      0        6     0  20130606000104589         1  VhkS1DK53UjsBVL   
1      0        6     0  20130606000104622         1  VhL0O5FD32m63hl   
2      0        6     0  20130606000104809         1  Vhk7ZAnxD9lfQoL   
3      0        6     0  20130606000104878         1  Vhk7ZAnx3Tmdjda   
4      0        6     0  20130606000104991         1  Vhk7ZAnx3cB9tbc   

   User‐Agent             IP  Region  City  \
0  windows_ie  221.228.142.*      80    82   
1  windows_ie     221.10.5.*     276   277   
2  windows_ie    60.160.94.*     308   321   
3  windows_ie    210.21.84.*     216   217   
4  windows_ie  222.243.167.*     201   204   

                  ...                                     Ad slot ID  \
0                 ...                    mm_14539978_2071324_8355258   
1                 ...                   mm_24597501_3474831_11374379   
2                 ...                    mm_27762412_2408764_9403472

In [44]:
### Combine train and test to re-factor the structure and build binary features for logistic regression
# Column 0 of `train` is the "Click" label; it is dropped so both frames
# contribute the same feature columns to the stacked frame.
# print() is called as a function (single argument) so the cell produces the
# same output on Python 2 and also runs on Python 3.
print(train.iloc[:, 1:].shape)
print(test.shape)
# Rows are stacked with their original index labels (no ignore_index), so
# labels from the two frames overlap; select rows by position (.iloc).
all_data = pd.concat([train.iloc[:, 1:], test], axis=0)
print(all_data.shape)

(500000, 23)
(545421, 23)
(1045421, 23)


In [45]:
### One-hot encode the categorical columns of the combined data

### Weekday -> indicator columns prefixed "Weekday"
weekdays = pd.get_dummies(all_data["Weekday"], prefix="Weekday")

### Hour -> indicator columns prefixed "Hour"
hour = pd.get_dummies(all_data["Hour"], prefix="Hour")

### Split the user agent on "_" into two columns: column 0 is treated as the
### OS and column 1 as the browser (e.g. "windows_ie").
### NOTE: `os` below is a pandas Series, not the standard-library module.
os_and_browser = all_data["User‐Agent"].str.split("_", expand=True)
os, browser = os_and_browser[0], os_and_browser[1]
os_dummies, browser_dummies = (
    pd.get_dummies(os, prefix="OS"),
    pd.get_dummies(browser, prefix="Browser"),
)

### Expand floor price
def price_transfer(x):
    """Map a floor price to the label of its price bucket.

    Returns '0' for a price equal to 0, '1-10', '11-50' or '51-100' when the
    price lies inside that inclusive range, and 'larger' for every other
    value (anything above 100, but also values that fall in none of the
    ranges, such as negatives or fractions between two buckets).
    """
    if x == 0:
        return '0'
    # Inclusive (low, high) bounds paired with the label they produce.
    buckets = ((1, 10, '1-10'), (11, 50, '11-50'), (51, 100, '51-100'))
    for low, high, label in buckets:
        if low <= x <= high:
            return label
    return 'larger'

### Bucket the floor price with Series.apply (after casting it to int so the
### values are whole numbers), then one-hot encode the bucket labels.
floor_price = pd.get_dummies(
    all_data["Ad slot floor price (RMB/CPM)"].astype(int).apply(price_transfer),
    prefix="Price",
)

In [46]:
### Put all indicator columns side by side, then cut the rows back into train and test
all_data_dummies = pd.concat(
    [weekdays, hour, os_dummies, browser_dummies, floor_price],
    axis=1,
)
# The first `trainlen` rows are the training rows; the remainder are test rows.
# NOTE: `test` is rebound here from the raw test frame to its feature matrix.
mytrain, test = all_data_dummies.iloc[:trainlen], all_data_dummies.iloc[trainlen:]
print(mytrain.shape)
mytrain.head()

(500000, 51)


Unnamed: 0,Weekday_0,Weekday_1,Weekday_2,Weekday_3,Weekday_4,Weekday_5,Weekday_6,Hour_0,Hour_1,Hour_2,...,Browser_opera,Browser_other,Browser_safari,Browser_sogou,Browser_theworld,Price_0,Price_1-10,Price_11-50,Price_51-100,Price_larger
0,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [47]:
### Load the user-tag feature tables (space-separated) for train and test
usertags_train = pd.read_csv('usertags_train.txt', sep=' ')
usertagsfile = 'usertags_test.txt'
usertags_test = pd.read_csv(usertagsfile, sep=' ')

# Append the tag columns to the existing feature columns (column-wise concat).
mytrain, test = (
    pd.concat([mytrain, usertags_train], axis=1),
    pd.concat([test, usertags_test], axis=1),
)
test.shape

(545421, 118)

In [48]:
### Logistic Regression
# Hold out 10% of the training rows for evaluation. The fixed random_state
# makes the split, and therefore the metrics computed from it, reproducible
# from one run to the next.
# X / Y are the fitting features / labels, x / y the held-out ones.
X, x, Y, y = train_test_split(mytrain, train["Click"], test_size=0.1, random_state=42)

lr = LogisticRegression(C=0.1)
lr.fit(X, Y)
# Class probabilities for the held-out rows, one column per class.
result = lr.predict_proba(x)

In [49]:
### ROC AUC on the held-out rows, scored with column 1 of the predicted probabilities
# NOTE: this rebinds `auc`, hiding the sklearn.metrics.auc function imported above.
auc = roc_auc_score(y_true=y, y_score=result[:, 1])
print(auc)

0.671194649036


In [50]:
### Mean squared error between held-out labels and column 1 of the predicted probabilities
mse = mean_squared_error(y_true=y, y_pred=result[:, 1])
print(mse)

0.000713610092193


In [79]:
### Make Prediction for test set
# Keep only column 1 of the predicted class probabilities for each test row.
preds_lr =  lr.predict_proba(test)[:,1]

def write_prediction(filename, preds):
    """Write predictions to ``filename`` as a two-column CSV file.

    The first line is the header ``Id,Prediction``. Every following line
    holds a 1-based row id and the prediction formatted with five decimals.

    Parameters
    ----------
    filename : path of the file to create; an existing file is overwritten.
    preds : iterable of numbers (list, NumPy array, Series, generator, ...),
        written in iteration order.
    """
    # The context manager closes the file even when formatting a value raises.
    with open(filename, "w") as submission:
        submission.write("Id,Prediction\n")
        # Iterate by position rather than indexing preds[i], so inputs without
        # len()/integer indexing (generators, label-indexed Series) also work.
        for row_id, pred in enumerate(preds, 1):
            submission.write("%d,%.5f\n" % (row_id, pred))

# Save the logistic-regression predictions in the Id,Prediction CSV format.
write_prediction("submission_lr.csv", preds_lr)

In [56]:
### GBRT feature engineering
def transfer(x, dict):
    """Return the value stored under key ``x`` in the mapping ``dict``.

    Raises KeyError when ``x`` is not a key of the mapping. Inside this
    function the parameter name hides the builtin ``dict`` type.
    """
    looked_up = dict[x]
    return looked_up


# Frequency-encode City: each value is replaced by its number of occurrences in all_data.
counts = dict(all_data["City"].value_counts())
city = all_data["City"].apply(transfer, args=(counts,))

# Same encoding for Region; `counts` is rebound to the Region table.
counts = dict(all_data["Region"].value_counts())
region = all_data["Region"].apply(transfer, args=(counts,))

In [61]:
# Frequency-encode the OS column: value -> number of occurrences.
counts = dict(os.value_counts())
os_freq = os.apply(transfer, args=(counts,))

# Same for the browser column; `counts` is rebound to the browser table.
counts = dict(browser.value_counts())
browser_freq = browser.apply(transfer, args=(counts,))


In [65]:
# Feature matrix for the GBRT model: raw Weekday/Hour/floor price plus the
# frequency-encoded OS, browser, City and Region columns, side by side.
gbrt_all_data = pd.concat(
    [
        all_data["Weekday"],
        all_data["Hour"],
        os_freq,
        browser_freq,
        city,
        region,
        all_data["Ad slot floor price (RMB/CPM)"],
    ],
    axis=1,
)
gbrt_all_data.shape

(1045421, 7)

In [66]:
### Cut the GBRT feature matrix by position: the first `trainlen` rows are train, the rest test
gbrt_train, gbrt_test = gbrt_all_data.iloc[:trainlen], gbrt_all_data.iloc[trainlen:]
print(gbrt_train.shape)
gbrt_test.head()

(500000, 7)


Unnamed: 0,Weekday,Hour,0,1,City,Region,Ad slot floor price (RMB/CPM)
0,1,6,1019875,337416,1161,21820,0
1,6,0,1019875,658381,17457,32077,5
2,1,16,1019875,658381,15665,83404,0
3,6,23,1019875,658381,24669,61313,5
4,6,20,1019875,337416,53926,53926,20


In [76]:
### GBRT
# Hold out 10% of the training rows for evaluation. The fixed random_state
# keeps the split reproducible from one run to the next.
# X / Y are the fitting features / labels, x / y the held-out ones.
X, x, Y, y = train_test_split(gbrt_train, train["Click"], test_size=0.1, random_state=42)

### Based on the Paper "RTB benchmarking with iPingyou dataset" to set attributes
# The model is a regressor fitted on the Click labels, so its predictions are
# real-valued scores. random_state is also fixed on the model so that any
# randomness inside the fit is repeatable.
gbrt = GradientBoostingRegressor(
    n_estimators=50,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
).fit(X, Y)
gbrt_result = gbrt.predict(x)

In [77]:
### ROC AUC of the GBRT scores on the held-out rows
# NOTE: this rebinds `auc`, hiding the sklearn.metrics.auc function imported above.
auc = roc_auc_score(y_true=y, y_score=gbrt_result)
print(auc)

0.691459333476


In [78]:
### Mean squared error of the GBRT scores on the held-out rows
mse = mean_squared_error(y_true=y, y_pred=gbrt_result)
print(mse)

0.000938282332524


In [82]:
### Predict and Write to prediction file
# Score the test rows with the fitted GBRT model.
preds_gbrt = gbrt.predict(gbrt_test)
# Save the scores in the Id,Prediction CSV format.
write_prediction("submission_gbrt.csv", preds_gbrt)