In [191]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
import random
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

np.set_printoptions(suppress=True)

In [192]:
header  = ["Click", "Weekday", "Hour", "Timestamp", "Log Type", "User ID", "User‐Agent", "IP", "Region", "City", "Ad Exchange", "Domain", "URL", "Anonymous URL ID", "Ad slot ID", "Ad slot width", "Ad slot height", "Ad slot visibility", "Ad slot format", "Ad slot floor price (RMB/CPM)", "Creative ID", "Key Page URL", "Advertiser ID", "User Tags"]
print(header)

['Click', 'Weekday', 'Hour', 'Timestamp', 'Log Type', 'User ID', 'User\xe2\x80\x90Agent', 'IP', 'Region', 'City', 'Ad Exchange', 'Domain', 'URL', 'Anonymous URL ID', 'Ad slot ID', 'Ad slot width', 'Ad slot height', 'Ad slot visibility', 'Ad slot format', 'Ad slot floor price (RMB/CPM)', 'Creative ID', 'Key Page URL', 'Advertiser ID', 'User Tags']


In [193]:
trainfile = 'train.txt'
testfile = 'testhead.txt'

n = 2847802
s = 100000
skip = sorted(random.sample(xrange(n),n-s))

train = pd.read_csv(trainfile, header = None, sep = '\t', names = header, skiprows=skip)
#test = pd.read_csv(testfile, header = None, sep = '\t', names = header[1:])

### Show head lines of files
print(train.head())
#print(test.head())

### output to csv file
# train.head().to_csv('out.csv', sep = '\t')

   Click  Weekday  Hour          Timestamp  Log Type          User ID  \
0      0        6     0  20130606000105010         1  Vhk7ZAnyOHTVebl   
1      0        6     0  20130606000105592         1  VhTKLasLOQb9jsn   
2      0        6     0  20130606000105664         1  Vhk7ZAnyDtKZjhl   
3      0        6     0  20130606000105696         1  VhkrCksLO6d2eI2   
4      0        6     0  20130606000105745         1  Vh1h1wj4OIBWjYk   

   User‐Agent             IP  Region  City  \
0  windows_ie  125.220.161.*     183   184   
1  windows_ie   111.122.64.*     298   301   
2  windows_ie  124.229.159.*     201   212   
3  windows_ie    218.81.48.*      79    79   
4  windows_ie  124.117.200.*     374   375   

                         ...                          \
0                        ...                           
1                        ...                           
2                        ...                           
3                        ...                           
4   

In [136]:
### Check the structure of input files
# print(train.describe())
# print(train.shape)

In [7]:
#len(train[train["Click"] == 1])

65

In [194]:
### Test for pd.get_dummies
### Expand Weekdays
weekdays = pd.get_dummies(train["Weekday"], prefix = "Weekday")
weekdays.head()

Unnamed: 0,Weekday_0,Weekday_1,Weekday_2,Weekday_3,Weekday_4,Weekday_5,Weekday_6
0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1


In [195]:
### Expand Hour
hour = pd.get_dummies(train["Hour"], prefix = "Hour")
hour.head()

Unnamed: 0,Hour_0,Hour_1,Hour_2,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9,...,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [196]:
### Expand OS and Browser
os_and_browser = train["User‐Agent"].str.split("_", expand = True)
os = os_and_browser[0]
browser = os_and_browser[1]
os = pd.get_dummies(os, prefix="OS")
browser = pd.get_dummies(browser, prefix="Browser")
browser.head()


Unnamed: 0,Browser_chrome,Browser_firefox,Browser_ie,Browser_maxthon,Browser_opera,Browser_other,Browser_safari,Browser_theworld
0,0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0,0
2,0,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0,0
4,0,0,1,0,0,0,0,0


In [197]:
### Expand floor price
def price_transfer(x):
  if x == 0:
    return '0'
  elif x >= 1 and x <= 10:
    return '1-10'
  elif x >= 11 and x <= 50:
    return '11-50'
  elif x >= 51 and x <= 100:
    return '51-100'
  else:
    return 'larger'

### Directly use dataframe method 'apply' to process series, instead of using map and lambda
floor_price = train["Ad slot floor price (RMB/CPM)"].astype(int).apply(price_transfer)
# floor_price = map(lambda price: price_transfer(price), train["Ad slot floor price (RMB/CPM)"].astype(int))

floor_price = pd.get_dummies(floor_price, prefix = "Price")

floor_price.head()
# type(floor_price)




Unnamed: 0,Price_0,Price_1-10,Price_11-50,Price_51-100,Price_larger
0,0,1,0,0,0
1,1,0,0,0,0
2,0,0,0,0,1
3,0,1,0,0,0
4,0,0,0,1,0


In [68]:
# ### Expand User tags
# print train["User Tags"].head()
# user_list = []
# for tag in train["User Tags"]:
#     if tag == 'null':
#         pass
#     else:
#         taglist = tag.strip().split(',')
#         for t in taglist:
#             if t not in user_list:
#                 user_list.append(t)
        
# print user_list

0                      10006,10057,14273
1                      10063,10067,10006
2                                   null
3                      10052,10006,13800
4    13800,10006,10093,10075,10063,10111
Name: User Tags, dtype: object
['10006', '10057', '14273', '10063', '10067', '10052', '13800', '10093', '10075', '10111', '10024', '13866', '10110', '13776', '10048', '10031', '10059', '11944', '13403', '10083', '10684', '10076', '13678', '13042', '11379', '10077', '11423', '16661', '11278', '10079', '11724', '10074', '16617', '13496', '10102', '11576', '11092', '16706', '16593', '11680', '13874', '11632', '11512']


In [198]:
mytrain = pd.concat([weekdays, hour, os, browser, floor_price], axis = 1)
mytrain.shape
mytrain.head()

Unnamed: 0,Weekday_0,Weekday_1,Weekday_2,Weekday_3,Weekday_4,Weekday_5,Weekday_6,Hour_0,Hour_1,Hour_2,...,Browser_maxthon,Browser_opera,Browser_other,Browser_safari,Browser_theworld,Price_0,Price_1-10,Price_11-50,Price_51-100,Price_larger
0,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [211]:
### Logistic Regression
# mytrain = train.ix[:, [1,2,8,9,10,15,16,17,18,19]]
# mytest = test.ix[:, [0,1,7,8,9,14,15,16,17,18]]
#print(mytest.describe())

# newtrain = mytrain.astype(str)
# newtrain = pd.get_dummies(newtrain)
# newtrain.shape


X,x,Y,y = train_test_split(mytrain, train["Click"], test_size = 0.1)
     
lr = LogisticRegression(C=0.1)
lr.fit(X, Y)
# print("The result of validation test is : %f" % lr.score(x,y))
### Predict class label for test samples
result = lr.predict_proba(x)
# print result.tolist()

### Predict probability for each class label
#result = lr.predict_proba(mytest)

#print(result[:100, 1])

In [212]:
print(result[:,1])

[ 0.0010136   0.00061848  0.00077711 ...,  0.00056158  0.0007984
  0.00111377]


In [222]:
x = result[:,1]
# print(y)

### AUC Evaluation
auc = roc_auc_score(y, x)

0.19571957195719572

In [155]:
### GBRT feature engineering
def city_transfer(x, dict):
    return dict[int(x)]


counts = train["City"].value_counts()
counts = dict(counts)
city = train["City"].apply(city_transfer, args=(counts,))

counts = train["Region"].value_counts()
counts = dict(counts)
region = train["Region"].apply(city_transfer, args=(counts,))
# print(city_transfer(1, counts))

mytrain = pd.concat([weekdays, city,region], axis = 1)
mytrain.head()


Unnamed: 0,Weekday_0,Weekday_1,Weekday_2,Weekday_3,Weekday_4,Weekday_5,Weekday_6,City,Region
0,0,0,0,0,0,0,1,1639,3090
1,0,0,0,0,0,0,1,2042,5584
2,0,0,0,0,0,0,1,930,5798
3,0,0,0,0,0,0,1,3439,11828
4,0,0,0,0,0,0,1,2922,11828


In [160]:
### GBRT

X,x,Y,y = train_test_split(mytrain, train["Click"], test_size = 0.1)
     
# lr = LogisticRegression(C=0.1)
# lr.fit(X, Y)
# print("The result of validation test is : %f" % lr.score(x,y))
### Predict class label for test samples
# result = lr.predict_proba(x)
# gbrt = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X, Y)
gbrt = GradientBoostingRegressor(n_estimators=50, learning_rate=0.05,max_depth=5).fit(X, Y)
result = gbrt.predict(x)


In [162]:
print max(result)
print(result)

0.165817410448
[ 0.00069302  0.0012038   0.00014953 ...,  0.00070083  0.00034424
  0.00030705]


In [121]:
print(max(result[:,1]))
result[:]
for i in y:
    if i == 1:
        print i

IndexError: too many indices for array