In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gzip
from sklearn.decomposition import PCA

In [8]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [9]:
from sklearn.preprocessing import LabelEncoder

# Encoder mapping PdDistrict values to integer codes; fitted on the training
# rows only and reused by loadData for every frame it processes.
le = LabelEncoder()
le.fit(train["PdDistrict"])

In [10]:
### Address suffix
# The 12 most frequent final tokens (text after the last space) of the training
# addresses, ordered from most to least common.
# [-1] instead of [1]: an address that contains no space yields itself as its
# own final token rather than raising IndexError.
addr_short_list = train.Address.apply(lambda x: x.rsplit(' ', 1)[-1]).value_counts()[:12].keys()
# Code returned by addr_transfer for addresses matching none of the suffixes.
addr_length = len(addr_short_list)
def addr_transfer(addr):
    """Encode an address by the first frequent suffix it ends with.

    Scans ``addr_short_list`` in order and returns the position of the first
    entry that ``addr`` ends with. When no entry matches, returns
    ``addr_length`` (one past the last valid position) as the "other" code.
    """
    matching_ranks = (
        rank
        for rank, suffix in enumerate(addr_short_list)
        if addr.endswith(suffix)
    )
    return next(matching_ranks, addr_length)

In [11]:
### Build the model feature matrix for either the train or the test frame
def loadData(df, test = None):
    """Turn a raw records frame into a feature matrix (and labels).

    The caller's frame is left untouched: all work happens on a copy, so the
    cell that calls this function can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``Dates``, ``Address``, ``PdDistrict``,
        ``DayOfWeek``, ``X`` and ``Y``. When ``test`` is falsy it must also
        contain ``Descript``, ``Resolution`` and ``Category``; when ``test``
        is truthy it must contain ``Id``.
    test : bool, optional
        Truthy to process a frame without labels (drops ``Id``, returns
        ``y = None``); falsy (default) to process a labelled frame. Pass a
        plain flag, not a DataFrame: this value is evaluated with ``if test``.

    Returns
    -------
    (X, y) : tuple
        ``X`` is the processed DataFrame of features; ``y`` is the
        ``Category`` column for labelled frames and ``None`` otherwise.

    Notes
    -----
    Relies on the module-level ``le`` (fitted LabelEncoder) and
    ``addr_transfer``.
    """
    # Copy first so none of the column additions/removals below leak into the
    # frame the caller passed in.
    df = df.copy()

    # --- Date/time parts -------------------------------------------------
    dt = pd.to_datetime(df.Dates).dt
    df["Year"] = dt.year
    df["Month"] = dt.month
    df["Day"] = dt.day
    df["Hour"] = dt.hour
    df["Minute"] = dt.minute
    # Distance of the minute from the half hour.
    df["MinuteAbs"] = abs(dt.minute - 30)
    # Minutes elapsed since midnight.
    df['Time'] = dt.hour * 60 + dt.minute

    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` is its replacement. Fall back to the old accessor
    # on pandas versions that predate `isocalendar`.
    if hasattr(dt, "isocalendar"):
        iso_week = dt.isocalendar().week
        # isocalendar() returns a nullable UInt32 column; convert it to a plain
        # NumPy dtype like the other date parts (float only if dates are missing).
        df["Week"] = iso_week.astype("float64" if iso_week.isna().any() else "int64")
    else:
        df["Week"] = dt.week
    df = df.drop("Dates", axis = 1)

    # --- Address features ------------------------------------------------
    # The optional `.?` on both sides match zero characters too, so this flags
    # every address that contains the substring "of".
    df["AddressIsOf"]= df.Address.str.contains('.?of.?')
    df["AddressShort"] = df["Address"].apply(addr_transfer)
    df = df.drop("Address", axis = 1)

    # --- Categorical encodings -------------------------------------------
    df["PdDistrict"] = le.transform(df.PdDistrict)

    dow = {
        "Monday": 0,
        "Tuesday": 1,
        "Wednesday": 2,
        "Thursday": 3,
        "Friday": 4,
        "Saturday": 5,
        "Sunday": 6
    }

    # Day names missing from the mapping become NaN.
    df["DayOfWeek"] = df.DayOfWeek.map(dow)

    # --- Coordinates -----------------------------------------------------
    # NOTE(review): the PCA is re-fitted on every call, so the train and test
    # frames are projected with independently fitted components — confirm that
    # this is intended.
    pca = PCA(n_components=2)
    res = pca.fit_transform(df[["X", "Y"]])
    # Assign the arrays positionally. Wrapping them in pd.Series would align
    # on a fresh 0..n-1 index and produce NaN for any frame whose index is not
    # the default RangeIndex.
    df["X"] = res[:, 0]
    df["Y"] = res[:, 1]

    # --- Split features from labels / identifiers --------------------------
    if test:
        df = df.drop("Id", axis = 1)
        y = None
    else:
        df = df.drop(["Descript", "Resolution"], axis = 1)
        # pop() returns the label column and removes it from the features.
        y = df.pop("Category")

    X = df

    return X, y


In [12]:
### Build the training features and labels
# loadData is called without the `test` flag, so it returns the processed
# feature frame as X and the Category column as y.
X, y = loadData(train)

In [13]:
### Validation (disabled draft; the active split is done in a later cell)
# from sklearn.cross_validation import train_test_split
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.metrics import log_loss

# NOTE(review): the draft below refers to `Y`, but the labels are bound to `y`.
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
# Preview the first rows of the processed feature frame.
X.head()

Unnamed: 0,DayOfWeek,PdDistrict,X,Y,Year,Month,Day,Hour,Minute,MinuteAbs,Time,Week,AddressIsOf,AddressShort
0,2,4,-0.003454,0.003406,2015,5,13,23,53,23,1433,20,False,0
1,2,4,-0.003454,0.003406,2015,5,13,23,53,23,1433,20,False,0
2,2,4,-0.029309,0.00284,2015,5,13,23,33,3,1413,20,False,0
3,2,4,-0.029669,0.005488,2015,5,13,23,30,0,1410,20,True,0
4,2,5,8e-05,0.016129,2015,5,13,23,30,0,1410,20,True,0


In [None]:
### Validation for xgboost
try:
    # train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
    # the old sklearn.cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for scikit-learn installations older than 0.18.
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
import xgboost as xgb

# NOTE(review): test_size=0.7 holds out 70% of the rows, so the model below is
# fitted on only 30% of the data — confirm this is deliberate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

#gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(X_train, y_train)
# multi:softprob makes the classifier output one probability per class, which
# is what predict_proba / log_loss consume in the following cells.
gbm = xgb.XGBClassifier(objective="multi:softprob", max_depth=3).fit(X_train, y_train)

In [15]:
#preds = gbm.predict(X_test)
# Per-class probabilities for the held-out rows, scored with multiclass log loss.
preds = gbm.predict_proba(X_test)
holdout_log_loss = log_loss(y_test, preds)
print(holdout_log_loss)

2.37308204638


In [None]:
# Recompute the held-out class probabilities and print their log loss.
preds = gbm.predict_proba(X_test)
print(log_loss(y_pred=preds, y_true=y_test))

In [None]:
# Column labels come from the fitted xgboost model `gbm`; the previous `rf`
# is not defined anywhere in this notebook and raised NameError.
# NOTE(review): `preds` at this point holds the probabilities predicted for the
# validation split X_test, not for the rows of test.csv — confirm that this is
# what should be written to the submission file.
submission = pd.DataFrame(preds, columns = gbm.classes_)
# The context manager flushes and closes the gzip stream; without it the
# archive could be left incomplete on disk.
with gzip.open('sub.csv.gz', 'wt') as sub_file:
    submission.to_csv(sub_file, index = True, index_label = 'Id')

In [None]:
### backup code  
#     data = pd.DataFrame(index = range(len(df)))
    
#     ### Seems no improvement with features chosen from 'Address'
#     #addr_feats = ["Addr#" + str(i) for i in range(10)]
#     #data = df.get(['X','Y'] + addr_feats)
#     data = df.get(['X','Y'])
    
#     ### Dates
#     date_time = pd.to_datetime(df.Dates)
#     data['Year'] = date_time.dt.year
#     data['Month'] = date_time.dt.month
#     data['Day'] = date_time.dt.day
#     data['Hour'] = date_time.dt.hour
#     data['Time'] = data['Hour'] * 60 + date_time.dt.minute

#     ### District to discrete values
#     district_counts = df.PdDistrict.value_counts()
#     district_counts.sort()
#     keylist = district_counts.keys()
#     dict = {}
#     for i in range(len(keylist)):
#         dict[keylist[i]] = i
#     data['District'] = df.PdDistrict.map(dict)

#     ### Weekdays
#     district_counts = df.DayOfWeek.value_counts()
#     district_counts.sort()
#     keylist = district_counts.keys()
#     dict = {}
#     for i in range(len(keylist)):
#         dict[keylist[i]] = i
#     data['DayOfWeek'] = df.DayOfWeek.map(dict)
    
#     X = data.values
#     Y = None
#     if "Category" in df.columns:
#         Y = df.Category.values
        
#     return X, Y
