In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import log_loss

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
def street1_eng(address):
    if '/' in address:
            j=address[address.index('/')+1:]
            l=address[:address.index('/')]
            if j>l:
                return j
            else:
                return l
    else:
        k=address.split(" ")
        return k[1]
            
def street2_eng(address):
    if '/' in address:
            j=address[address.index('/')+1:]
            l=address[:address.index('/')]
            if j>l:
                return l
            else:
                return j
    else:
        k=address.split(" ")
        return address[address.index('f')+1:]  

def minute_eng(minute):
    if minute>30:
        return minute-30
    else:
        return minute

def is_weekend(day):
    if day == 'Friday' or day == 'Saturday' or day == 'Sunday':
        return 1
    else:
        return 0 
    
def StreetNum_eng(address):
    if '/' in address:
        return 'junction'
    else:
        k=address.split(" ")
        return k[0]
     

In [4]:
def Preprocess(data):
#     count_vec = TfidfVectorizer(
#     max_df = 0.3,
#     min_df = 3,
#     lowercase = True,
#     ngram_range = (1,2),
#     analyzer = 'word'
#     )
#     data_count = count_vec.fit_transform(data.Address)
#     indices = pd.DataFrame(count_vec.get_feature_names())
    
#     n_comp = 50
#     svd_obj = TruncatedSVD(n_components = n_comp, algorithm = 'randomized')
#     svd_obj.fit(data_count)
#     data_svd = pd.DataFrame(svd_obj.transform(data_count))
#     data_svd.columns = ['svd_char_' + str(i) for i in range(n_comp)]
#     data = pd.concat([data, data_svd], axis=1)
#     del data_count, data_svd
    
    data = data[(data['X']< -121)]
    data = data[(data['Y']<40)]
    
    xyscaler=StandardScaler() 
    xyscaler.fit(data[["X","Y"]])
    data[["X","Y"]]=xyscaler.transform(data[["X","Y"]])
    
    data["X_1"] = .707* data["Y"] + .707* data["X"] 
    data["Y_1"] = .707* data["Y"] - .707* data["X"]
    data["X_2"] = (1.732/2)* data["X"] + (1./2)* data["Y"] 
    data["Y_2"] = (1.732/2)* data["Y"] - (1./2)* data["X"]
    data["X_3"] = (1./2)* data["X"] + (1.732/2)* data["Y"] 
    data["Y_3"] = (1./2)* data["Y"] - (1.732/2)* data["X"]
    data["radial_Distance"] = np.sqrt( np.power(data["Y"],2) + np.power(data["X"],2) )
    
    data['Dates'] = pd.to_datetime(data['Dates'])
    data['Year'] = data['Dates'].dt.year
    data['Month'] = data['Dates'].dt.month
    data['Day'] = data['Dates'].dt.day
    data['Hour'] = data['Dates'].dt.hour
    data['Minute'] = data['Dates'].dt.minute
    
    data['Minutes'] = data['Minute'].apply(minute_eng)
    
    data['Is_Weekend'] = data['DayOfWeek'].apply(is_weekend)
    
    data=data.drop(['Dates','Descript', 'Resolution'], axis=1)
    
    data['PdDistrict']=data['PdDistrict'].astype('category')
    data['PdDistrict']=data['PdDistrict'].cat.codes
    
    data["CrossRoad"] = data["Address"].str.contains("/")
    data["CrossRoad"]=data["CrossRoad"].astype('category')
    data["CrossRoad"]=data["CrossRoad"].cat.codes  
    
    data["AV"] = data["Address"].str.contains("AV")
    data["AV"]=data["AV"].astype('category')
    data["AV"]=data["AV"].cat.codes
    
    data["DayOfWeek"]=data["DayOfWeek"].astype('category')
    data["DayOfWeek"]=data["DayOfWeek"].cat.codes
    
    data['Street1'] = data['Address'].apply(street1_eng)
    data['Street2'] = data['Address'].apply(street2_eng)
    
    data["Street1"]=data["Street1"].astype('category')
    data["Street1"]=data["Street1"].cat.codes 
    
    data["Street2"]=data["Street2"].astype('category')
    data["Street2"]=data["Street2"].cat.codes
    
    data['StreetNo.'] = data['Address'].apply(StreetNum_eng)
    data["StreetNo."]=data["StreetNo."].astype('category')
    data["StreetNo."]=data["StreetNo."].cat.codes
    
    return data

In [5]:
train = Preprocess(train)
test = Preprocess(test)

In [6]:
train


Unnamed: 0,Category,DayOfWeek,PdDistrict,Address,X,Y,Id,X_1,Y_1,X_2,...,Day,Hour,Minute,Minutes,Is_Weekend,CrossRoad,AV,Street1,Street2,StreetNo.
0,SEX OFFENSES FORCIBLE,0,3,2100 Block of MISSION ST,0.135687,-0.197559,141546,-0.043743,-0.235605,0.018725,...,28,17,40,10,1,0,0,278,1269,14
1,LIQUOR LAWS,4,7,1000 Block of MARKET ST,0.451779,0.588986,794152,0.735820,0.097005,0.685733,...,19,2,46,16,0,0,0,278,1193,2
2,FRAUD,6,7,800 Block of BRYANT ST,0.765623,0.346893,531205,0.786549,-0.296042,0.836476,...,14,0,1,1,0,0,0,278,307,78
3,ROBBERY,4,0,2400 Block of SAN BRUNO AV,0.713787,-1.526104,523137,-0.574308,-1.583603,-0.144913,...,27,18,30,30,0,0,1,278,1632,17
4,OTHER OFFENSES,3,7,4TH ST / STEVENSON ST,0.693086,0.754293,200968,1.023297,0.043273,0.977359,...,9,17,2,2,1,1,0,65,1796,84
5,WARRANTS,6,3,15TH ST / NATOMA ST,0.172338,-0.011510,75749,0.113706,-0.129980,0.143490,...,14,6,14,14,0,1,0,10,1324,84
6,BURGLARY,6,1,0 Block of MARK LN,0.716342,0.968508,617521,1.191189,0.178281,1.104606,...,9,23,45,15,0,0,0,278,1192,0
7,SUSPICIOUS OCC,0,2,100 Block of PUTNAM ST,0.470323,-1.221296,186953,-0.530938,-1.195975,-0.203348,...,16,12,41,11,1,0,0,278,1498,1
8,OTHER OFFENSES,1,0,HALE ST / SAN BRUNO AV,0.666502,-1.410092,584559,-0.525718,-1.468152,-0.127855,...,5,23,53,23,0,1,1,683,1632,84
9,DRUG/NARCOTIC,5,4,ELLIS ST / LARKIN ST,0.199795,0.711709,653619,0.644433,0.361924,0.528877,...,7,7,41,11,0,1,0,532,1079,84


In [7]:
Features = list(train.columns.values)
Not_needed = ['Category','Address', 'Id', 'DayOfWeek', "Minute",'Street1', 'Street2', 'Day', 'Is_Weekend'] 
for i in Not_needed:
    Features.remove(i) 
Features    

['PdDistrict',
 'X',
 'Y',
 'X_1',
 'Y_1',
 'X_2',
 'Y_2',
 'X_3',
 'Y_3',
 'radial_Distance',
 'Year',
 'Month',
 'Hour',
 'Minutes',
 'CrossRoad',
 'AV',
 'StreetNo.']

In [9]:
x_train= train[Features]
y= train['Category']
test[Features].to_csv('Xtest.csv', encoding='utf-8', index=False)

In [9]:
label_encoded_y = LabelEncoder().fit_transform(y)

In [10]:
seed = 7
test_size = 0.15
X_train, X_test, y_train, y_test = train_test_split(x_train, label_encoded_y, test_size=test_size, random_state=seed)

In [11]:
model = XGBClassifier( 
                    max_depth=7, 
                    learning_rate=0.375, 
                    n_estimators=150,
                    gamma=0,
                    reg_alpha =0.1,
                    objective = 'multi:softprob',
                    booster='gbtree',
                    silent=True,
                    subsample = .8,
                    colsample_bytree = 0.8,
                    max_delta_step = 1,
                    n_jobs=-1,
                    random_state = 1711
#                        scale_pos_weight=.7
                     )

In [12]:
# param_grid={
#         'max_depth': [4,6,8],
# #         'gamma':[0.5,1,5],
# #         'min_child_weight':[1,5],
#         'learning_rate':[0.35],
#         'n_estimators':[36,80,110],
#         'subsample':[1],
#         'colsample_bytree':[.8]
#         }


# kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=7)
# grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold, verbose=1)
# grid_result = grid_search.fit(X_train, y_train, verbose = 1)

# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

In [13]:
model.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='mlogloss',early_stopping_rounds=5,
        verbose=True)

[0]	validation_0-mlogloss:2.90579
Will train until validation_0-mlogloss hasn't improved in 5 rounds.
[1]	validation_0-mlogloss:2.71181
[2]	validation_0-mlogloss:2.59365
[3]	validation_0-mlogloss:2.51763
[4]	validation_0-mlogloss:2.46047
[5]	validation_0-mlogloss:2.41928
[6]	validation_0-mlogloss:2.38751
[7]	validation_0-mlogloss:2.36169
[8]	validation_0-mlogloss:2.34149
[9]	validation_0-mlogloss:2.32632
[10]	validation_0-mlogloss:2.31329
[11]	validation_0-mlogloss:2.30259
[12]	validation_0-mlogloss:2.29391
[13]	validation_0-mlogloss:2.28708
[14]	validation_0-mlogloss:2.28136
[15]	validation_0-mlogloss:2.27636
[16]	validation_0-mlogloss:2.27262
[17]	validation_0-mlogloss:2.26918
[18]	validation_0-mlogloss:2.26575
[19]	validation_0-mlogloss:2.26328
[20]	validation_0-mlogloss:2.26076
[21]	validation_0-mlogloss:2.25867
[22]	validation_0-mlogloss:2.2568
[23]	validation_0-mlogloss:2.25491
[24]	validation_0-mlogloss:2.25352
[25]	validation_0-mlogloss:2.2521
[26]	validation_0-mlogloss:2.25078

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.375,
       max_delta_step=1, max_depth=7, min_child_weight=1, missing=None,
       n_estimators=150, n_jobs=-1, nthread=None,
       objective='multi:softprob', random_state=1711, reg_alpha=0.1,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.8)

In [14]:
x_test = test[Features]
arr = model.predict_proba(x_test)

col = ['ARSON','ASSAULT', 'BAD CHECKS','BRIBERY','BURGLARY','DISORDERLY CONDUCT',
       'DRIVING UNDER THE INFLUENCE','DRUG/NARCOTIC','DRUNKENNESS','EMBEZZLEMENT',
       'EXTORTION','FAMILY OFFENSES','FORGERY/COUNTERFEITING','FRAUD','GAMBLING',
       'KIDNAPPING','LARCENY/THEFT','LIQUOR LAWS','LOITERING','MISSING PERSON','NON-CRIMINAL',
       'OTHER OFFENSES','PROSTITUTION','RECOVERED VEHICLE','ROBBERY','RUNAWAY','SECONDARY CODES',
       'SEX OFFENSES FORCIBLE','STOLEN PROPERTY','SUICIDE','SUSPICIOUS OCC','TRESPASS','VANDALISM',
       'VEHICLE THEFT','WARRANTS','WEAPON LAWS'
]
result = pd.DataFrame(data = arr, columns = col )

In [15]:
result.insert(0,"Id" , test['Id'])
result.to_csv('Result.csv', encoding='utf-8', index=False)

In [16]:
imp_arr = model.feature_importances_
for i in range(len(Features)):
    print(Features[i],imp_arr[i])

PdDistrict 0.0031691303
X 0.028792342
Y 0.027235651
svd_char_0 0.016488912
svd_char_1 0.013246879
svd_char_2 0.0134012615
svd_char_3 0.010566627
svd_char_4 0.0091085695
svd_char_5 0.00921578
svd_char_6 0.009485949
svd_char_7 0.010433686
svd_char_8 0.010583781
svd_char_9 0.00964462
svd_char_10 0.009520256
svd_char_11 0.00894561
svd_char_12 0.009318701
svd_char_13 0.009413046
svd_char_14 0.009721811
svd_char_15 0.010099191
svd_char_16 0.009511679
svd_char_17 0.009571717
svd_char_18 0.009653197
svd_char_19 0.009477372
svd_char_20 0.008932745
svd_char_21 0.0086883055
svd_char_22 0.007942123
svd_char_23 0.008418136
svd_char_24 0.009520256
svd_char_25 0.009039955
svd_char_26 0.008499616
svd_char_27 0.009807579
svd_char_28 0.009383027
svd_char_29 0.010073461
svd_char_30 0.009357297
svd_char_31 0.008726901
svd_char_32 0.008859842
svd_char_33 0.009370162
svd_char_34 0.009554563
svd_char_35 0.01045084
svd_char_36 0.010124921
svd_char_37 0.010214978
svd_char_38 0.010626664
svd_char_39 0.008979917

In [None]:
# model1= BernoulliNB()
# model1.fit(X_train, y_train)
# predicted = model1.predict_proba(X_test)
# print(("Log loss ") + str(log_loss(y_test, predicted)))

In [None]:
# from sklearn.linear_model import LogisticRegression

# model2 = LogisticRegression(
#     random_state = 1711,
#     max_iter = 200,
#     verbose = 1,
#     n_jobs = -1,
#     solver = 'sag',
#     multi_class = 'multinomial'
# )

In [None]:
# model2.fit(X_train, y_train)

In [None]:
# predicted = model2.predict_proba(X_test)
# print(("Log loss ") + str(log_loss(y_test, predicted)))

In [None]:
# from sklearn.ensemble import VotingClassifier
# final_model = VotingClassifier(estimators=[
#              ('XGB', model), ('LR', model2)], voting='soft', weights = [1,1])
# final_model.fit(X_train, y_train)
# predicted = final_model.predict_proba(X_test)
# print(("Log loss ") + str(log_loss(y_test, predicted)))

In [None]:
# # predicted
# result = pd.DataFrame(data = arr, columns = col )
# # result
# result.insert(0,"Id" , test['Id'])
# print(result)
# result.to_csv('Result1.csv', encoding='utf-8', index=False)