In [32]:
import pickle

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

In [4]:
# Read the train and test CSV files with pandas, parsing the 'Dates' column
# into datetime values while loading.
train, test = (
    pd.read_csv(csv_path, parse_dates=['Dates'])
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [22]:
# Count how often each address occurs within each crime category. These counts
# are the raw material for per-address category probabilities, conceptually:
#
#   table:    cat1  cat2  cat3
#   address   0.3   0.2   0.5
#
# Layout: cat_addr[category][address] -> number of rows in `train`.

# Every distinct category gets its own (initially empty) address -> count dict.
cats = list(train["Category"].unique())
cat_addr = {cat: {} for cat in cats}

# Walk the two columns in lockstep instead of DataFrame.iterrows(), which
# builds a full Series object for every row just to read two fields.
for cat, addr in zip(train["Category"], train["Address"]):
    addr_counts = cat_addr[cat]
    addr_counts[addr] = addr_counts.get(addr, 0) + 1


In [101]:
def addr_prob(addr):
    """Return the per-category probability vector for a single address.

    For every category in the module-level ``cat_addr`` mapping (visited in
    ``cat_addr.values()`` order, so positions line up with
    ``cat_addr.keys()``), the count recorded for ``addr`` is looked up and
    divided by the address's total count over all categories.

    Parameters
    ----------
    addr : hashable
        Address value used as the key into each category's counts dict.

    Returns
    -------
    list of float
        One probability per category, summing to 1. An address with no
        recorded count in any category gets a uniform distribution, because
        an all-zero row is not a valid probability vector for log-loss
        scoring. An empty list is returned when ``cat_addr`` is empty.
    """
    counts = [addr_counts.get(addr, 0) for addr_counts in cat_addr.values()]
    if not counts:
        return []

    total = float(sum(counts))
    if total > 0:
        return [count / total for count in counts]

    # Address never seen: spread the probability mass evenly over all categories.
    uniform = 1.0 / len(counts)
    return [uniform] * len(counts)

In [117]:
# Build the training probability table: one row per training record, one
# column per category, filled with addr_prob() of that record's address.

train_addr = np.asarray(train['Address'].values)
train_addr_ = np.asarray(list(map(addr_prob, train_addr)))
train_addr_probs = pd.DataFrame(data=train_addr_, columns=list(cat_addr))

Unnamed: 0,KIDNAPPING,WEAPON LAWS,SECONDARY CODES,WARRANTS,PROSTITUTION,EMBEZZLEMENT,LOITERING,SUICIDE,DRIVING UNDER THE INFLUENCE,SEX OFFENSES FORCIBLE,...,PORNOGRAPHY/OBSCENE MAT,LIQUOR LAWS,SEX OFFENSES NON FORCIBLE,TREA,VEHICLE THEFT,STOLEN PROPERTY,ASSAULT,MISSING PERSON,DISORDERLY CONDUCT,RUNAWAY
0,0.0,0.0,0.0,0.066667,0.0,0,0,0,0,0.0,...,0,0,0,0,0.088889,0,0.0,0.0,0,0
1,0.0,0.0,0.0,0.066667,0.0,0,0,0,0,0.0,...,0,0,0,0,0.088889,0,0.0,0.0,0,0
2,0.0,0.0,0.0,0.043478,0.0,0,0,0,0,0.0,...,0,0,0,0,0.0,0,0.043478,0.043478,0,0
3,0.006452,0.0,0.019355,0.019355,0.070968,0,0,0,0,0.006452,...,0,0,0,0,0.032258,0,0.051613,0.032258,0,0
4,0.0,0.019231,0.0,0.0,0.0,0,0,0,0,0.019231,...,0,0,0,0,0.096154,0,0.038462,0.038462,0,0


In [124]:
# Put the category columns in sorted label order, then preview the first rows.
train_addr_probs = train_addr_probs.sort_index(axis='columns')
train_addr_probs.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.0,0.0,0,0.0,0,0,0.044444,0,0,...,0,0,0,0.0,0,0.0,0.022222,0.088889,0.066667,0.0
1,0,0.0,0.0,0,0.0,0,0,0.044444,0,0,...,0,0,0,0.0,0,0.0,0.022222,0.088889,0.066667,0.0
2,0,0.043478,0.0,0,0.0,0,0,0.0,0,0,...,0,0,0,0.043478,0,0.0,0.0,0.0,0.043478,0.0
3,0,0.051613,0.0,0,0.122581,0,0,0.032258,0,0,...,0,0,0,0.025806,0,0.019355,0.03871,0.032258,0.019355,0.0
4,0,0.038462,0.019231,0,0.192308,0,0,0.0,0,0,...,0,0,0,0.057692,0,0.0,0.096154,0.096154,0.0,0.019231


In [128]:
# Multiclass log loss of the address-frequency probabilities against the true
# training categories.
# NOTE: the counts in cat_addr were taken from this same `train` frame, so
# this is an in-sample score: 2.12 here, while log loss on test data is 5.33.
log_loss(train['Category'], train_addr_probs.values)

2.1264550154964645

In [129]:
# Build the test probability table: one row per test record, one column per
# category, filled with addr_prob() of that record's address.

test_addr = np.asarray(test['Address'].values)
test_addr_ = np.asarray(list(map(addr_prob, test_addr)))
test_addr_probs = pd.DataFrame(data=test_addr_, columns=list(cat_addr))

In [132]:
# Put the category columns in sorted label order, report the table's
# dimensions, and preview the first rows.
test_addr_probs = test_addr_probs.sort_index(axis='columns')
print(test_addr_probs.shape)
test_addr_probs.head()

(884262, 39)


Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.117647,0,0.0,0.014706,0.0,0.0,0.029412,0.014706,0.014706,...,0,0.0,0.0,0.058824,0,0.014706,0.073529,0.088235,0.073529,0.014706
1,0,0.108374,0,0.0,0.0,0.0,0.004926,0.064039,0.0,0.0,...,0,0.0,0.0,0.034483,0,0.004926,0.004926,0.064039,0.103448,0.014778
2,0,0.045455,0,0.0,0.227273,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.045455,0,0.0,0.090909,0.227273,0.0,0.0
3,0,0.204868,0,0.002028,0.036511,0.01217,0.002028,0.05071,0.016227,0.002028,...,0,0.004057,0.002028,0.052738,0,0.006085,0.085193,0.014199,0.05071,0.026369
4,0,0.204868,0,0.002028,0.036511,0.01217,0.002028,0.05071,0.016227,0.002028,...,0,0.004057,0.002028,0.052738,0,0.006085,0.085193,0.014199,0.05071,0.026369


In [131]:
# Write the test probabilities to CSV; the row index is written out as the
# 'Id' column (writing the index is the to_csv default).
test_addr_probs.to_csv('data/addr_prob_result.csv', index_label='Id')

In [None]:
# Persist the training-set address probabilities so they can be reloaded
# without recomputing them.
# NOTE(review): the object to dump is assumed to be train_addr_probs; the
# previously referenced name was never defined in this notebook, so confirm
# this is the intended payload for addr_prob.pkl.
with open("data/addr_prob.pkl", 'wb') as handle:
    pickle.dump(train_addr_probs, handle)