In [21]:
import pandas as pd
import numpy as np
import random 
import pickle 
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Read the train and test CSV files with pandas, parsing the 'Dates' column
# into datetime values.

date_columns = ['Dates']

train = pd.read_csv('data/train.csv', parse_dates=date_columns)
test = pd.read_csv('data/test.csv', parse_dates=date_columns)

## Take a look at the data

In [13]:
# Dimensions of the training frame, followed by its first ten rows.
print(train.shape)

train.head(10)

(878049, 9)


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541
5,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM UNLOCKED AUTO,Wednesday,INGLESIDE,NONE,0 Block of TEDDY AV,-122.403252,37.713431
6,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,INGLESIDE,NONE,AVALON AV / PERU AV,-122.423327,37.725138
7,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,BAYVIEW,NONE,KIRKWOOD AV / DONAHUE ST,-122.371274,37.727564
8,2015-05-13 23:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,RICHMOND,NONE,600 Block of 47TH AV,-122.508194,37.776601
9,2015-05-13 23:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,CENTRAL,NONE,JEFFERSON ST / LEAVENWORTH ST,-122.419088,37.807802


In [14]:
# Summary statistics for the numeric columns of the training frame.
# NOTE(review): in the recorded output max X is -120.5 and max Y is 90.0, far
# outside the other quantiles -- these look like invalid/placeholder
# coordinates; confirm before using X/Y as features.
train.describe()

Unnamed: 0,X,Y
count,878049.0,878049.0
mean,-122.422616,37.77102
std,0.030354,0.456893
min,-122.513642,37.707879
25%,-122.432952,37.752427
50%,-122.41642,37.775421
75%,-122.406959,37.784369
max,-120.5,90.0


In [15]:
# Share of test addresses that were already seen in the training data.

# 1 where the test row's address occurs anywhere in train, 0 otherwise.
repet_crime_addr = test['Address'].isin(train['Address']).values.astype(int)

# Vectorized mean instead of the builtin sum(), which walks the NumPy array
# element by element. A single formatted string is printed so the output is
# the same text under Python 2 and Python 3 (a two-argument print(...) shows
# up as a tuple under Python 2).
seen_pct = float(repet_crime_addr.mean()) * 100
print("Addresses already seen: %s" % seen_pct)

('Addresses already seen: ', 99.64116969857349)


## Tf-idf for addresses

In [23]:
# Divide the address text into separate groups by category of crime

# Distinct category labels, in order of first appearance in train
categories = list(train['Category'].unique())

# Map every category to the list of addresses of the training rows
# that carry that category
addr_cat = {
    cat: list(train.loc[train['Category'] == cat, 'Address'])
    for cat in categories
}

In [17]:
# Sanity check: number of categories, total number of addresses collected
# across all categories, and number of training rows.
print(len(addr_cat))
print(sum(len(addresses) for addresses in addr_cat.values()))
print(len(train))

39
878049
878049


In [24]:
# For Addresses
addr_docs = [' '.join(x) for x in addr_cat.values()] # imulation of sentences from adresses

# define vectorizer parameters
# max_df: the maximum frequency within the documents a given feature can have to be used
# min_df: could be an integer (e.g. 5) and the term would have to be in at least 5 of the documents to be considered. 
# ngram_range: this just means I'll look at unigrams, bigrams, trigrams and 4grams.
tfidf_vectorizer_addr = TfidfVectorizer(max_df=0.8, analyzer='word', lowercase=False, stop_words=['Block', 'of', '/'],
                                        tokenizer=None, norm=None, smooth_idf=False,
                                 min_df=1, use_idf=True, ngram_range=(1,4))


%time tfidf_matrix_addr = tfidf_vectorizer_addr.fit_transform(addr_docs) #fit the vectorizer to addr_docs

print(tfidf_matrix_addr.shape)

CPU times: user 20.2 s, sys: 540 ms, total: 20.7 s
Wall time: 20.7 s
(39, 1118159)


In [19]:
# Check results visually: for a fixed random sample of training rows, print
# the row and then every crime category with a positive cosine similarity to
# the row's address, most similar first.

random.seed(120)  # fixed seed so the same ten rows are shown on every run
numbr = random.sample(range(len(train)), 10)

for n in numbr:

    row = train.iloc[[n]]  # double brackets keep a one-row DataFrame
    response = tfidf_vectorizer_addr.transform(row['Address'])
    cosine = cosine_similarity(response, tfidf_matrix_addr)

    # Single-argument print(...) calls are used throughout: the former
    # Python 2 print statements are syntax errors on Python 3, while this
    # form prints the same text on both.
    print(row)

    # addr_cat.keys() is in the same order as addr_cat.values(), which was
    # used to build the documents behind the rows of tfidf_matrix_addr.
    sort_rez = sorted(zip(addr_cat.keys(), cosine[0]), key=lambda x: x[1], reverse=True)

    for x in sort_rez:
        if x[1] > 0:
            print("%s %s" % (x[0], x[1]))

                     Dates Category Descript DayOfWeek PdDistrict Resolution  \
452140 2008-12-28 02:00:00  ASSAULT  BATTERY    Sunday   SOUTHERN       NONE   

                       Address           X          Y  
452140  400 Block of JESSIE ST -122.408534  37.782033  
LIQUOR LAWS 0.0944143307599
WARRANTS 0.0642728644903
DRUNKENNESS 0.0458374439827
DRUG/NARCOTIC 0.0448385089308
OTHER OFFENSES 0.0302629807004
ROBBERY 0.0277717684953
WEAPON LAWS 0.0262275157172
BRIBERY 0.022374887698
ASSAULT 0.022349958614
VANDALISM 0.0202910404107
ARSON 0.0186545930168
SEX OFFENSES FORCIBLE 0.0185487713129
SUSPICIOUS OCC 0.0183105367922
RECOVERED VEHICLE 0.0172797790989
FRAUD 0.0161277673937
NON-CRIMINAL 0.0153184954207
LARCENY/THEFT 0.014520811146
VEHICLE THEFT 0.0140520500996
STOLEN PROPERTY 0.0130202072633
BURGLARY 0.0129986623453
DISORDERLY CONDUCT 0.0122605516695
TRESPASS 0.0117751000725
KIDNAPPING 0.00906060162456
LOITERING 0.00663372630327
FORGERY/COUNTERFEITING 0.00645474086174
EMBEZZLEMENT 0

In [25]:
# Get tf-idf features for train: cosine similarity between each training
# address and each per-category address document.
# NOTE(review): the category documents were built from these same training
# rows and their Category labels, so each row's features include its own
# label's contribution -- consider computing them out-of-fold; confirm.

response_train = tfidf_vectorizer_addr.transform(train['Address'])
cosine_train = cosine_similarity(response_train, tfidf_matrix_addr)

# One column per category, in the dict order that was also used to build
# the category documents.
col_names_addr = ['tfidf_' + x for x in addr_cat.keys()]

tfidf_train = pd.DataFrame(cosine_train, columns=col_names_addr)
# print(...) call form: valid on Python 3 and prints the same on Python 2.
print(tfidf_train.shape)

(878049, 39)


In [26]:
# Get tf-idf features for test: cosine similarity between each test address
# and each per-category address document, using the column names prepared
# for the train features.

response_test = tfidf_vectorizer_addr.transform(test['Address'])
cosine_test = cosine_similarity(response_test, tfidf_matrix_addr)

tfidf_test = pd.DataFrame(cosine_test, columns=col_names_addr)
# print(...) call form: valid on Python 3 and prints the same on Python 2.
print(tfidf_test.shape)

(884262, 39)


In [29]:
# Persist the train and test tf-idf feature frames as pickle files.

for features, path in ((tfidf_train, "data/tfidf_train.pkl"),
                       (tfidf_test, "data/tfidf_test.pkl")):
    features.to_pickle(path)

In [None]:
# Save the tf-idf matrix to a file and read it back.

# Store the content
with open("data/tfidf_matrix.pkl", 'wb') as handle:
    pickle.dump(tfidf_matrix_addr, handle)

# Load the content. The file is opened in a with-block so the handle is
# closed deterministically instead of being left to garbage collection.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open("data/tfidf_matrix.pkl", "rb") as handle:
    tfidf = pickle.load(handle)

In [None]:
# Ideas
# Convert cosine similarity to class probability