# Final Project: Initial EDA

Victoria Eastman, David Harding, and Faria Mardhani

https://www.kaggle.com/c/sf-crime

Interesting things:
   - distance?
   - time of day


In [4]:
%matplotlib inline

# General libraries
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [7]:
# Read in zipped data file
zf = zipfile.ZipFile('train.csv.zip', 'r')
train_raw = pd.read_csv(zf.open('train.csv'), parse_dates=['Dates'])

zf2 = zipfile.ZipFile('test.csv.zip', 'r')
test_raw = pd.read_csv(zf2.open('test.csv'), parse_dates=['Dates'])

In [3]:
# Read in the raw csv
#df = pd.read_csv('train.csv', parse_dates=[0])
#df.head()

In [8]:
train_raw.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [26]:
def process_data(df):
    df['DOW'] = df.Dates.dt.weekday
    df['DOM'] = df.Dates.dt.day
    df['MONTH'] = df.Dates.dt.month
    df['Hour'] = df.Dates.dt.hour
    
    #df = df.drop(['Descript', 'Resolution'], axis=1)

    df_PdD = pd.get_dummies(df.PdDistrict)
    df_DOM = pd.get_dummies(df.DOM, prefix = 'DofM')
    df_DOW = pd.get_dummies(df.DOW, prefix = 'DofW')
    df_MONTH = pd.get_dummies(df.MONTH, prefix = 'Month')
    df_Hour = pd.get_dummies(df.Hour, prefix = 'Hour')

    new = pd.concat([df_PdD, df_DOM], axis = 1)
    new = pd.concat([new, df_DOW], axis = 1)
    new = pd.concat([new, df_MONTH], axis = 1)
    new = pd.concat([new, df_Hour], axis = 1)
    
    return new

In [6]:
df_label = train['Category'].astype('category')
newdf = process_data(train)
X_train, X_dev, y_train, y_dev = train_test_split(newdf, df_label, test_size = 0.5, random_state= 42)

In [7]:
print("X_train size: " + str(X_train.shape))
print("X_dev size: " + str(X_dev.shape))
print("y_train size: " + str(y_train.shape))
print("y_dev size: " + str(y_dev.shape))

X_train size: (439024, 84)
X_dev size: (439025, 84)
y_train size: (439024,)
y_dev size: (439025,)


In [8]:
# Run a knn model on the training data
start = time.time()
kn = KNeighborsClassifier(n_neighbors=1)
kn.fit(X_train[:50000], y_train[:50000])
y_pred = kn.predict(X_dev)
print("Model score: " + str(kn.score(X_dev, y_dev)))
print("Accuracy score: " + str(metrics.accuracy_score(y_dev, y_pred)))
print("total time for knn: " + str(time.time()-start))

Model score: 0.120314332897
Accuracy score: 0.120314332897
total time for knn: 13350.265929


In [11]:
# Process test data and predict outcomes
test = process_data(test_raw)
results = kn.predict(test)

In [16]:
#test.head()
#newdf.head()

In [12]:
# Generate output submission file
submit = pd.DataFrame({'Id': test.Id.tolist()})
for category in df_label.cat.categories:
    submit[category] = np.where(results == category, 1, 0)
    
submit.to_csv('submission1_vhe.csv', index = False)

AttributeError: 'DataFrame' object has no attribute 'Id'

In [27]:
train_labels = train_raw.Category.astype('category')
newdf = process_data(train_raw)
test = process_data(test_raw)

In [31]:
rf_model = RandomForestClassifier()
rf_model.fit(newdf, train_labels)

results = rf_model.predict(test)

In [32]:
df_results = pd.get_dummies(results)
df_results.to_csv('submission1_vdf.csv')

In [41]:
# Generate output submission file
submit = pd.DataFrame({'Id': test_raw.Id.tolist()})
for category in train_labels.cat.categories:
    submit[category] = np.where(results == category, 1, 0)
    
submit.to_csv('submission1_vhe.csv', index = False)

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
