In [24]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [25]:
train_data = pd.read_csv('./sf-crime/train.csv.zip')
test_data = pd.read_csv('./sf-crime/test.csv.zip')

In [26]:
train_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Id          884262 non-null  int64  
 1   Dates       884262 non-null  object 
 2   DayOfWeek   884262 non-null  object 
 3   PdDistrict  884262 non-null  object 
 4   Address 

# Train data analysis

Columns: Dates, Category, Descript, DayOfWeek, PdDistrict, Resolution, Address, X, Y

In [28]:
train_data.head()
#train_data.info()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


# Test data analysis

Columns: Id, Dates, DayOfWeek, Category, PdDistrict, Address, X, Y

In [29]:
test_data.head()
#test_data.info()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [30]:
def updateDateColumns(data):
    if 'Dates' in data:
        dates = pd.to_datetime(data['Dates'])
        data['Dates'] = dates
        data['Year'] = dates.dt.year
        data['Month'] = dates.dt.month
        data['Day'] = dates.dt.day
        data['DayOfWeek'] = dates.dt.weekday
        data['Hour'] = dates.dt.hour
        data['Minute'] = dates.dt.minute
        data.drop(columns='Dates', inplace=True)

def dropColumn(data, column):
    if column in data:
        data.drop(columns=column, inplace=True)

le = LabelEncoder()
category_le = LabelEncoder()
def convertColumnToLabelOrOptimalType(data):
    if 'Category' in data:
        data['Category'] = category_le.fit_transform(data['Category'])
    for column in data.columns[data.dtypes == object]:
        data[column] = le.fit_transform(data[column])
    for column in data.columns[data.dtypes == np.int64]:
        data[column] = data[column].astype(np.int32)
    for column in data.columns[data.dtypes == np.float64]:
        data[column] = data[column].astype(np.float32)

def dropCoordinatesFor(data, column, min, max):
    if column in data:
        data.drop(data[(data[column] < min)].index, inplace=True)
        data.drop(data[(data[column] > max)].index, inplace=True)



In [31]:
train_data.info(verbose=False)
test_data.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Columns: 9 entries, Dates to Y
dtypes: float64(2), object(7)
memory usage: 60.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Columns: 7 entries, Id to Y
dtypes: float64(2), int64(1), object(4)
memory usage: 47.2+ MB


In [32]:
dropColumn(train_data, 'Resolution') # useless?
dropColumn(train_data, 'Descript') # useless?
# dropColumn(train_data, 'Address') # useless?

train_data.info(verbose=False)
test_data.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Columns: 7 entries, Dates to Y
dtypes: float64(2), object(5)
memory usage: 46.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Columns: 7 entries, Id to Y
dtypes: float64(2), int64(1), object(4)
memory usage: 47.2+ MB


In [33]:
dropCoordinatesFor(train_data, 'Y', 35.0, 45.0)
#dropCoordinatesFor(test_data, 'Y', 35.0, 45.0)

dropCoordinatesFor(train_data, 'X', -130.0, -115.0)
#dropCoordinatesFor(test_data, 'X', -130.0, -115.0)

train_data.info(verbose=False)
test_data.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 877982 entries, 0 to 878048
Columns: 7 entries, Dates to Y
dtypes: float64(2), object(5)
memory usage: 53.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Columns: 7 entries, Id to Y
dtypes: float64(2), int64(1), object(4)
memory usage: 47.2+ MB


In [34]:
updateDateColumns(train_data) # seperate values better than string blob
updateDateColumns(test_data) # seperate values better than string blob

train_data.info(verbose=False)
test_data.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 877982 entries, 0 to 878048
Columns: 11 entries, Category to Minute
dtypes: float64(2), int64(6), object(3)
memory usage: 80.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Columns: 11 entries, Id to Minute
dtypes: float64(2), int64(7), object(2)
memory usage: 74.2+ MB


In [35]:
convertColumnToLabelOrOptimalType(train_data)
convertColumnToLabelOrOptimalType(test_data)

train_data.info(verbose=False)
test_data.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 877982 entries, 0 to 878048
Columns: 11 entries, Category to Minute
dtypes: float32(2), int32(9)
memory usage: 43.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Columns: 11 entries, Id to Minute
dtypes: float32(2), int32(9)
memory usage: 37.1 MB


In [36]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 877982 entries, 0 to 878048
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Category    877982 non-null  int32  
 1   DayOfWeek   877982 non-null  int32  
 2   PdDistrict  877982 non-null  int32  
 3   Address     877982 non-null  int32  
 4   X           877982 non-null  float32
 5   Y           877982 non-null  float32
 6   Year        877982 non-null  int32  
 7   Month       877982 non-null  int32  
 8   Day         877982 non-null  int32  
 9   Hour        877982 non-null  int32  
 10  Minute      877982 non-null  int32  
dtypes: float32(2), int32(9)
memory usage: 43.5 MB


In [37]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Id          884262 non-null  int32  
 1   DayOfWeek   884262 non-null  int32  
 2   PdDistrict  884262 non-null  int32  
 3   Address     884262 non-null  int32  
 4   X           884262 non-null  float32
 5   Y           884262 non-null  float32
 6   Year        884262 non-null  int32  
 7   Month       884262 non-null  int32  
 8   Day         884262 non-null  int32  
 9   Hour        884262 non-null  int32  
 10  Minute      884262 non-null  int32  
dtypes: float32(2), int32(9)
memory usage: 37.1 MB


# Training

In [72]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense,Activation,Dropout,Flatten, Input
from keras.callbacks import EarlyStopping
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix

In [39]:
category_dict = dict(zip(category_le.classes_, category_le.transform(category_le.classes_)))

In [93]:
X = train_data.drop(columns='Category')
Y = train_data['Category']

X_train, X_test, y_train, y_test=train_test_split(X,Y,test_size=0.15,random_state=1)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(746284, 10) (746284,)
(131698, 10) (131698,)


In [94]:
rfc = RandomForestClassifier(n_estimators=25,min_samples_split=100 )
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print (classification_report(y_test,rfc_pred))
y_pred_proba = rfc.predict(test_data.drop('Id', 1))
result = pd.DataFrame(y_pred_proba, columns=category_le.classes_)
result.to_csv(path_or_buf="submission_rfc.csv.zip",index=True, index_label = 'Id', compression="zip")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  y_pred_proba = rfc.predict_proba(test_data.drop('Id', 1))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       235
           1       0.21      0.21      0.21     11549
           2       0.00      0.00      0.00        54
           3       0.00      0.00      0.00        36
           4       0.24      0.04      0.07      5568
           5       0.21      0.02      0.04       641
           6       0.00      0.00      0.00       343
           7       0.36      0.43      0.39      8145
           8       0.00      0.00      0.00       634
           9       0.00      0.00      0.00       187
          10       0.00      0.00      0.00        33
          11       0.00      0.00      0.00        87
          12       0.16      0.01      0.02      1647
          13       0.23      0.01      0.02      2440
          14       0.00      0.00      0.00        25
          15       0.00      0.00      0.00       329
          16       0.33      0.77      0.47     26184
          17       1.00    

In [95]:
X = train_data.drop(columns='Category')
Y = pd.get_dummies(train_data['Category'])

X_train, X_test, y_train, y_test=train_test_split(X,Y,test_size=0.15,random_state=1)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(746284, 10) (746284, 39)
(131698, 10) (131698, 39)


In [118]:
model = Sequential()
model.add(Dense(78, input_shape=(X.shape[1],)))
model.add(Dense(158, activation="elu"))
model.add(Dense(78, activation="selu"))
model.add(Dense(158, activation="elu"))
model.add(Dense(39, activation="softmax")) 
model.summary()


Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_44 (Dense)            (None, 78)                858       
                                                                 
 dense_45 (Dense)            (None, 158)               12482     
                                                                 
 dense_46 (Dense)            (None, 78)                12402     
                                                                 
 dense_47 (Dense)            (None, 158)               12482     
                                                                 
 dense_48 (Dense)            (None, 39)                6201      
                                                                 
Total params: 44,425
Trainable params: 44,425
Non-trainable params: 0
_________________________________________________________________


In [120]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(
  X_train,
  y_train,
  batch_size=32,
  epochs=4,
  verbose=1,
  validation_data=(X_test, y_test),
  callbacks=[EarlyStopping(monitor='val_accuracy', patience=10)]
)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1952827738.py, line 8)

In [112]:
y_pred_proba = model.predict(test_data.drop('Id', 1))
result = pd.DataFrame(y_pred_proba, columns=category_le.classes_)
result.to_csv(path_or_buf="submission_nn.csv.zip",index=True, index_label = 'Id', compression="zip")

  y_pred_proba = model.predict(test_data.drop('Id', 1))


