<div class="alert alert-secondary" role="alert">
    <h1 align = 'center'>San Francisco Crime Classification</h1>
    <h4 align = 'center' class="mb-0">Predict the category of crimes that occurred in the city by the bay</h4>
</div>

# 1: Importing The Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# 2: Load Dataset

In [None]:
# Read the train and test archives; pandas infers the zip compression
# from the file extension.
train_path = '../input/sf-crime/train.csv.zip'
unseen_path = '../input/sf-crime/test.csv.zip'

df = pd.read_csv(train_path)
unseen_data = pd.read_csv(unseen_path)

# Preview the first rows of the training frame.
df.head()

# 3: EDA & Storytelling

In [None]:
# Display the (rows, columns) dimensions of the training frame.
df.shape

In [None]:
# Print per-column dtypes and non-null counts for the training frame.
df.info()

In [None]:
# Count the rows that are exact repeats of an earlier row (all columns equal).
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows in place, keeping the first occurrence of each.
# NOTE: this mutates `df`; the row index is not reset afterwards.
df.drop_duplicates(keep = 'first', inplace = True)
# Show the dimensions after de-duplication.
df.shape

In [None]:
# List the column names as loaded from the CSV.
df.columns

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Frequency of each value in the Category column, most common first.
df['Category'].value_counts()

In [None]:
# Frequency of each value in the Resolution column, most common first.
df['Resolution'].value_counts()

In [None]:
# Parse the timestamp column once and derive calendar features from it.
timestamps = pd.to_datetime(df['Dates'])

# New column name -> attribute of the pandas `.dt` accessor it is read from.
date_parts = {
    'year': 'year',
    'month': 'month',
    'day_of_month': 'day',
    'day_of_week': 'day_of_week',
    'hour': 'hour',
}
for column_name, dt_attribute in date_parts.items():
    df[column_name] = getattr(timestamps.dt, dt_attribute)

# The raw timestamp and the textual weekday are superseded by the derived
# columns above. NOTE: dropping in place makes this cell fail if re-run
# without reloading `df`.
df.drop(columns=['Dates', 'DayOfWeek'], inplace=True)
df.dtypes

In [None]:
# Normalise every column name to lower case (e.g. 'Category' -> 'category').
df.columns = df.columns.str.lower()

In [None]:
# Contingency table: rows are day_of_week values, columns are crime
# categories, and each cell is the number of matching incidents.
pd.crosstab(index=df['day_of_week'], columns=df['category'])

In [None]:
# List the column names after the date features were added and names lower-cased.
df.columns

In [None]:
# Incident counts per hour of day: one line for all categories combined and
# one line for each of the five most frequent categories.
category_count = df.category.value_counts().to_frame().reset_index()
top5category = category_count.iloc[:5]

# reset_index() names the label column 'index' on pandas < 2.0 but 'category'
# on pandas >= 2.0, so look it up by position rather than by name to avoid a
# KeyError on newer pandas.
top5_names = top5category.iloc[:, 0].tolist()

df_cat = df.hour.value_counts().to_frame().sort_index()
plt.figure(figsize = (15, 5))
plt.plot(df_cat)

for item in top5_names:
    # Same hourly count, restricted to a single category.
    df_cat = df[df.category == item].hour.value_counts().to_frame().sort_index()
    plt.plot(df_cat)

# Legend order matches plotting order: the overall line first, then the top 5.
plt.legend(['All Categories'] + top5_names)
plt.title('Incidents by hour of day')
plt.xlabel('hour')
_ = plt.ylabel('count')

In [None]:
# Incident counts per year: one line for all categories combined and one line
# for each of the five most frequent categories.
category_count = df.category.value_counts().to_frame().reset_index()
top5category = category_count.iloc[:5]

# reset_index() names the label column 'index' on pandas < 2.0 but 'category'
# on pandas >= 2.0, so look it up by position rather than by name to avoid a
# KeyError on newer pandas.
top5_names = top5category.iloc[:, 0].tolist()

df_cat = df.year.value_counts().to_frame().sort_index()
plt.figure(figsize = (15, 5))
plt.plot(df_cat)

for item in top5_names:
    # Same yearly count, restricted to a single category.
    df_cat = df[df.category == item].year.value_counts().to_frame().sort_index()
    plt.plot(df_cat)

# Legend order matches plotting order: the overall line first, then the top 5.
plt.legend(['All Categories'] + top5_names)
plt.title('Incidents by year')
plt.xlabel('year')
_ = plt.ylabel('count')

In [None]:
# Number of incidents (non-null `category` entries) per police district,
# ordered from the busiest district to the quietest.
df1 = (
    df.groupby('pddistrict')['category']
    .count()
    .reset_index()
    .sort_values('category', ascending=False)
)

plt.figure(figsize=(15, 5))
sns.barplot(data=df1, x='pddistrict', y='category', palette='Reds_r')

# 4: Data preprocessing

#### 4.1: Variable Encoding

In [None]:
# Keep the 200 most frequent descriptions and bucket every other value
# (including missing ones) under the single label 'other'.
top200Des = df['descript'].value_counts().iloc[:200].index
is_frequent = df['descript'].isin(top200Des)
df['descript'] = df['descript'].where(is_frequent, 'other')

# Target is the crime category; features are all remaining columns except the
# free-text address.
# NOTE(review): 'descript' and 'resolution' look like information recorded
# about the incident itself and may leak the target — confirm both columns
# exist in `unseen_data` before relying on them as features.
y = df['category']
x = df.drop(columns=['category', 'address'])

In [None]:
# Encode the target with its own encoder. `le` stays fitted on the crime
# categories, so le.inverse_transform(...) can map predicted integer labels
# back to category names. (Previously the same encoder was refit on each
# feature in turn and ended up fitted on 'resolution'.)
le = LabelEncoder()
y = le.fit_transform(y)

# One independent encoder per categorical feature, kept by column name so the
# same mapping can be re-applied to other data later.
feature_encoders = {}
for column in ('descript', 'pddistrict', 'resolution'):
    feature_encoders[column] = LabelEncoder()
    x[column] = feature_encoders[column].fit_transform(x[column])

#### 4.2: split data

In [None]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=1,
)

# Report the resulting shapes of the feature and target partitions.
print('x train :', x_train.shape, '\t\tx test :', x_test.shape)
print('y train :', y_train.shape, '\t\ty test :', y_test.shape)

#### 4.3: feature scaling


In [None]:
# Learn per-feature min/max on the training split only, then apply the same
# rescaling to both splits so no test-set statistics enter the fit.
# NOTE: this overwrites x_train / x_test, so re-running the cell rescales
# already-scaled data.
mms = MinMaxScaler().fit(x_train)
x_train, x_test = mms.transform(x_train), mms.transform(x_test)

In [None]:
# Learn per-feature mean/std on the training split only, then standardise
# both splits with those statistics.
# NOTE: this overwrites x_train / x_test, so re-running the cell rescales
# already-scaled data.
sc = StandardScaler().fit(x_train)
x_train, x_test = sc.transform(x_train), sc.transform(x_test)

# 5: Train your model


#### 5.1: KNN

In [None]:
# Fit a k-nearest-neighbours classifier with library defaults, then predict
# labels for the held-out rows.
knn = KNeighborsClassifier().fit(x_train, y_train)
y_pred = knn.predict(x_test)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
# accuracy_score takes the ground truth first and the predictions second;
# keywords make the order explicit.
accuracy_score(y_true=y_test, y_pred=y_pred)

#### 5.2: Random Forest

In [None]:
# Fit a random forest (seeded for reproducibility) and predict labels for the
# held-out rows.
rfc = RandomForestClassifier(random_state=42).fit(x_train, y_train)
y_pred = rfc.predict(x_test)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
# accuracy_score takes the ground truth first and the predictions second;
# keywords make the order explicit.
accuracy_score(y_true=y_test, y_pred=y_pred)

#### 5.3: Decision Tree 

In [None]:
# Fit a single decision tree (seeded for reproducibility) and predict labels
# for the held-out rows.
dtc = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)
y_pred = dtc.predict(x_test)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
# accuracy_score takes the ground truth first and the predictions second;
# keywords make the order explicit.
accuracy_score(y_true=y_test, y_pred=y_pred)

#### 5.4: XGBoost 

In [None]:
# Fit a gradient-boosted tree classifier with library defaults and predict
# labels for the held-out rows.
model = XGBClassifier().fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
# accuracy_score takes the ground truth first and the predictions second;
# keywords make the order explicit.
accuracy_score(y_true=y_test, y_pred=y_pred)