In [25]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsRestClassifier

# Doc du lieu
df = pd.read_csv("./Crime_Data_from_2020_to_Present.csv", sep=",")

# Lay thuoc tinh va nhan
data = df[['TIME OCC', 'AREA', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Weapon Used Cd', 'Crm Cd']]

data = data.dropna()
# print(data.info())



# Ma hoa nhan
le = LabelEncoder()
data['Vict Sex'] = le.fit_transform(data['Vict Sex'])
data['Vict Descent'] = le.fit_transform(data['Vict Descent'])
data['AREA'] = le.fit_transform(data['AREA'])

# #Scale du lieu (chuan hoa du lieu dung min max)
# scaler = MinMaxScaler()
# data[['Premis Cd', 'Vict Age']] = scaler.fit_transform(data[['Premis Cd','Vict Age']])

# Chuan hoa thoi gian
def categorize_time_period(time_occ):
    if 600 <= time_occ < 1200: # 06:00 to 12:00 Moring
        return 0
    elif 1200 <= time_occ < 1800 :  # 12:00 to 18:00 Afternoon
        return 1
    elif 1800 <= time_occ <= 2400 : # 18:00 to 24:00 Night
        return 2
    else: # Middal night
        return 3

def categorize_age(age):
    if 0 <= age <= 5:
        return 0
    elif 6 <= age <= 12:
        return 1
    elif 13 <= age <= 17:
        return 2
    elif 18 <= age <= 25:
        return 3
    elif 26 <= age <= 45:
        return 4
    elif 46 <= age <= 65:
        return 5
    else:
        return 6

def categorize_crime_type(crime_code):
    # Danh sách mã tội phạm cho từng loại
    violent_crimes = [
        110, 113, 121, 122, 210, 220, 230, 231,
        235, 236, 250, 251, 310, 320, 330, 622,
        623, 624, 625, 626, 627, 648, 753, 755,
        756, 810, 812, 813, 860, 865, 870, 880,
        882, 884, 886, 910, 921, 922, 928, 930,
        940, 950, 943, 944, 946, 948, 949
    ]

    sexual_offenses = [
        121, 122, 760, 761, 762, 763, 805, 806,
        810, 812, 813, 814, 815, 820, 821, 830,
        840, 845, 850
    ]

    theft_property_crimes = [
        210, 220, 310, 320, 330, 331, 341, 343,
        345, 347, 349, 350, 351, 352, 353, 354,
        410, 420, 421, 440, 441, 442, 443, 444,
        450, 451, 452, 473, 474, 480, 485, 487,
        510, 520, 522, 668, 670, 740, 745
    ]

    economic_fraud_crimes = [
        649, 651, 652, 653, 654, 660, 661,
        662, 664, 666, 668, 670, 950, 951, 956
    ]

    # social_legal_violations = [
    #     432, 433, 434, 435, 436, 437, 438,
    #     439, 440, 441, 442, 443, 444, 445,
    #     446, 450, 451, 452, 453, 470, 471,
    #     473, 474, 475, 480, 485, 487, 510,
    #     520, 522, 622, 623, 624, 625, 626,
    #     627, 647, 648, 649, 651, 652, 653,
    #     654, 660, 661, 662, 664, 666, 668,
    #     670, 740, 745, 753, 755, 756, 760,
    #     761, 762, 763, 805, 806, 810, 812,
    #     813, 814, 815, 820, 821, 822, 830,
    #     840, 845, 850, 860, 865, 870, 880,
    #     882, 884, 886, 888, 890, 900, 901,
    #     902, 903, 904, 906
    # ]

    # Phân loại mã tội phạm
    if crime_code in violent_crimes:
        return 0  # Tội phạm nghiêm trọng liên quan đến tính mạng và bạo lực
    elif crime_code in sexual_offenses:
        return 1  # Tội phạm tình dục
    elif crime_code in theft_property_crimes:
        return 2  # Tội liên quan đến trộm cắp và tài sản
    elif crime_code in economic_fraud_crimes:
        return 3  # Tội phạm kinh tế và gian lận
    else:
        return 4  # Tội phạm liên quan đến hành vi xã hội và luật pháp


def categorize_location(premis_code):
    residential = {502, 501, 504, 505, 507, 510, 511, 514, 515, 516, 518, 519, 508, 509, 513}
    transportation = {101, 128, 124, 212, 801, 802, 804, 111, 113, 115, 122, 905, 910, 912, 890, 929, 937, 940, 950,
                      893, 745, 110}
    commercial = {405, 248, 404, 403, 410, 406, 412, 413, 201, 202, 210, 207, 233, 235, 244, 401, 402, 217, 237, 250}
    public_space = {102, 108, 109, 104, 107, 127, 141, 143, 144, 145, 146, 147, 109, 208, 209, 149, 243, 718, 756, 757}
    government_facility = {725, 726, 753, 214, 240}
    educational = {720, 721, 704, 722, 730, 731, 912}

    if premis_code in residential:
        return 0  # Khu dân cư
    elif premis_code in transportation:
        return 1  # Giao thông vận tải
    elif premis_code in commercial:
        return 2  #Thương mại
    elif premis_code in public_space:
        return 3  # Khu vực công cộng
    elif premis_code in government_facility:
        return 4  # Cơ sở chính phủ
    elif premis_code in educational:
        return 5  # Cơ sở giáo dục
    else:
        return 6  # Khu vực khác


data['TIME OCC'] = data['TIME OCC'].apply(categorize_time_period)
data['Crm Cd'] = data['Crm Cd'].apply(categorize_crime_type)
data['Premis Cd'] = data['Premis Cd'].apply(categorize_location)
data['Vict Age'] = data['Vict Age'].apply(categorize_age)

#Scale du lieu (chuan hoa du lieu dung min max)
scaler = MinMaxScaler()
data[['Premis Cd', 'Vict Age', 'Vict Sex', 'Vict Descent', 'AREA', 'TIME OCC', 'Weapon Used Cd']] = scaler.fit_transform(data[['Premis Cd', 'Vict Age','Vict Sex', 'Vict Descent', 'AREA', 'TIME OCC', 'Weapon Used Cd']])


# Chuyen du lieu thanh dang array
# X = data.iloc[:,: -1 ].values
# y = data.iloc[:, -1].values


X = data.drop(['Crm Cd'], axis = 1)
y = data['Crm Cd']
#under_samling
from imblearn.under_sampling import NearMiss
nm = NearMiss()
# Oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
X_res, y_res = ros.fit_resample(X, y)



# Chuyen du lieu thanh dang array
X_main = X_res.values
y_main = y_res.values

print(X_res.shape)
print(y_res.shape)


# Phan chia tap du lieu
X_train, X_test, y_train, y_test = train_test_split(X_main, y_main, test_size=0.2, random_state=42)



# Tao model
model = LogisticRegression(solver='newton-cholesky', max_iter=1000, random_state=42)



kf = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'precision_micro', 'recall_micro', 'f1_micro']
cv_results = cross_validate(model, X_train, y_train, cv=kf, scoring=scoring, return_estimator=True)


(1438900, 7)
(1438900,)


In [26]:
print(cv_results)

{'fit_time': array([3.699368  , 3.75266957, 4.30302596, 4.08489537, 4.25619602]), 'score_time': array([0.26206231, 0.33508658, 0.27506375, 0.35609341, 0.28607011]), 'estimator': [LogisticRegression(max_iter=1000, random_state=42, solver='newton-cholesky'), LogisticRegression(max_iter=1000, random_state=42, solver='newton-cholesky'), LogisticRegression(max_iter=1000, random_state=42, solver='newton-cholesky'), LogisticRegression(max_iter=1000, random_state=42, solver='newton-cholesky'), LogisticRegression(max_iter=1000, random_state=42, solver='newton-cholesky')], 'test_accuracy': array([0.46795295, 0.467549  , 0.469951  , 0.46865227, 0.46896066]), 'test_precision_micro': array([0.46795295, 0.467549  , 0.469951  , 0.46865227, 0.46896066]), 'test_recall_micro': array([0.46795295, 0.467549  , 0.469951  , 0.46865227, 0.46896066]), 'test_f1_micro': array([0.46795295, 0.467549  , 0.469951  , 0.46865227, 0.46896066])}


In [27]:
cv_results['test_accuracy'].mean()




0.468613176732226

In [28]:
cv_results['test_precision_micro'].mean()

0.468613176732226

In [29]:
cv_results['test_recall_micro'].mean()

0.468613176732226

In [30]:
cv_results['test_f1_micro'].mean()

0.468613176732226