The objective is to build a machine learning model that predicts whether a future police procedure will end in an arrest.

In [2]:
# List of all the libraries that will be used.

import pandas as pd
import pylab as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, recall_score, roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import warnings
warnings.simplefilter(action='ignore', category=UserWarning)


In [3]:
# 2.- Import and review of the data.
# The datasets of the arrests made in 2009 and 2010 are imported. Since the data source comes from the same database, they have the same columns. 
# Both sets are consolidated and a basic exploration of the data (number of rows/columns, data types, basic statistics, missing cases) is performed.

# Load the 2009 and 2010 CSV files (read_csv already returns a DataFrame).
df_1 = pd.read_csv('2009_1perc.csv')
df_2 = pd.read_csv('2010_1perc.csv')

# Stack both frames vertically and rebuild a single 0..n-1 index.
df = pd.concat([df_1, df_2], axis=0, ignore_index=True)

n_filas, n_columnas = df.shape
print("The concatenated dataframe has a total of  ", n_filas, "rows.")
print("and ", n_columnas, "columns.")


The concatenated dataframe has a total of   11825 rows.
and  112 columns.


In [4]:
# 2.- Import and review of data

# Data type of every column.
tipos_de_dato = df.dtypes
print("**The following are the types of data contained in the df:**")
print(tipos_de_dato)

# Distinct values found in every column.
valores_unicos = df.apply(pd.unique)
print("\n**We check which data is unique inside the df:**")
print(valores_unicos)


**The following are the types of data contained in the df:**
Unnamed: 0     int64
year           int64
pct            int64
ser_num        int64
datestop       int64
               ...  
xcoord        object
ycoord        object
dettypcm      object
linecm        object
detailcm      object
Length: 112, dtype: object

**We check which data is unique inside the df:**
Unnamed: 0    [178048, 498873, 463573, 43626, 563921, 261097...
year                                               [2009, 2010]
pct           [41, 108, 43, 77, 110, 14, 67, 75, 34, 113, 60...
ser_num       [1779, 5805, 8340, 932, 11224, 5194, 11758, 27...
datestop      [4032009, 10292009, 10062009, 1232009, 1213200...
                                    ...                        
xcoord        [1013067, 1012043, 1017599, 1002625, 1024535, ...
ycoord        [0238633, 0212157, 0240200, 0183442, 0209890, ...
dettypcm                                                [CM,  ]
linecm                                                [

In [5]:
# 2.- Import and review of data

# Count the null (NaN) entries of every column.
valores_NA = df.isna().sum()

# Keep only the columns that contain at least one null.
total_nulos = valores_NA.loc[valores_NA > 0]
print("**The following columns have null values:**", total_nulos, sep="\n")

# Counts, for one column, how many cells contain a space character.
def contar_espacios(x):
    """Return the number of entries of ``x`` that contain a space.

    Only columns whose dtype is ``object`` are inspected; every other
    dtype yields 0. Null entries are not counted.
    """
    if x.dtype != 'object':
        return 0
    return x.str.contains(' ').sum()

# Apply the counter column by column and show the resulting Series.
espacios_por_columna = df.apply(contar_espacios)
print("\n**Identify the columns containing empty spaces:**")
print(espacios_por_columna)


**The following columns have null values:**
premname    1
stinter     1
dtype: int64

**Identify the columns containing empty spaces:**
Unnamed: 0      0
year            0
pct             0
ser_num         0
datestop        0
             ... 
xcoord        417
ycoord        417
dettypcm        1
linecm          1
detailcm      830
Length: 112, dtype: int64


In [6]:
# 2.- Import and review of data

# Summary statistics of the numeric columns.
estadisticas = df.describe()
print("\nBelow are the basic statistics for the df:\n", estadisticas)



Below are the basic statistics for the df:
           Unnamed: 0          year           pct       ser_num      datestop  \
count   11825.000000  11825.000000  11825.000000  11825.000000  1.182500e+04   
mean   295983.814799   2009.508499     68.625624   5697.994334  6.364221e+06   
std    170820.705702      0.499949     33.037826   5152.486022  3.429091e+06   
min        52.000000   2009.000000      1.000000      1.000000  1.012009e+06   
25%    147515.000000   2009.000000     42.000000   2002.000000  3.232009e+06   
50%    296652.000000   2010.000000     73.000000   4366.000000  6.162010e+06   
75%    443097.000000   2010.000000    102.000000   7840.000000  9.232009e+06   
max    601281.000000   2010.000000    123.000000  31694.000000  1.231201e+07   

           timestop        perobs       perstop  compyear  comppct  \
count  11825.000000  11825.000000  11825.000000   11825.0  11825.0   
mean    1413.725497      2.654391      5.649556       0.0      0.0   
std      744.726359     

In [None]:
# 3.- Data preprocessing
# 3.1 Obtain a list of all categorical variables with between 2 and 99 categories (inclusive). (hint: these are the categorical type variables

# Object-dtype columns with between 2 and 99 distinct values (inclusive)
# are treated as the categorical variables.
variables_categoricas = [
    columna
    for columna in df.columns
    if df[columna].dtype == 'object' and 2 <= df[columna].nunique() <= 99
]

print("**The following is a list of the categorical variables:**\n", variables_categoricas)

# Take an explicit copy: later cells assign into df_categoricas, and writing
# into a bare selection of df raises SettingWithCopyWarning.
df_categoricas = df[variables_categoricas].copy()
print("\n**Information on categorical variables:**")
df_categoricas.info()


In [None]:
# 3.2 Replace the following missing classes:
# if any category of the columns officrid, offshld or offverb is equal to “” change it to 'N' and otherwise leave it as 'Y'.

# Maps one cell to 'N' (value missing) or 'Y' (value present).
def cambiar_categoria(categoria):
    """Return 'N' when ``categoria`` is missing and 'Y' otherwise.

    A value counts as missing when it is null (None / NaN / pd.NA) or a
    string that is empty or made up only of whitespace. Checking only for
    the exact one-space string would label "" and NaN as 'Y'.
    """
    if isinstance(categoria, str):
        return 'N' if categoria.strip() == '' else 'Y'
    return 'N' if pd.isna(categoria) else 'Y'

columnas_oficial = ['officrid', 'offshld', 'offverb']

# Show the three columns and their distinct values before recoding.
print("Columns before applying replacement.")
print(df_categoricas[columnas_oficial])
print(df_categoricas[columnas_oficial].apply(pd.unique))
print("******************************************************************")

# Recode every cell with cambiar_categoria (defined at the top of this cell).
recodificadas = df_categoricas[columnas_oficial].map(cambiar_categoria)
df_categoricas[columnas_oficial] = recodificadas

# Show the same view again to validate the replacement.
print("Columns after applying the replacement.")
print(df_categoricas[columnas_oficial])
print(df_categoricas[columnas_oficial].apply(pd.unique))


In [None]:
# 3.2 Replace the following missing classes:

# if any category in the sector, trhsloc or beat columns equals “” (or NA, depending on how you have categorized the database),
# change it to 'U' and otherwise keep its value.
# Note, the values mean {N: No, Y: Yes, U: Unknown}

columnas_ubicacion = ['sector', 'trhsloc', 'beat']

# Distinct values of the three columns before the replacement.
print("Columns before applying replacement.")
print(df_categoricas[columnas_ubicacion].apply(pd.unique))
print("******************************************************************")

# The null count in section 2 only reports NaN in premname and stinter, so the
# missing entries of these columns are blank strings and fillna alone would
# leave them untouched. Turn empty / whitespace-only strings into 'U' as well
# as any NaN.
df_categoricas[columnas_ubicacion] = (
    df_categoricas[columnas_ubicacion]
    .replace(r'^\s*$', 'U', regex=True)
    .fillna('U')
)

# Distinct values after the replacement, to validate it.
print("Columns after applying the replacement.")
print(df_categoricas[columnas_ubicacion].apply(pd.unique))


In [None]:
# 3.3 Transform the columns ht_feet together with ht_inch into a single column (of the form “ht_feet.ht_inch”) called 'meters'.
# (hint: transform with the following calculation: meters = (feet+inches)*0.3048)

# Converts a height given as feet plus inches into meters.
def calcular_metros(feet, inch):
    """Return the height in meters for ``feet`` feet and ``inch`` inches.

    Inches are first expressed as a fraction of a foot (inch / 12) and the
    total number of feet is multiplied by 0.3048.
    """
    pies_totales = feet + inch / 12
    return pies_totales * 0.3048

# calcular_metros only uses arithmetic operators, so it can be called on the
# two columns directly; this avoids a slow Python-level loop over every row.
df['meters'] = calcular_metros(df['ht_feet'], df['ht_inch'])

# The original feet / inches columns are no longer needed.
df = df.drop(columns=["ht_feet", "ht_inch"])
print(df.head(3))


In [None]:
# 3.4 Notice that the date comes in a MMDDYYYYYY format in the datestop column. Generate 2 new columns named month and
# year columns with only the month and year respectively.

# datestop is stored as an integer, so dates in months 1-9 have lost their
# leading zero (e.g. 1232009 for 01/23/2009). Restore the fixed 8-digit
# MMDDYYYY width before parsing; with only 7 digits the month/day split is
# ambiguous and can be read as 12/3/2009.
datestop_texto = df['datestop'].astype(str).str.zfill(8)
df['datestop'] = pd.to_datetime(datestop_texto, format='%m%d%Y')

# Only the month is extracted: a 'year' column already exists in the data.
df['month'] = df['datestop'].dt.month
print(df.head(3))


In [None]:
# 3.5 Filter your DataFrame and leave only the columns selected in 3.1, the month, the year, the meters and the age. Then only leave
# the records whose ages are between 18 and 100 years inclusive.

# Keep the categorical columns from 3.1 plus 'month', 'year', 'meters' and 'age'.
columnas_extra = ['month', 'year', 'meters', 'age']
df_filtrado = pd.concat([df_categoricas, df[columnas_extra]], axis=1)

# Keep only the records with ages between 18 and 100 (both inclusive).
# .copy() makes the result an independent frame, because later cells add and
# overwrite columns on df_filtrado.
edad_valida = df_filtrado['age'].between(18, 100)
df_filtrado = df_filtrado.loc[edad_valida].copy()

print("**We visualize the filtered df with the columns categories plus month, year, meters and age:**")
print(df_filtrado.head(3))
print("-------------------------------------------------------")

print("**We visualize the unique values of the df to see if there are still empty values:**")
print(df_filtrado.apply(pd.unique))


In [None]:
# Exploratory analysis
# 4.1 The response variable is studied on its own (arstmade), with the help of a graph.

# Frequency of each value of the response variable, computed once.
conteo_arrestos = df_filtrado['arstmade'].value_counts()
plt.bar(conteo_arrestos.index, conteo_arrestos.values)

plt.title("Proceeding ends in arrest?")
plt.xlabel("arstmade")
plt.ylabel("number of procedures")
plt.show()

# Observation:
# Thanks to the graph we can notice that there is a noticeable difference between the number of procedures that DO NOT end in arrest vs. 
# number that do result in arrests. That is, most of the procedures do not end in arrest.


In [None]:
# 4.2.- Study the relationship of the behavioral response variable with race, comment.

# Contingency table: rows are race categories, columns are the arrest outcome.
tabla_contingencia = pd.crosstab(index=df_filtrado['race'], columns=df_filtrado['arstmade'])
print("Table: Arrests by race")
print(tabla_contingencia)

# Stacked bars give a graphical view of the same table.
tabla_contingencia.plot(kind='bar', stacked=True)
plt.title("Relationship between arrests made and race")
plt.xlabel("Race")
plt.ylabel("number of procedures")
plt.show()

# Observation:
# The table and graph above show that race category B has the highest number of procedures,
# followed by categories Q and W. Categories I, U and Z have the lowest number of procedures.
# Categories B and Q also have the highest number of confirmed arrests, followed by W and P,
# which would indicate that these races are more likely to be arrested (these are raw counts, so
# arrest rates per race would be needed to confirm it).
# In the case of race I, it can be seen that it has 0 arrests.


In [None]:
# 4.3 Study the relationship of the behavioral response variable with sex, and comment.

# Contingency table: rows are sex categories, columns are the arrest outcome.
tabla_sexo = pd.crosstab(df_filtrado['sex'], df_filtrado['arstmade'])
print("Table: Arrests by sex")
print(tabla_sexo)

# Stacked bars for the same table. The title and labels name the variable
# actually plotted (sex).
tabla_sexo.plot(kind='bar', stacked=True)
plt.title("Relationship between arrests made and sex")
plt.xlabel("Sex")
plt.ylabel("number of procedures")
plt.show()

# Observation:
# We can visualize that the majority of the arrests made correspond to persons of the Male sex, followed by the Female sex.
# and then followed by the Z category.


In [None]:
# 4.3 Study the relationship of the behavioral response variable with sex and age as a whole, comment.

plt.figure(figsize=(10, 4))
# One point per procedure: age on x, arrest outcome (arstmade) on y, coloured by sex.
sns.scatterplot(x='age', y='arstmade', hue='sex', data=df_filtrado, palette='bright')
plt.title("Relationship between arrests, gender and age")
plt.xlabel("Age")
# The y axis shows the arrest outcome of each procedure, not a count.
plt.ylabel("Arrest made (arstmade)")

plt.legend(title='Gender', loc='center right')
# Tick every 10 years from 0 up to the largest age present.
plt.xticks(range(0, df_filtrado['age'].max() + 1, 10))
plt.show()

# Observation:
# In the graph we can see that there are few procedures involving people over 80 years of age. We can also see that the procedures that
# do not end in arrest for people over 60 years of age correspond mostly to the male sex.


In [None]:
# 4.4 Recode the response variable to 1 and 0. Where 0 is N and 1 is Y.

# Numeric code for each value of the response variable.
mapeo = {'N': 0, 'Y': 1}

# Add the recoded response as 'arstmade_mapeo' and discard the original column.
df_filtrado = (
    df_filtrado
    .assign(arstmade_mapeo=df_filtrado['arstmade'].map(mapeo))
    .drop(columns=['arstmade'])
)

print("DataFrame with the recoded response variable:")
print(df_filtrado)


In [None]:
# 4.5 Show on a graph the probability that an individual will be arrested, conditional on gender and race. 
# What are the ethical implications of some of the conclusions you observe?

# Mean of the 0/1 response within each (sex, race) group.
tasa_por_grupo = df_filtrado.groupby(['sex', 'race'])['arstmade_mapeo'].mean()
probabilidad = tasa_por_grupo.reset_index()

plt.figure(figsize=(10, 6))
sns.barplot(data=probabilidad, x='race', y='arstmade_mapeo', hue='sex')
plt.title("Probability of arrest by gender and race")
plt.xlabel("Race")
plt.ylabel("Probability of arrest")
plt.legend(title='Gender', loc='center right')
plt.show()

# Observation:
# In the graph we can see that people of race U whose gender is unknown (Z) have a high estimated probability of being arrested,
# as do people of race Q with unknown gender.
# Next come female persons of races B, P, and W.
# The observed arrest probability is 0 for female persons of races U and I.
# It is also 0 for persons of unknown gender belonging to races A, B, P and W.
# Persons of race I have an observed arrest probability of 0 regardless of gender.


In [None]:
# 5.- Determine whether the police procedure will result in any violent action.
# The attributes prefixed with pf (['pf_hands'],['pf_wall'], ['pf_grnd'],['pf_drwep'], ['pf_ptwep'],['pf_baton'],['pf_hcuff'], 
# ['pf_pepsp'], ['pf_pepsp'] and ['pf_other']) 
# indicate whether there was physical force used by the officer at the time of the procedure, marked 'Y'.
# Generate a new variable called 'violence' which is 1 if in any of the 9 pf variables there was any 'Y', and 0 otherwise. 
# Then indicate the percentage of cases that ended with violence.

# All columns whose name starts with 'pf'.
columnas_pf = df_filtrado.filter(regex='^pf')

# 'violencia' is 1 when at least one pf column equals 'Y' in the row, else 0.
# The comparison is done on the whole frame at once instead of row by row.
df_filtrado['violencia'] = columnas_pf.eq('Y').any(axis=1).astype(int)

# Percentage of rows flagged with violence.
porcentaje_violencia = (df_filtrado['violencia'].sum() / len(df_filtrado)) * 100

print("Percentage of cases that ended in violence:", porcentaje_violencia, "%")


In [None]:
# 6.- Modeling
# 6.1 The corresponding dummy variables are generated. Then the train-test sets are generated using the year 2009 for training, 
# and the year 2010 for testing.

# Final clean-up of the data before modelling.

# Cells of addrtyp and beat that are exactly one space become the category "N".
for columna in ('addrtyp', 'beat'):
    df_filtrado[columna] = df_filtrado[columna].replace(" ", "N")

# Remove the blanks embedded inside the remaining beat values.
df_filtrado['beat'] = df_filtrado['beat'].str.replace(' ', '')

# Strip every string cell, turn empty / whitespace-only cells into missing
# values, discard the rows that contain any missing value, and finally drop
# city, dettypcm and linecm, which do not provide information for the model.
df_filtrado = (
    df_filtrado
    .map(lambda valor: valor.strip() if isinstance(valor, str) else valor)
    .replace(r'^\s*$', pd.NA, regex=True)
    .dropna()
    .drop(columns=['city', 'dettypcm', 'linecm'])
)

df_filtrado.info()
df_filtrado.apply(pd.unique)


In [None]:
# 6.- Modeling
# 6.1 The corresponding dummy variables are generated. Then the train-test sets are generated using the year 2009 for training, 
# and the year 2010 for testing.

# Drop the individual pf_* columns (treated as redundant at this point).
pf_redundantes = ['pf_hands', 'pf_wall', 'pf_grnd', 'pf_drwep', 'pf_ptwep',
                  'pf_baton', 'pf_hcuff', 'pf_pepsp', 'pf_other']
df_filtrado = df_filtrado.drop(columns=pf_redundantes)

# Collapse the additional-circumstances (ac_*) columns into a single flag, the
# same way the pf_* columns were collapsed into 'violencia':
# AC = 1 when at least one ac_* column equals 'Y' in the row, 0 otherwise.
ac_indicadores = ['ac_rept', 'ac_inves', 'ac_proxm', 'ac_evasv', 'ac_assoc',
                  'ac_cgdir', 'ac_incid', 'ac_time', 'ac_stsnd', 'ac_other']
df_filtrado['AC'] = df_filtrado[ac_indicadores].eq('Y').any(axis=1).astype(int)

# The individual ac_* columns are now summarised by AC.
df_filtrado = df_filtrado.drop(columns=ac_indicadores)

df_filtrado.info()


In [None]:
# Collapse the rf_* columns into a single flag:
# RF = 1 when at least one rf_* column equals 'Y' in the row, 0 otherwise.
rf_indicadores = ['rf_vcrim', 'rf_othsw', 'rf_attir', 'rf_vcact', 'rf_rfcmp',
                  'rf_verbl', 'rf_knowl', 'rf_furt', 'rf_bulg']
df_filtrado['RF'] = df_filtrado[rf_indicadores].eq('Y').any(axis=1).astype(int)

# The individual rf_* columns are now summarised by RF.
df_filtrado = df_filtrado.drop(columns=rf_indicadores)

df_filtrado.info()

In [None]:
# Collapse the cs_* columns into a single flag:
# CS = 1 when at least one cs_* column equals 'Y' in the row, 0 otherwise.
cs_indicadores = ['cs_objcs', 'cs_descr', 'cs_casng', 'cs_lkout', 'cs_cloth',
                  'cs_drgtr', 'cs_furtv', 'cs_vcrim', 'cs_bulge', 'cs_other']
df_filtrado['CS'] = df_filtrado[cs_indicadores].eq('Y').any(axis=1).astype(int)

# The individual cs_* columns are now summarised by CS.
df_filtrado = df_filtrado.drop(columns=cs_indicadores)

df_filtrado.info()


In [None]:
# Collapse the sb_* columns into a single flag:
# SB = 1 when at least one sb_* column equals 'Y' in the row, 0 otherwise.
sb_indicadores = ['sb_hdobj', 'sb_outln', 'sb_admis', 'sb_other']
df_filtrado['SB'] = df_filtrado[sb_indicadores].eq('Y').any(axis=1).astype(int)

# The individual sb_* columns are now summarised by SB.
df_filtrado = df_filtrado.drop(columns=sb_indicadores)

df_filtrado.info()


In [None]:
# Recode every two-valued categorical column to 0 / 1.

# Columns whose two categories are 'Y' and 'N'.
columnas_y_n = ['explnstp', 'othpers', 'sumissue', 'offunif', 'officrid',
                'frisked', 'searched', 'contrabn', 'pistol', 'riflshot',
                'asltweap', 'knifcuti', 'othrweap', 'radio', 'offverb',
                'offshld']
mapeos_binarios = {columna: {'Y': 1, 'N': 0} for columna in columnas_y_n}

# Columns with their own pair of codes.
mapeos_binarios['recstat'] = {'1': 1, 'A': 0}
# NOTE(review): the second inout category is taken to be the letter 'O';
# mapping the digit '0' would leave that category unrecoded. Confirm against
# the unique values of the column.
mapeos_binarios['inout'] = {'I': 1, 'O': 0}
mapeos_binarios['addrtyp'] = {'L': 1, 'N': 0}

for columna, codigos in mapeos_binarios.items():
    df_filtrado[columna] = df_filtrado[columna].replace(codigos)

df_filtrado

In [None]:
# CS holds a single value for every row, so it is removed.
df_filtrado = df_filtrado.drop(columns=['CS'])
df_filtrado.info()

In [None]:
# 6.- Modeling
# 6.1 The corresponding dummy variables are generated. Then the train-test sets are generated using the year 2009 for training, 
# and the year 2010 for testing.

# With the data clean, one-hot encode the remaining multi-category columns.
listado_dummies = [
    "trhsloc", "typeofid", "sex", "race", "haircolr",
    "eyecolor", "build", "sector", "beat", 'inout',
]
df_clean = pd.get_dummies(df_filtrado, columns=listado_dummies, dtype=int)

# Per-column count of cells that still contain a space (displayed as the cell output).
df_clean.apply(contar_espacios)


In [None]:
# 6.- Modelación
# 6.1 Luego genere los sets de train-test utilizando el año 2009 para entrenar, y el año 2010 para testear.

# Split by year: 2009 is the training set and 2010 is the test set. The year
# column is dropped afterwards because it is constant within each set.
train = df_clean.loc[df_clean['year'] == 2009].drop(columns=['year'])
test = df_clean.loc[df_clean['year'] == 2010].drop(columns=['year'])

# Separate the predictors from the response variable.
X_train = train.drop(columns=['arstmade_mapeo'])
y_train = train['arstmade_mapeo']

X_test = test.drop(columns=['arstmade_mapeo'])
y_test = test['arstmade_mapeo']

# Columns with a single value in the training data cannot help the model.
# They are detected from X_train instead of being listed by hand, so the cell
# keeps working if the set of dummy columns changes.
columnas_constantes = X_train.columns[X_train.nunique() <= 1].tolist()
print("Constant columns removed:", columnas_constantes)

X_train = X_train.drop(columns=columnas_constantes)
X_test = X_test.drop(columns=columnas_constantes)


In [None]:
# 6.- Modeling
# 6.1 Then generate the train-test sets using year 2009 for training, and year 2010 for testing.

# Create the scaler and learn the per-column minimum and maximum from the
# training data only.
scaler = MinMaxScaler()
scaler.fit(X_train)

# Transform both sets with the scaler fitted on the training data.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
#  function that prints the confusion matrix

from sklearn.metrics import classification_report, confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix ``cm`` and draw it as an annotated image.

    cm        : 2-D array of counts, indexed as cm[row, column].
    classes   : labels used for the ticks of both axes.
    normalize : when True, each row is divided by its row total before
                printing and plotting.
    title     : title of the figure.
    cmap      : colormap passed to ``plt.imshow``.
    """
    if not normalize:
        print("Confusion matrix, without normalization")
    else:
        totales_por_fila = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / totales_por_fila
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    posiciones = np.arange(len(classes))
    plt.xticks(posiciones, classes, rotation=45)
    plt.yticks(posiciones, classes)

    # Two decimals for normalized values, plain integers for counts.
    formato = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text, the rest black text.
    umbral = cm.max() / 2.
    for fila in range(cm.shape[0]):
        for columna in range(cm.shape[1]):
            valor = cm[fila, columna]
            color_texto = "white" if valor > umbral else "black"
            plt.text(columna, fila, format(valor, formato),
                     horizontalalignment="center",
                     color=color_texto)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    

In [None]:
# 6.2 Train 4 classification models, report the best model under some criterion. Cross validation is used to test different 
# hyperparameters for each model.

# Model 1

# Random forest with a fixed seed so that the scores can be reproduced.
rf_model = RandomForestClassifier(random_state=42)

# Fit the baseline model on the training data.
rf_model.fit(X_train_scaled, y_train)

# Mean 5-fold cross-validation score of the baseline model.
RFC_CV = cross_val_score(rf_model, X_train_scaled, y_train, cv=5).mean()
print("**Cross-validation of the random forest model:", RFC_CV, "**")

# Hyperparameter search space.
param_dist = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': list(range(50, 1501)),
    'max_depth': list(range(1, 120))
}

# Randomized search: 5 sampled candidates, each evaluated with 5-fold CV.
# random_state fixes which candidates are sampled.
rand_search_rf = RandomizedSearchCV(rf_model,
                                    param_distributions=param_dist,
                                    n_iter=5,
                                    cv=5,
                                    random_state=42)

# Run the search on the training data.
rand_search_rf.fit(X_train_scaled, y_train)

# Best model found by the search, refitted on the whole training set.
best_rf = rand_search_rf.best_estimator_

# The best hyperparameter
print('**The best hyperparameter:',  rand_search_rf.best_params_, "**")

# Predictions with test data
y_pred_tree = best_rf.predict(X_test_scaled)

# Confusion matrix with the positive class (1) in the first row / column.
rf_matrix = confusion_matrix(y_test, y_pred_tree, labels=[1,0])
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(rf_matrix, classes=['arstmade_mapeo=1','arstmade_mapeo=0'],normalize= False,  title='Matriz de confusion')


In [None]:
# Model 2

# Baseline KNN model fitted on the training data.
KNN_C = KNeighborsClassifier()
KNN_C.fit(X_train_scaled, y_train)

# Mean 5-fold cross-validation score of the baseline model.
KNN_CV = cross_val_score(KNN_C, X_train_scaled, y_train, cv=5).mean()
print("Cross-validation of the KNN model:", KNN_CV)

# Hyperparameter search space.
param_grid = {'n_neighbors': [2, 3]}
KNN = KNeighborsClassifier()

# The grid only has as many candidates as n_neighbors values, so n_iter is set
# to that number: asking for more makes scikit-learn warn that the search
# space is smaller than n_iter.
rand_search_knn = RandomizedSearchCV(KNN,
                                     param_distributions=param_grid,
                                     n_iter=len(param_grid['n_neighbors']),
                                     cv=5,
                                     random_state=42)
rand_search_knn.fit(X_train_scaled, y_train)
best_KNN = rand_search_knn.best_estimator_

# The best hyperparameter
print('# The best hyperparameter:',  rand_search_knn.best_params_)

# Predictions with test data
y_pred_knn = best_KNN.predict(X_test_scaled)

# Confusion matrix with the positive class (1) in the first row / column.
knn_matrix = confusion_matrix(y_test, y_pred_knn, labels=[1,0])
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(knn_matrix, classes=['arstmade_mapeo=1','arstmade_mapeo=0'],normalize= False,  title='Matriz de confusion')


In [None]:
# Model 3

# Logistic regression; max_iter is raised to 10000 iterations.
log_reg = LogisticRegression(max_iter=10000)

# Fit the baseline model on the training data.
log_reg.fit(X_train_scaled, y_train)

# Mean 5-fold cross-validation score of the baseline model.
log_reg_cv = cross_val_score(log_reg, X_train_scaled, y_train, cv=5).mean()
print("Cross-validation of the logistic regression model:", log_reg_cv)

# Hyperparameter search space (6 combinations). The search samples 5 of them,
# so random_state is fixed to make the sampled candidates reproducible.
log_reg_param_grid = {'C': [0.1, 1, 10], 'penalty': [None, 'l2']}
rand_search_log_reg = RandomizedSearchCV(log_reg,
                                         param_distributions=log_reg_param_grid,
                                         n_iter=5,
                                         cv=5,
                                         random_state=42)
rand_search_log_reg.fit(X_train_scaled, y_train)
best_log_reg = rand_search_log_reg.best_estimator_

# The best hyperparameter
print('The best hyperparameter for logistic regression:', rand_search_log_reg.best_params_)

# Predictions with test data
y_pred_log_reg = best_log_reg.predict(X_test_scaled)

# Confusion matrix with the positive class (1) in the first row / column.
log_reg_matrix = confusion_matrix(y_test, y_pred_log_reg, labels=[1,0])
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(log_reg_matrix, classes=['arstmade_mapeo=1','arstmade_mapeo=0'],normalize= False,  title='Matriz de confusion')


In [None]:
# Model 4

# Gradient Boosting with a fixed seed so that the scores can be reproduced.
gb_clf = GradientBoostingClassifier(random_state=42)

# Fit the baseline model on the training data.
gb_clf.fit(X_train_scaled, y_train)

# Mean 5-fold cross-validation score of the baseline model.
gb_cv = cross_val_score(gb_clf, X_train_scaled, y_train, cv=5).mean()
print("Cross-validation of Gradient Boosting:", gb_cv)

# Hyperparameter search space (27 combinations); 5 of them are sampled.
gb_param_grid = {'n_estimators': [50, 100, 150],
                 'learning_rate': [0.05, 0.1, 0.2],
                 'max_depth': [3, 5, 7]}
rand_search_gb = RandomizedSearchCV(gb_clf,
                                    param_distributions=gb_param_grid,
                                    n_iter=5,
                                    cv=5,
                                    random_state=42)
rand_search_gb.fit(X_train_scaled, y_train)
best_gb = rand_search_gb.best_estimator_

# The best hyperparameter
print('The best hyperparameter for Gradient Boosting:', rand_search_gb.best_params_)

# Predictions with test data
y_pred_gb = best_gb.predict(X_test_scaled)

# Confusion matrix with the positive class (1) in the first row / column.
gb_matrix = confusion_matrix(y_test, y_pred_gb, labels=[1,0])
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(gb_matrix, classes=['arstmade_mapeo=1','arstmade_mapeo=0'],normalize= False,  title='confusion matrix')


In [None]:
# Comparison of the models

# Classification report of every model on the test set.
rforest_metricas = classification_report(y_test, y_pred_tree)    # model 1: random forest
KKM_metricas = classification_report(y_test, y_pred_knn)         # model 2: KNN
log_reg_metricas = classification_report(y_test, y_pred_log_reg) # model 3: logistic regression
gb_metricas = classification_report(y_test, y_pred_gb)           # model 4: gradient boosting

# Print each report under its heading, in model order.
informes = [
    ("Metric Model 1: Random Forest", rforest_metricas),
    ("Metricas model 2: KNN", KKM_metricas),
    ("Metrics Model 3: Logistic Regression", log_reg_metricas),
    ("Metric model 4: Gradient Boosting", gb_metricas),
]
for encabezado, informe in informes:
    print(encabezado)
    print(informe)
