In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 100)

In [2]:
# Reading the Data
df = pd.read_csv('./CrimesPR.csv',encoding='latin-1',delimiter=';')

In [3]:
def clean_data(df):

    # Replacing "-" by Null Values
    df = df.replace("-",np.nan)

    # converting all the columns, except "Localidade"to the float type because 
    # some features were, originally, as "object" type.
    for col in df.drop(columns=['Localidade']).columns:
        df[col] = df[col].astype(float)
    # dropping features with too much Null values
    df = df.drop(columns=['Índice de Desenvolvimento Humano Municipal (IDHM) '])
    df = df.drop(columns=['Produto Interno Bruto (PIB) per Capita (R$ 1,00)','Roubos de Veículos '])

    # sorting the Data
    df = df.sort_values(by=['Localidade','Ano'])

    # Let us fill Null values of each column with the median of each column
    # grouped by Locality. The remaining null values after these transformations
    # must be dropped, since no data was found in these rows.
    list_cols = df.drop(columns=['Localidade']).columns
    def g(df):
        return df.fillna(df.groupby(by=['Localidade'],as_index=False)[list_cols].transform('median'))

    df = df.groupby(by=['Localidade'],as_index=False).apply(g)
    # There are 30 remaining Null Values, which must be dropped.
    df = df.dropna()

    return df

In [4]:
# Applying the function that cleans the data
df = clean_data(df)

In [5]:
# Verifying Null values along each column
df.isnull().sum()

Ano                                              0
Localidade                                       0
Crimes de Ameaça                                 0
Crimes de Estelionato                            0
Crimes de Estupro                                0
Crimes de Furto                                  0
Crimes de Lesão Corporal                         0
Crimes de Roubo                                  0
Furtos de Veículos                               0
Ocorrências Envolvendo Tráfico de Drogas         0
Ocorrências Envolvendo Uso/Consumo de Drogas     0
Perturbação do Sossego/Tranquilidade             0
dtype: int64

In [6]:
# Getting information about the type of each column
# in order to verify it they are in an appropriate format
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2370 entries, (0, 1) to (399, 2015)
Data columns (total 12 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Ano                                            2370 non-null   float64
 1   Localidade                                     2370 non-null   object 
 2   Crimes de Ameaça                               2370 non-null   float64
 3   Crimes de Estelionato                          2370 non-null   float64
 4   Crimes de Estupro                              2370 non-null   float64
 5   Crimes de Furto                                2370 non-null   float64
 6   Crimes de Lesão Corporal                       2370 non-null   float64
 7   Crimes de Roubo                                2370 non-null   float64
 8   Furtos de Veículos                             2370 non-null   float64
 9   Ocorrências Envolvendo Tráfico de Drogas

In [7]:
# Exporting the DataFrame to .csv
# This data can be used to create
#df.to_csv('./cleaned_crimesPR.csv',index=False)