## Examining the dataset

In [1]:
# Import the pandas library as pd
import pandas as pd

In [2]:
# Read 'RI-clean.csv' into a DataFrame named ri.
# nrows=50000 loads only the first 50,000 data rows of the file.
# low_memory=False has pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype inference within a column.
ri = pd.read_csv('RI-clean.csv', nrows=50000, low_memory=False)

In [4]:
# Preview the first five rows of the DataFrame
ri.head(n=5)

Unnamed: 0,id,state,stop_date,stop_time,location_raw,county_name,county_fips,fine_grained_location,police_department,driver_gender,...,search_conducted,search_type_raw,search_type,contraband_found,stop_outcome,is_arrested,stop_duration,out_of_state,drugs_related_stop,district
0,RI-2005-00001,RI,2005-01-02,01:55,Zone K1,,,,600,M,...,False,,,False,Citation,False,0-15 Min,False,False,Zone K1
1,RI-2005-00002,RI,2005-01-02,20:30,Zone X4,,,,500,M,...,False,,,False,Citation,False,16-30 Min,False,False,Zone X4
2,RI-2005-00003,RI,2005-01-04,11:30,Zone X1,,,,0,,...,False,,,False,,,,,False,Zone X1
3,RI-2005-00004,RI,2005-01-04,12:55,Zone X4,,,,500,M,...,False,,,False,Citation,False,0-15 Min,False,False,Zone X4
4,RI-2005-00005,RI,2005-01-06,01:30,Zone X4,,,,500,M,...,False,,,False,Citation,False,0-15 Min,False,False,Zone X4


In [6]:
# Tally the missing values per column (isna flags each NaN cell, sum counts them)
print(ri.isna().sum())

id                           0
state                        0
stop_date                    0
stop_time                    0
location_raw                 0
county_name              50000
county_fips              50000
fine_grained_location    50000
police_department            0
driver_gender             1990
driver_age_raw            1971
driver_age                2208
driver_race_raw           1988
driver_race               1988
violation_raw             1988
violation                 1988
search_conducted             0
search_type_raw          47988
search_type              47988
contraband_found             0
stop_outcome              1988
is_arrested               1988
stop_duration             1988
out_of_state              2203
drugs_related_stop           0
district                     0
dtype: int64


## Dropping columns

In [8]:
# Examine the shape of the DataFrame as (number of rows, number of columns)
print(ri.shape)

(50000, 26)


In [9]:
# Remove the 'county_name' and 'state' columns from ri in place.
# NOTE(review): the null counts above show 'county_fips' and
# 'fine_grained_location' are also missing in every row - consider whether
# they should be dropped as well.
ri.drop(columns=['county_name', 'state'], inplace=True)

In [10]:
# Examine the shape of the DataFrame (again) to confirm two columns were removed
print(ri.shape)

(50000, 24)


## Dropping rows

In [11]:
# Discard, in place, every row whose 'driver_gender' value is missing
ri.dropna(axis='index', subset=['driver_gender'], inplace=True)

In [12]:
# Tally the missing values per column (again) to see what the row drop removed
print(ri.isna().sum())

id                           0
stop_date                    0
stop_time                    0
location_raw                 0
county_fips              48010
fine_grained_location    48010
police_department            0
driver_gender                0
driver_age_raw               0
driver_age                 232
driver_race_raw              0
driver_race                  0
violation_raw                0
violation                    0
search_conducted             0
search_type_raw          45998
search_type              45998
contraband_found             0
stop_outcome                 0
is_arrested                  0
stop_duration                0
out_of_state               215
drugs_related_stop           0
district                     0
dtype: int64


In [13]:
# Examine the shape of the DataFrame to see how many rows remain after
# dropping the rows with a missing 'driver_gender'
print(ri.shape)

(48010, 24)


## Fixing a data type

In [14]:
# Show the first few values of the 'is_arrested' column
print(ri['is_arrested'].head())

0    False
1    False
3    False
4    False
5    False
Name: is_arrested, dtype: object


In [15]:
# Report the dtype currently used to store 'is_arrested'
print(ri['is_arrested'].dtype)

object


In [16]:
# Change the data type of 'is_arrested' from object to 'bool'.
# Casting to bool silently turns any missing value (NaN) into True, so verify
# that the column has no missing values left before converting it.
assert ri['is_arrested'].notna().all(), "'is_arrested' still contains missing values"
ri['is_arrested'] = ri['is_arrested'].astype('bool')

In [17]:
# Report the dtype of 'is_arrested' (again) to confirm the conversion
print(ri['is_arrested'].dtype)

bool


## Combining object columns

In [18]:
# Join each row's 'stop_date' and 'stop_time' strings with a single space
combined = ri['stop_date'].str.cat(ri['stop_time'], sep=' ')

In [19]:
# Convert 'combined' to datetime format and store it in a new
# 'stop_datetime' column. No explicit format is given, so pandas infers it.
# NOTE(review): the values shown look like 'YYYY-MM-DD HH:MM'; passing
# format='%Y-%m-%d %H:%M' would make the parsing explicit - confirm that
# every row matches before adding it.
ri['stop_datetime'] = pd.to_datetime(combined)

In [20]:
# Examine the data types of the DataFrame; the new 'stop_datetime' column
# should be listed with a datetime64 dtype
print(ri.dtypes)

id                               object
stop_date                        object
stop_time                        object
location_raw                     object
county_fips                     float64
fine_grained_location           float64
police_department                object
driver_gender                    object
driver_age_raw                  float64
driver_age                      float64
driver_race_raw                  object
driver_race                      object
violation_raw                    object
violation                        object
search_conducted                   bool
search_type_raw                  object
search_type                      object
contraband_found                   bool
stop_outcome                     object
is_arrested                        bool
stop_duration                    object
out_of_state                     object
drugs_related_stop                 bool
district                         object
stop_datetime            datetime64[ns]


In [21]:
# Make 'stop_datetime' the index of ri, in place; drop=True moves the column
# into the index rather than keeping a copy among the regular columns
ri.set_index(keys='stop_datetime', drop=True, inplace=True)

In [22]:
# Examine the index, which is now built from the 'stop_datetime' values
print(ri.index)

DatetimeIndex(['2005-01-02 01:55:00', '2005-01-02 20:30:00',
               '2005-01-04 12:55:00', '2005-01-06 01:30:00',
               '2005-01-12 08:05:00', '2005-01-18 08:15:00',
               '2005-01-18 17:13:00', '2005-01-23 23:15:00',
               '2005-01-24 20:32:00', '2005-02-09 03:05:00',
               ...
               '2006-08-08 22:22:00', '2006-08-08 22:25:00',
               '2006-08-08 22:30:00', '2006-08-08 22:30:00',
               '2006-08-08 22:45:00', '2006-08-08 22:45:00',
               '2006-08-08 22:45:00', '2006-08-08 22:53:00',
               '2006-08-08 23:00:00', '2006-08-08 23:00:00'],
              dtype='datetime64[ns]', name='stop_datetime', length=48010, freq=None)


In [23]:
# Examine the columns; 'stop_datetime' is no longer listed because it is
# now the index
print(ri.columns)

Index(['id', 'stop_date', 'stop_time', 'location_raw', 'county_fips',
       'fine_grained_location', 'police_department', 'driver_gender',
       'driver_age_raw', 'driver_age', 'driver_race_raw', 'driver_race',
       'violation_raw', 'violation', 'search_conducted', 'search_type_raw',
       'search_type', 'contraband_found', 'stop_outcome', 'is_arrested',
       'stop_duration', 'out_of_state', 'drugs_related_stop', 'district'],
      dtype='object')
