In [78]:
import pandas as pd

# Load the homicide dataset, trying a sequence of encodings until one decodes
# the file without error.
CSV_PATH = "/content/homicide-data.csv"

# Order matters. ISO-8859-1 assigns a character to every possible byte, so it
# can never raise UnicodeDecodeError and any encoding listed after it would be
# unreachable. Windows-1252 leaves a handful of bytes undefined and therefore
# can fail, so it is tried before the catch-all ISO-8859-1.
CANDIDATE_ENCODINGS = ('utf-8', 'Windows-1252', 'ISO-8859-1')

for encoding in CANDIDATE_ENCODINGS:
    try:
        df = pd.read_csv(CSV_PATH, encoding=encoding)
        break
    except UnicodeDecodeError:
        # This encoding cannot decode the file; fall through to the next one.
        continue

# 'df' now holds the parsed file and is used by the cells below.


In [79]:
# Preview the first rows to check that the file parsed into the expected columns.
df.head()

Unnamed: 0,uid,reported_date,victim_last,victim_first,victim_race,victim_age,victim_sex,city,state,lat,lon,disposition
0,Alb-000001,20100504,GARCIA,JUAN,Hispanic,78,Male,Albuquerque,NM,35.095788,-106.538555,Closed without arrest
1,Alb-000002,20100216,MONTOYA,CAMERON,Hispanic,17,Male,Albuquerque,NM,35.05681,-106.715321,Closed by arrest
2,Alb-000003,20100601,SATTERFIELD,VIVIANA,White,15,Female,Albuquerque,NM,35.086092,-106.695568,Closed without arrest
3,Alb-000004,20100101,MENDIOLA,CARLOS,Hispanic,32,Male,Albuquerque,NM,35.078493,-106.556094,Closed by arrest
4,Alb-000005,20100102,MULA,VIVIAN,White,72,Female,Albuquerque,NM,35.130357,-106.580986,Closed without arrest


In [80]:
# Convert victim_age to a numeric dtype. With errors='coerce', any value that
# cannot be parsed as a number becomes NaN instead of raising.
numeric_age = pd.to_numeric(df['victim_age'], errors='coerce')
df['victim_age'] = numeric_age

In [81]:
def age_to_slab(age):
    """Return the age-bracket label for a numeric age.

    Brackets are scanned in ascending order and the first one whose inclusive
    upper bound is greater than or equal to ``age`` is returned. A value that
    matches no bracket is labelled "Unknown"; this covers ages above 110 and
    NaN, which compares False against every bound.
    """
    # (inclusive upper bound, label) pairs, in ascending order.
    brackets = (
        (20, "0-20"),
        (30, "21-30"),
        (40, "31-40"),
        (50, "41-50"),
        (60, "51-60"),
        (70, "61-70"),
        (80, "71-80"),
        (90, "81-90"),
        (100, "91-100"),
        (110, "101-110"),
    )
    for upper_bound, label in brackets:
        if age <= upper_bound:
            return label
    return "Unknown"

In [82]:
# Label each row with its age bracket by running age_to_slab over victim_age.
age_labels = df['victim_age'].apply(age_to_slab)
df['age_slab'] = age_labels

In [83]:
# Preview again to confirm the age_slab column was added.
df.head()

Unnamed: 0,uid,reported_date,victim_last,victim_first,victim_race,victim_age,victim_sex,city,state,lat,lon,disposition,age_slab
0,Alb-000001,20100504,GARCIA,JUAN,Hispanic,78.0,Male,Albuquerque,NM,35.095788,-106.538555,Closed without arrest,71-80
1,Alb-000002,20100216,MONTOYA,CAMERON,Hispanic,17.0,Male,Albuquerque,NM,35.05681,-106.715321,Closed by arrest,0-20
2,Alb-000003,20100601,SATTERFIELD,VIVIANA,White,15.0,Female,Albuquerque,NM,35.086092,-106.695568,Closed without arrest,0-20
3,Alb-000004,20100101,MENDIOLA,CARLOS,Hispanic,32.0,Male,Albuquerque,NM,35.078493,-106.556094,Closed by arrest,31-40
4,Alb-000005,20100102,MULA,VIVIAN,White,72.0,Female,Albuquerque,NM,35.130357,-106.580986,Closed without arrest,71-80


In [84]:
# Drop the report date and the victim's last/first name columns.
columns_to_remove = ['reported_date', 'victim_last', 'victim_first']
df = df.drop(columns=columns_to_remove)

In [85]:
# Preview again to confirm the dropped columns are gone.
df.head()

Unnamed: 0,uid,victim_race,victim_age,victim_sex,city,state,lat,lon,disposition,age_slab
0,Alb-000001,Hispanic,78.0,Male,Albuquerque,NM,35.095788,-106.538555,Closed without arrest,71-80
1,Alb-000002,Hispanic,17.0,Male,Albuquerque,NM,35.05681,-106.715321,Closed by arrest,0-20
2,Alb-000003,White,15.0,Female,Albuquerque,NM,35.086092,-106.695568,Closed without arrest,0-20
3,Alb-000004,Hispanic,32.0,Male,Albuquerque,NM,35.078493,-106.556094,Closed by arrest,31-40
4,Alb-000005,White,72.0,Female,Albuquerque,NM,35.130357,-106.580986,Closed without arrest,71-80


In [86]:
# Count the missing values in each column.
df.isnull().sum()

uid               0
victim_race       0
victim_age     2999
victim_sex        0
city              0
state             0
lat              60
lon              60
disposition       0
age_slab          0
dtype: int64

In [87]:
# List the distinct values of victim_race.
df['victim_race'].unique()

array(['Hispanic', 'White', 'Other', 'Black', 'Asian', 'Unknown'],
      dtype=object)

In [88]:
# List the distinct values of victim_sex.
df['victim_sex'].unique()

array(['Male', 'Female', 'Unknown'], dtype=object)

In [89]:
# List the distinct values of city.
df['city'].unique()

array(['Albuquerque', 'Atlanta', 'Baltimore', 'Baton Rouge', 'Birmingham',
       'Boston', 'Buffalo', 'Charlotte', 'Chicago', 'Cincinnati',
       'Columbus', 'Dallas', 'Denver', 'Detroit', 'Durham', 'Fort Worth',
       'Fresno', 'Houston', 'Indianapolis', 'Jacksonville', 'Kansas City',
       'Las Vegas', 'Long Beach', 'Los Angeles', 'Louisville', 'Memphis',
       'Miami', 'Milwaukee', 'Minneapolis', 'Nashville', 'New Orleans',
       'New York', 'Oakland', 'Oklahoma City', 'Omaha', 'Philadelphia',
       'Phoenix', 'Pittsburgh', 'Richmond', 'San Antonio', 'Sacramento',
       'Savannah', 'San Bernardino', 'San Diego', 'San Francisco',
       'St. Louis', 'Stockton', 'Tampa', 'Tulsa', 'Washington'],
      dtype=object)

In [90]:
# List the distinct values of state.
# NOTE(review): the displayed output includes 'wI' while every other code is
# upper case -- this looks like a casing inconsistency in the data; consider
# normalising with .str.upper() before grouping on this column.
df['state'].unique()

array(['NM', 'GA', 'MD', 'LA', 'AL', 'MA', 'NY', 'NC', 'IL', 'OH', 'TX',
       'CO', 'MI', 'CA', 'IN', 'FL', 'MO', 'NV', 'KY', 'TN', 'wI', 'MN',
       'OK', 'NE', 'PA', 'AZ', 'VA', 'DC'], dtype=object)

In [91]:
# Drop the numeric victim_age column; the bracket label in age_slab is kept.
df = df.drop(columns='victim_age')

In [92]:
# List the distinct values of disposition.
df['disposition'].unique()

array(['Closed without arrest', 'Closed by arrest', 'Open/No arrest'],
      dtype=object)

In [93]:
# Preview the frame after victim_age was dropped.
df.head()

Unnamed: 0,uid,victim_race,victim_sex,city,state,lat,lon,disposition,age_slab
0,Alb-000001,Hispanic,Male,Albuquerque,NM,35.095788,-106.538555,Closed without arrest,71-80
1,Alb-000002,Hispanic,Male,Albuquerque,NM,35.05681,-106.715321,Closed by arrest,0-20
2,Alb-000003,White,Female,Albuquerque,NM,35.086092,-106.695568,Closed without arrest,0-20
3,Alb-000004,Hispanic,Male,Albuquerque,NM,35.078493,-106.556094,Closed by arrest,31-40
4,Alb-000005,White,Female,Albuquerque,NM,35.130357,-106.580986,Closed without arrest,71-80


In [94]:
# The state column contains a mis-cased code ('wI') among otherwise upper-case
# two-letter codes. Upper-case the column so each state has a single spelling
# in the exported file.
df['state'] = df['state'].str.upper()

# Write the cleaned frame to CSV without the index column.
# NOTE(review): the file name mentions victim races, but the whole cleaned
# frame is written -- confirm the name is intended before renaming, since
# other code may read this path.
df.to_csv('distinct_victim_races.csv', index=False)