# Gun Violence in the US: Data Cleaning

In [1]:
# import dependencies
import pandas as pd

In [2]:
# read tSV file (tab separated)
file_path = "../Resources/gun-violence-data_01-2013_03-2018.csv"
df = pd.read_csv(file_path, encoding="utf-8")
df.head()

Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.gunviolencearchive.org/incident/461105,http://www.post-gazette.com/local/south/2013/0...,False,...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Female,0::Julian Sims,,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://pittsburgh.cbslocal.com/2013/01/01/4-pe...,,
1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.gunviolencearchive.org/incident/460726,http://www.dailybulletin.com/article/zz/201301...,False,...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male,0::Bernard Gillis,,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://losangeles.cbslocal.com/2013/01/01/man-...,62.0,35.0
2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,http://www.gunviolencearchive.org/incident/478855,http://chronicle.northcoastnow.com/2013/02/14/...,False,...,0::25||1::31||2::33||3::34||4::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male,0::Damien Bell||1::Desmen Noble||2::Herman Sea...,,"0::Injured, Unharmed, Arrested||1::Unharmed, A...",0::Subject-Suspect||1::Subject-Suspect||2::Vic...,http://www.morningjournal.com/general-news/201...,56.0,13.0
3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.gunviolencearchive.org/incident/478925,http://www.dailydemocrat.com/20130106/aurora-s...,False,...,0::29||1::33||2::56||3::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Male||2::Male||3::Male,0::Stacie Philbrook||1::Christopher Ratliffe||...,,0::Killed||1::Killed||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://denver.cbslocal.com/2013/01/06/officer-...,40.0,28.0
4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.gunviolencearchive.org/incident/478959,http://www.journalnow.com/news/local/article_d...,False,...,0::18||1::46||2::14||3::47,0::Adult 18+||1::Adult 18+||2::Teen 12-17||3::...,0::Female||1::Male||2::Male||3::Female,0::Danielle Imani Jameison||1::Maurice Eugene ...,3::Family,0::Injured||1::Injured||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://myfox8.com/2013/01/08/update-mother-sho...,62.0,27.0


In [3]:
# Getting list of column names 
df.columns.to_list()

['incident_id',
 'date',
 'state',
 'city_or_county',
 'address',
 'n_killed',
 'n_injured',
 'incident_url',
 'source_url',
 'incident_url_fields_missing',
 'congressional_district',
 'gun_stolen',
 'gun_type',
 'incident_characteristics',
 'latitude',
 'location_description',
 'longitude',
 'n_guns_involved',
 'notes',
 'participant_age',
 'participant_age_group',
 'participant_gender',
 'participant_name',
 'participant_relationship',
 'participant_status',
 'participant_type',
 'sources',
 'state_house_district',
 'state_senate_district']

In [4]:
# dropping unimportant variables
cols_to_drop = ['city_or_county', 'address', 'incident_url', 'source_url', 'incident_url_fields_missing',
               'congressional_district', 'gun_stolen',  'gun_type', 'location_description', 'n_guns_involved','notes',
               'participant_name', 'participant_relationship', 'participant_status', 'participant_type', 'sources',
               'state_house_district', 'state_senate_district', 'participant_age', 
                'participant_age_group', 'participant_gender']
df.drop(columns=cols_to_drop, inplace=True)
df.head()

Unnamed: 0,incident_id,date,state,n_killed,n_injured,incident_characteristics,latitude,longitude
0,461105,2013-01-01,Pennsylvania,0,4,Shot - Wounded/Injured||Mass Shooting (4+ vict...,40.3467,-79.8559
1,460726,2013-01-01,California,1,3,"Shot - Wounded/Injured||Shot - Dead (murder, a...",33.909,-118.333
2,478855,2013-01-01,Ohio,1,3,"Shot - Wounded/Injured||Shot - Dead (murder, a...",41.4455,-82.1377
3,478925,2013-01-05,Colorado,4,0,"Shot - Dead (murder, accidental, suicide)||Off...",39.6518,-104.802
4,478959,2013-01-07,North Carolina,2,2,"Shot - Wounded/Injured||Shot - Dead (murder, a...",36.114,-79.9569


### Processing the 'incident_characteristics' column

In [5]:
# Removing unwanted text
df['incident_characteristics'] = df['incident_characteristics'].str.replace(r'\(s\)', '', regex=True) 
df['incident_characteristics'] = df['incident_characteristics'].str.replace(r'\(.*?\)', '', regex=True)
df['incident_characteristics'] = df['incident_characteristics'].str.replace(r'\|\|', ',', regex=True)
df['incident_characteristics'] = df['incident_characteristics'].str.replace(r'\|', ',', regex=True) 
df['incident_characteristics'] = df['incident_characteristics'].str.replace('^', '') 

df.head()

Unnamed: 0,incident_id,date,state,n_killed,n_injured,incident_characteristics,latitude,longitude
0,461105,2013-01-01,Pennsylvania,0,4,"Shot - Wounded/Injured,Mass Shooting ,Possessi...",40.3467,-79.8559
1,460726,2013-01-01,California,1,3,"Shot - Wounded/Injured,Shot - Dead ,Mass Shoot...",33.909,-118.333
2,478855,2013-01-01,Ohio,1,3,"Shot - Wounded/Injured,Shot - Dead ,Shots Fire...",41.4455,-82.1377
3,478925,2013-01-05,Colorado,4,0,"Shot - Dead ,Officer Involved Incident,Officer...",39.6518,-104.802
4,478959,2013-01-07,North Carolina,2,2,"Shot - Wounded/Injured,Shot - Dead ,Suicide,Mu...",36.114,-79.9569


In [6]:
# Splitting the 'incident_characteristics' column 
df_split = df['incident_characteristics'].str.split(',', expand=True).fillna('0')
df_split.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,Shot - Wounded/Injured,Mass Shooting,Possession,Possession of gun by felon or prohibited person,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Shot - Wounded/Injured,Shot - Dead,Mass Shooting,Gang involvement,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Shot - Wounded/Injured,Shot - Dead,Shots Fired - No Injuries,Bar/club incident - in or around establishment,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Shot - Dead,Officer Involved Incident,Officer Involved Shooting - subject/suspect/pe...,Drug involvement,Kidnapping/abductions/hostage,Under the influence of alcohol or drugs,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Shot - Wounded/Injured,Shot - Dead,Suicide,Murder/Suicide,Attempted Murder/Suicide,Domestic Violence,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Checking values in columns
for i in range(0,20):
    print(f'Column #{i}')
    print('-' * 30)        
    print(df_split[i].value_counts())
    print('~' * 50)

Column #0
------------------------------
Shot - Wounded/Injured                                           93926
Shot - Dead                                                      45054
Non-Shooting Incident                                            41541
Shots Fired - No Injuries                                        33949
Armed robbery with injury/death and/or evidence of DGU found      7528
Institution/Group/Business                                        4316
TSA Action                                                        2653
Brandishing/flourishing/open carry/lost/found                     1350
Home Invasion                                                     1323
Possession                                                        1310
Drug involvement                                                  1005
Shots fired                                                        961
Gun stolen from owner                                              834
Officer Involved Incident           

Name: 13, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Column #14
------------------------------
0                                                                239659
Brandishing/flourishing/open carry/lost/found                         5
Stolen/Illegally owned gun{s} recovered during arrest/warrant         3
Domestic Violence                                                     2
Possession of gun by felon or prohibited person                       2
Gun stolen from owner                                                 1
Under the influence of alcohol or drugs                               1
Defensive Use - Stand Your Ground/Castle Doctrine established         1
Pistol-whipping                                                       1
Concealed Carry License - Victim                                      1
Guns stolen from law enforcement                                      1
Name: 14, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Column #15
-------

In [8]:
# Renaming/Recategorizing Incidents in the original df, and then will split again
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Possession of gun by felon or prohibited person', 'Possession')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Shot - Wounded/Injured', 'Shooting')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Shots Fired - No Injuries', 'Shooting')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Shot - Dead', 'Shooting')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Accidental Shooting', 'Shooting')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Armed robbery with injury/death and/or evidence of DGU found', 'Armed robbery')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Bar/club incident - in or around establishment', 'Bar/club incident') 
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Officer Involved Shooting - subject/suspect/perpetrator killed', 'Officer Involved Incident') 
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Attempted Murder/Suicide', 'Murder/Suicide')   
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Home Invasion - Resident killed', 'Home Invasion') 
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Officer Involved Shooting - subject/suspect/perpetrator shot', 'Officer Involved Incident')  
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Officer Involved Shooting - Officer shot', 'Officer Involved Incident') 
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Officer Involved Shooting - Officer killed', 'Officer Involved Incident')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Officer Involved Shooting - Accidental discharge - no injury required', 'Officer Involved Incident')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('School Shooting - elementary/secondary school', 'School Shooting')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Suicide', 'Suicide/Attempt')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Suicide - Attempt', 'Suicide/Attempt')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Gun at school', 'School Incident')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Shots fired', 'Shooting')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Brandishing/flourishing/open carry/lost/found', 'Brandishing Gun')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Home Invasion - No death or injury', 'Home Invasion')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Home Invasion - Resident injured', 'Home Invasion')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Shooting - Injury', 'Shooting')
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Officer Involved Incident - Weapon involved but no shots fired', 'Officer Involved Incident') 
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace('Officer Involved Shooting - Shooting', 'Officer Involved Incident') 
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace(r'\s*,\s*', ',', regex=True) # removing spaces before and after commas
df['incident_characteristics'] = df['incident_characteristics']\
.str.replace(r'\s+', ' ', regex=True) # removing any extra number of spaces with only one space 
df['incident_characteristics'] = df['incident_characteristics']\
.str.strip() # removing spaces in the begining and end of the string

df.head()

Unnamed: 0,incident_id,date,state,n_killed,n_injured,incident_characteristics,latitude,longitude
0,461105,2013-01-01,Pennsylvania,0,4,"Shooting,Mass Shooting,Possession,Possession",40.3467,-79.8559
1,460726,2013-01-01,California,1,3,"Shooting,Shooting,Mass Shooting,Gang involvement",33.909,-118.333
2,478855,2013-01-01,Ohio,1,3,"Shooting,Shooting,Shooting,Bar/club incident",41.4455,-82.1377
3,478925,2013-01-05,Colorado,4,0,"Shooting,Officer Involved Incident,Officer Inv...",39.6518,-104.802
4,478959,2013-01-07,North Carolina,2,2,"Shooting,Shooting,Suicide/Attempt,Murder/Suici...",36.114,-79.9569


In [9]:
# Re-splitting and checking values in columns
df_split = df['incident_characteristics'].str.split(',', expand=True).fillna('0')
for i in range(0,20):
    print(f'Column #{i}')
    print('-' * 30)        
    print(df_split[i].value_counts()[0:10])
    print('~' * 50)

Column #0
------------------------------
Shooting                      174119
Non-Shooting Incident          41541
Armed robbery                   7528
Institution/Group/Business      4316
TSA Action                      2653
Possession                      1373
Brandishing Gun                 1350
Home Invasion                   1323
Drug involvement                1005
Gun stolen from owner            834
Name: 0, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Column #1
------------------------------
0                                  96995
Shooting                           16689
Officer Involved Incident          14419
Drive-by                           11873
Drug involvement                   11716
Home Invasion                       9829
Possession                          8332
Armed robbery                       6438
ATF/LE Confiscation/Raid/Arrest     5915
Domestic Violence                   5690
Name: 1, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [10]:
# Categories keeping
catgories = ['Shooting', 'Armed robbery', 'Possession', 'Home Invasion', 'Drug involvement', 'Officer Involved Incident',
       'Car-jacking', 'Animal shot/killed','Domestic Violence', 'Gang involvement', 'School Incident', 'Drive-by',
       'Brandishing Gun', 'Institution/Group/Business', 'TSA Action']

# looping through the categoy list to create new 0/1 columns

for catg in catgories:
    
    # setting default value of new column to 0
    df_split[catg] = 0
    
    # looping through the splitted columns
    for i in range(0,20): 
        
        # if the value of the column equals the catg name, swtich the new category value to 1
        df_split.loc[df_split[i] == catg, catg] = 1  
        
df_split.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Officer Involved Incident,Car-jacking,Animal shot/killed,Domestic Violence,Gang involvement,School Incident,Drive-by,Brandishing Gun,Institution/Group/Business,TSA Action
0,Shooting,Mass Shooting,Possession,Possession,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Shooting,Shooting,Mass Shooting,Gang involvement,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,Shooting,Shooting,Shooting,Bar/club incident,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Shooting,Officer Involved Incident,Officer Involved Incident,Drug involvement,Kidnapping/abductions/hostage,Under the influence of alcohol or drugs,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,Shooting,Shooting,Suicide/Attempt,Murder/Suicide/Attempt,Murder/Suicide/Attempt,Domestic Violence,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [11]:
# checking the new values
for catg in catgories:
    print(f'category: {catg}')
    print('-' * 50)
    print(df_split[catg].value_counts())
    print('~' * 50)
    

category: Shooting
--------------------------------------------------
1    175725
0     63952
Name: Shooting, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
category: Armed robbery
--------------------------------------------------
0    219954
1     19723
Name: Armed robbery, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
category: Possession
--------------------------------------------------
0    200625
1     39052
Name: Possession, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
category: Home Invasion
--------------------------------------------------
0    229036
1     10641
Name: Home Invasion, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
category: Drug involvement
--------------------------------------------------
0    222578
1     17099
Name: Drug involvement, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
category: Officer Involved Incident
--------------------------------------------------
0  

In [12]:
# Dropping the original columns
cols_to_drop = range(0, 20)
df_split.drop(columns=cols_to_drop, inplace=True)
df_split.head()

Unnamed: 0,Shooting,Armed robbery,Possession,Home Invasion,Drug involvement,Officer Involved Incident,Car-jacking,Animal shot/killed,Domestic Violence,Gang involvement,School Incident,Drive-by,Brandishing Gun,Institution/Group/Business,TSA Action
0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [13]:
# Concatenate the split columns with the original DataFrame
violence_df = pd.concat([df, df_split], axis=1)
violence_df.head()

Unnamed: 0,incident_id,date,state,n_killed,n_injured,incident_characteristics,latitude,longitude,Shooting,Armed robbery,...,Officer Involved Incident,Car-jacking,Animal shot/killed,Domestic Violence,Gang involvement,School Incident,Drive-by,Brandishing Gun,Institution/Group/Business,TSA Action
0,461105,2013-01-01,Pennsylvania,0,4,"Shooting,Mass Shooting,Possession,Possession",40.3467,-79.8559,1,0,...,0,0,0,0,0,0,0,0,0,0
1,460726,2013-01-01,California,1,3,"Shooting,Shooting,Mass Shooting,Gang involvement",33.909,-118.333,1,0,...,0,0,0,0,1,0,0,0,0,0
2,478855,2013-01-01,Ohio,1,3,"Shooting,Shooting,Shooting,Bar/club incident",41.4455,-82.1377,1,0,...,0,0,0,0,0,0,0,0,0,0
3,478925,2013-01-05,Colorado,4,0,"Shooting,Officer Involved Incident,Officer Inv...",39.6518,-104.802,1,0,...,1,0,0,0,0,0,0,0,0,0
4,478959,2013-01-07,North Carolina,2,2,"Shooting,Shooting,Suicide/Attempt,Murder/Suici...",36.114,-79.9569,1,0,...,0,0,0,1,0,0,0,0,0,0


In [14]:
# Dropping the incident_characteristics column
violence_df.drop(columns="incident_characteristics", inplace=True)
violence_df.head()

Unnamed: 0,incident_id,date,state,n_killed,n_injured,latitude,longitude,Shooting,Armed robbery,Possession,...,Officer Involved Incident,Car-jacking,Animal shot/killed,Domestic Violence,Gang involvement,School Incident,Drive-by,Brandishing Gun,Institution/Group/Business,TSA Action
0,461105,2013-01-01,Pennsylvania,0,4,40.3467,-79.8559,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,460726,2013-01-01,California,1,3,33.909,-118.333,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,478855,2013-01-01,Ohio,1,3,41.4455,-82.1377,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,478925,2013-01-05,Colorado,4,0,39.6518,-104.802,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4,478959,2013-01-07,North Carolina,2,2,36.114,-79.9569,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [15]:
violence_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239677 entries, 0 to 239676
Data columns (total 22 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   incident_id                 239677 non-null  int64  
 1   date                        239677 non-null  object 
 2   state                       239677 non-null  object 
 3   n_killed                    239677 non-null  int64  
 4   n_injured                   239677 non-null  int64  
 5   latitude                    231754 non-null  float64
 6   longitude                   231754 non-null  float64
 7   Shooting                    239677 non-null  int64  
 8   Armed robbery               239677 non-null  int64  
 9   Possession                  239677 non-null  int64  
 10  Home Invasion               239677 non-null  int64  
 11  Drug involvement            239677 non-null  int64  
 12  Officer Involved Incident   239677 non-null  int64  
 13  Car-jacking   

In [19]:
# Changing date column to datetime format
violence_df['date'] = pd.to_datetime(violence_df['date'])
violence_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239677 entries, 0 to 239676
Data columns (total 22 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   incident_id                 239677 non-null  int64         
 1   date                        239677 non-null  datetime64[ns]
 2   state                       239677 non-null  object        
 3   n_killed                    239677 non-null  int64         
 4   n_injured                   239677 non-null  int64         
 5   latitude                    231754 non-null  float64       
 6   longitude                   231754 non-null  float64       
 7   Shooting                    239677 non-null  int64         
 8   Armed robbery               239677 non-null  int64         
 9   Possession                  239677 non-null  int64         
 10  Home Invasion               239677 non-null  int64         
 11  Drug involvement            239677 non-

In [20]:
# extracting the year and the month as new columns
violence_df['year'] = violence_df['date'].dt.year
violence_df['month'] = violence_df['date'].dt.month_name()
violence_df.tail()

Unnamed: 0,incident_id,date,state,n_killed,n_injured,latitude,longitude,Shooting,Armed robbery,Possession,...,Animal shot/killed,Domestic Violence,Gang involvement,School Incident,Drive-by,Brandishing Gun,Institution/Group/Business,TSA Action,year,month
239672,1083142,2018-03-31,Louisiana,0,0,,,1,0,0,...,0,0,0,0,0,0,0,0,2018,March
239673,1083139,2018-03-31,Louisiana,1,0,31.7537,-93.0836,1,0,0,...,0,0,0,0,0,0,1,0,2018,March
239674,1083151,2018-03-31,Louisiana,0,1,29.9239,-90.0442,1,0,0,...,0,0,0,0,0,0,0,0,2018,March
239675,1082514,2018-03-31,Texas,1,0,29.7201,-95.611,1,0,0,...,0,0,0,0,0,0,0,0,2018,March
239676,1081940,2018-03-31,Maine,2,0,44.7293,-69.7691,1,0,0,...,0,1,0,0,0,0,0,0,2018,March


In [22]:
# Total number of victims
violence_df['n_killed_or_injured'] = violence_df['n_killed'] + violence_df['n_injured'] 
violence_df.tail()

Unnamed: 0,incident_id,date,state,n_killed,n_injured,latitude,longitude,Shooting,Armed robbery,Possession,...,Domestic Violence,Gang involvement,School Incident,Drive-by,Brandishing Gun,Institution/Group/Business,TSA Action,year,month,n_killed_or_injured
239672,1083142,2018-03-31,Louisiana,0,0,,,1,0,0,...,0,0,0,0,0,0,0,2018,March,0
239673,1083139,2018-03-31,Louisiana,1,0,31.7537,-93.0836,1,0,0,...,0,0,0,0,0,1,0,2018,March,1
239674,1083151,2018-03-31,Louisiana,0,1,29.9239,-90.0442,1,0,0,...,0,0,0,0,0,0,0,2018,March,1
239675,1082514,2018-03-31,Texas,1,0,29.7201,-95.611,1,0,0,...,0,0,0,0,0,0,0,2018,March,1
239676,1081940,2018-03-31,Maine,2,0,44.7293,-69.7691,1,0,0,...,1,0,0,0,0,0,0,2018,March,2


In [23]:
# Getting list of column names 
violence_df.columns.to_list()

['incident_id',
 'date',
 'state',
 'n_killed',
 'n_injured',
 'latitude',
 'longitude',
 'Shooting',
 'Armed robbery',
 'Possession',
 'Home Invasion',
 'Drug involvement',
 'Officer Involved Incident',
 'Car-jacking',
 'Animal shot/killed',
 'Domestic Violence',
 'Gang involvement',
 'School Incident',
 'Drive-by',
 'Brandishing Gun',
 'Institution/Group/Business',
 'TSA Action',
 'year',
 'month',
 'n_killed_or_injured']

In [24]:
# Define the desired column order
column_order = ['incident_id', 'date','year','month','state', 'latitude', 'longitude',
                'n_killed', 'n_injured', 'n_killed_or_injured', 
                'Shooting', 'Armed robbery', 'Possession', 'Home Invasion', 'Drug involvement',
                'Officer Involved Incident', 'Car-jacking', 'Animal shot/killed',
                'Domestic Violence', 'Gang involvement', 'School Incident', 'Drive-by',
                'Brandishing Gun', 'Institution/Group/Business', 'TSA Action']
# Reorder the columns in the DataFrame
violence_df = violence_df.reindex(columns=column_order)
violence_df.head()

Unnamed: 0,incident_id,date,year,month,state,latitude,longitude,n_killed,n_injured,n_killed_or_injured,...,Officer Involved Incident,Car-jacking,Animal shot/killed,Domestic Violence,Gang involvement,School Incident,Drive-by,Brandishing Gun,Institution/Group/Business,TSA Action
0,461105,2013-01-01,2013,January,Pennsylvania,40.3467,-79.8559,0,4,4,...,0,0,0,0,0,0,0,0,0,0
1,460726,2013-01-01,2013,January,California,33.909,-118.333,1,3,4,...,0,0,0,0,1,0,0,0,0,0
2,478855,2013-01-01,2013,January,Ohio,41.4455,-82.1377,1,3,4,...,0,0,0,0,0,0,0,0,0,0
3,478925,2013-01-05,2013,January,Colorado,39.6518,-104.802,4,0,4,...,1,0,0,0,0,0,0,0,0,0
4,478959,2013-01-07,2013,January,North Carolina,36.114,-79.9569,2,2,4,...,0,0,0,1,0,0,0,0,0,0


In [25]:
# exporting the cleaned data to csv
violence_df.to_csv("../Resources/gun_violence_2013to2018_cleaned.csv", encoding="utf-8", index=False, header=True)