In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [64]:
df = pd.read_csv('../df_after_dp.csv', parse_dates=['date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187534 entries, 0 to 187533
Data columns (total 36 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   date                              187534 non-null  datetime64[ns]
 1   state                             187534 non-null  object        
 2   city_or_county                    187534 non-null  object        
 3   latitude                          187534 non-null  float64       
 4   longitude                         187534 non-null  float64       
 5   congressional_district            187534 non-null  int64         
 6   avg_age_participants              187534 non-null  float64       
 7   n_participants_child              187534 non-null  int64         
 8   n_participants_teen               187534 non-null  int64         
 9   n_females                         187534 non-null  float64       
 10  n_killed                        

In [65]:
#check if there is any null value
df.isnull().values.any()

False

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187534 entries, 0 to 187533
Data columns (total 36 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   date                              187534 non-null  datetime64[ns]
 1   state                             187534 non-null  object        
 2   city_or_county                    187534 non-null  object        
 3   latitude                          187534 non-null  float64       
 4   longitude                         187534 non-null  float64       
 5   congressional_district            187534 non-null  int64         
 6   avg_age_participants              187534 non-null  float64       
 7   n_participants_child              187534 non-null  int64         
 8   n_participants_teen               187534 non-null  int64         
 9   n_females                         187534 non-null  float64       
 10  n_killed                        

From the previous steps we decided to analyse the incidents from 2014 to 2017, so we don't need to filter the elements.

In [67]:
#Extracting the week number from the date
df["week_number"] = df["date"].dt.isocalendar().week

Group the filtered dataset by city and week, and count the number of incidents in each group.

In [68]:
counts = df.groupby(['city_or_county', 'week_number']).size().reset_index(name='count')


In [69]:
counts

Unnamed: 0,city_or_county,week_number,count
0,Abbeville,2,1
1,Abbeville,3,1
2,Abbeville,6,1
3,Abbeville,7,1
4,Abbeville,8,2
...,...,...,...
65788,Zionville,44,1
65789,Zumbro Falls,27,1
65790,Zuni (Zuni Pueblo),33,1
65791,Zwolle,15,1


In [70]:
city_counts = counts.groupby('city_or_county')['week_number'].nunique().reset_index(name='weeks')
city_counts

Unnamed: 0,city_or_county,weeks
0,Abbeville,20
1,Abbotsford,1
2,Abbott,1
3,Abbottstown,1
4,Aberdeen,30
...,...,...
11578,Zionville,1
11579,Zumbro Falls,1
11580,Zuni (Zuni Pueblo),1
11581,Zwolle,1


In [71]:
weeks_lower_bound = 0.15 * 4 * 52
cities_filtered = city_counts[city_counts['weeks'] > weeks_lower_bound]['city_or_county']
cities_filtered

5            Abilene
48             Aiken
55             Akron
71            Albany
82       Albuquerque
            ...     
11495         Yakima
11526        Yonkers
11528           York
11542     Youngstown
11548      Ypsilanti
Name: city_or_county, Length: 484, dtype: object

In [73]:
counts_filtered = counts[counts['city_or_county'].isin(cities_filtered)]
counts_filtered

Unnamed: 0,city_or_county,week_number,count
53,Abilene,1,7
54,Abilene,2,3
55,Abilene,3,1
56,Abilene,4,3
57,Abilene,5,4
...,...,...,...
65619,Ypsilanti,48,1
65620,Ypsilanti,49,2
65621,Ypsilanti,50,1
65622,Ypsilanti,51,1


Compute a score for each city and week by dividing the number of incidents by the total number of weeks in the 4-year period.


In [44]:
counts['score'] = counts['count'] / (4 * 52)
