In [6]:
import pandas as pd
import folium
from sklearn.cluster import KMeans
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

def cluster_and_plot(data, num_clusters):
    # Select relevant features for clustering (e.g., latitude and longitude)
    coordinates = data[['latitude', 'longitude']]

    # Perform K-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=0)
    kmeans.fit(coordinates)

    # Add cluster labels to the DataFrame
    data['cluster_label'] = kmeans.labels_

    # Initialize Folium map
    map_clusters = folium.Map(location=[data['latitude'].mean(), data['longitude'].mean()], zoom_start=4)

    # Set color scheme for the clusters
    colors = ['red', 'blue', 'green', 'orange', 'purple', 'yellow']

    # Add markers to the map
    for lat, lon, cluster_label in zip(data['latitude'], data['longitude'], data['cluster_label']):
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            color=colors[cluster_label],
            fill=True,
            fill_color=colors[cluster_label],
            fill_opacity=0.7).add_to(map_clusters)

    # Save the map with a filename based on the number of clusters
    map_clusters.save(f'clusters_map_{num_clusters}.html')

    # Count occurrences of each cluster label
    cluster_counts = data['cluster_label'].value_counts()

    # Determine the most targeted cluster
    most_targeted_cluster = cluster_counts.idxmax()

    # Filter data for the most targeted cluster
    most_targeted_data = data[data['cluster_label'] == most_targeted_cluster]

    # Count occurrences of each state in the most targeted cluster
    state_counts = most_targeted_data['state'].value_counts()

    # Determine the most targeted state
    most_targeted_state = state_counts.idxmax()

    # Output the result
    print(f"For {num_clusters} clusters: The most targeted cluster is Cluster {most_targeted_cluster+1}, with the most targeted state being {most_targeted_state}.")

# Load data from CSV file into a DataFrame
data = pd.read_csv('fatal-police-shootings-data-continental_US.csv')

# Check for missing values and handle them if necessary
data.dropna(inplace=True)  # Drop rows with missing values, if needed

# Define a list of numbers of clusters
num_clusters_list = [2, 3, 4, 5, 6]

# Perform clustering and plotting for each number of clusters
for num_clusters in num_clusters_list:
    cluster_and_plot(data, num_clusters)


For 2 clusters: The most targeted cluster is Cluster 1, with the most targeted state being TX.
For 3 clusters: The most targeted cluster is Cluster 3, with the most targeted state being FL.
For 4 clusters: The most targeted cluster is Cluster 2, with the most targeted state being CA.
For 5 clusters: The most targeted cluster is Cluster 2, with the most targeted state being CA.
For 6 clusters: The most targeted cluster is Cluster 4, with the most targeted state being CA.


In [7]:
import pandas as pd
import folium
from sklearn.cluster import KMeans

def cluster_and_plot(data):
    # Perform K-means clustering
    kmeans = KMeans(n_clusters=6, random_state=0)
    kmeans.fit(data[['latitude', 'longitude']])

    # Add cluster labels to the DataFrame
    data['cluster_label'] = kmeans.labels_

    # Initialize Folium map
    map_clusters = folium.Map(location=[data['latitude'].mean(), data['longitude'].mean()], zoom_start=4)

    # Set color scheme for the clusters
    colors = ['red', 'blue', 'green', 'orange', 'purple', 'yellow']

    # Add markers to the map
    for lat, lon, cluster_label in zip(data['latitude'], data['longitude'], data['cluster_label']):
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            color=colors[cluster_label],
            fill=True,
            fill_color=colors[cluster_label],
            fill_opacity=0.7).add_to(map_clusters)

    # Save the map with a filename based on the number of clusters
    map_clusters.save(f'clusters_map23456.html')

    # Group the data by cluster, state, age, manner of death, and gender, and count occurrences
    correlation_data = data.groupby(['cluster_label', 'state', 'age', 'manner_of_death', 'gender']).size().reset_index(name='count')

    # Sort the data by count in descending order
    correlation_data.sort_values(by='count', ascending=False, inplace=True)

    # Display the correlations for each cluster
    for cluster in range(6):
        cluster_data = correlation_data[correlation_data['cluster_label'] == cluster]
        print(f"\nCluster {cluster+1} (Color: {colors[cluster]}):")
        print("\n{:<10} {:<20} {:<20} {:<20} {:<10}".format('State', 'Age', 'Manner of Death', 'Gender', 'Count'))
        for state in cluster_data['state'].unique():
            state_cluster_data = cluster_data[cluster_data['state'] == state]
            age_range = f"{state_cluster_data['age'].min()} - {state_cluster_data['age'].max()}"
            genders = state_cluster_data['gender'].unique()
            gender_str = " & ".join(genders) if len(genders) > 1 else genders[0]
            print("{:<10} {:<20} {:<20} {:<20} {:<10}".format(state, age_range, state_cluster_data.iloc[0]['manner_of_death'], gender_str, state_cluster_data['count'].sum()))

# Load data from CSV file into a DataFrame
data = pd.read_csv('fatal-police-shootings-data-continental_US.csv')

# Check for missing values and handle them if necessary
data.dropna(inplace=True)  # Drop rows with missing values, if needed

# Perform clustering and plotting
cluster_and_plot(data)



Cluster 1 (Color: red):

State      Age                  Manner of Death      Gender               Count     
TX         6.0 - 84.0           shot                 M & F                438       
OK         16.0 - 83.0          shot                 M & F                151       
LA         6.0 - 71.0           shot                 M & F                99        
AR         16.0 - 83.0          shot                 M                    62        
MO         16.0 - 68.0          shot                 M & F                71        
KS         18.0 - 67.0          shot                 M & F                53        
NE         23.0 - 57.0          shot                 M & F                23        
MS         19.0 - 54.0          shot                 M & F                25        
TN         19.0 - 49.0          shot                 M                    10        
CA         59.0 - 59.0          shot                 M                    1         
IA         28.0 - 28.0          shot   

In [10]:
import pandas as pd
import folium
from sklearn.cluster import KMeans
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# Load data from CSV file into a DataFrame
data = pd.read_csv('fatal-police-shootings-data-continental_US.csv')

# Convert date column to datetime type
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Drop rows with missing or invalid dates
data.dropna(subset=['date'], inplace=True)

# Extract year from the date
data['year'] = data['date'].dt.year

# Group data by year and state, count the occurrences, and reset index
year_state_counts = data.groupby(['year', 'state']).size().reset_index(name='count')

# Find the top five states for each year
top_states_by_year = year_state_counts.groupby('year').apply(lambda x: x.nlargest(5, 'count')).reset_index(drop=True)

# Initialize an empty dictionary to store the top cities for each state and year
top_cities_by_year = {}

# Iterate over each row of the top states DataFrame
for index, row in top_states_by_year.iterrows():
    state_data = data[(data['state'] == row['state']) & (data['year'] == row['year'])]
    top_cities_by_year.setdefault(row['year'], []).append((row['state'], state_data))

# Perform clustering and generate maps for each year
def generate_maps():
    for year, state_data_list in top_cities_by_year.items():
        # Initialize Folium map centered on the United States
        map = folium.Map(location=[37.0902, -95.7129], zoom_start=4)

        # Perform K-means clustering for each state's data
        for state, state_data in state_data_list:
            # Drop rows with NaN values
            state_data.dropna(subset=['latitude', 'longitude'], inplace=True)

            # Check if there are any remaining rows after dropping NaN values
            if not state_data.empty:
                # Select relevant features for clustering
                coordinates = state_data[['latitude', 'longitude']]

                # Convert coordinates to a numpy array
                coordinates_array = coordinates.values

                # Perform K-means clustering
                kmeans = KMeans(n_clusters=5, random_state=0)
                kmeans.fit(coordinates_array)

                # Get cluster centers
                cluster_centers = kmeans.cluster_centers_

                # Get cluster labels for each data point
                labels = kmeans.labels_

                # Add cluster labels to the DataFrame
                state_data['cluster_label'] = labels

                # Define colors for markers
                colors = ['red', 'blue', 'green', 'orange', 'purple']

                # Add markers for each cluster
                for cluster_label, cluster_center in enumerate(cluster_centers):
                    cluster_points = state_data[state_data['cluster_label'] == cluster_label]
                    for _, row in cluster_points.iterrows():
                        folium.CircleMarker(
                            location=[row['latitude'], row['longitude']],
                            radius=5,
                            color=colors[cluster_label],
                            fill=True,
                            fill_color=colors[cluster_label],
                            fill_opacity=0.7
                        ).add_to(map)

        # Save the map as an HTML file with the year
        map.save(f'clusters_map_{year}.html')

        # Print states and cities for each year
        print(f"States and cities according to year {year}:")
        for state_data in state_data_list:
            state = state_data[0]
            cities = state_data[1]['city'].unique()[:3]  # Select top 3 cities
            print(f"{state} ({year}): {', '.join(cities)}")

# Generate maps for each year and print states and cities
generate_maps()


States and cities according to year 2015:
CA (2015): Eureka, Weitchpec, San Francisco
TX (2015): El Paso, West Odessa, Odessa
FL (2015): Navarre, Tallahassee, North Port
AZ (2015): Kingman, Prescott, Sun City
OK (2015): Lawton, Oklahoma City, Healdton
States and cities according to year 2016:
CA (2016): Yreka, San Francisco, Redding
TX (2016): El Paso, Odessa, Midland
FL (2016): Milton, Noma, Lealman
AZ (2016): Yuma, Bullhead City, Phoenix
CO (2016): Grand Junction, Montrose County, Boulder
States and cities according to year 2017:
CA (2017): Arcata, Santa Rosa, San Francisco
TX (2017): El Paso, Amarillo, Abilene
FL (2017): Cantonment, Bradenton, Ocala
OH (2017): Springfield, Englewood, Vandalia
CO (2017): Pleasant View, Golden, Loveland
States and cities according to year 2018:
CA (2018): Nice, San Francisco, Redding
FL (2018): Tarpon Springs, Tampa, Lawtey
TX (2018): El Paso, Hereford, Midland
AZ (2018): Yuma, Quartzsite, Kingman
GA (2018): Rossville, Floyd County, LaGrange
States an

In [19]:
import pandas as pd
import folium
from sklearn.cluster import KMeans

# Load data from CSV file into a DataFrame
data = pd.read_csv('fatal-police-shootings-data-continental_US.csv')

# Convert date column to datetime type
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Drop rows with missing or invalid dates
data.dropna(subset=['date'], inplace=True)

# Extract year from the date
data['year'] = data['date'].dt.year

# Group data by state and city, count the occurrences, and reset index
state_city_counts = data.groupby(['state', 'city']).size().reset_index(name='count')

# Find the top five cities overall
top_cities = state_city_counts.nlargest(5, 'count')

# Perform K-means clustering
kmeans = KMeans(n_clusters=1, random_state=0)

# Initialize Folium map centered on the United States
map = folium.Map(location=[37.0902, -95.7129], zoom_start=4)

# Perform K-means clustering for the top cities
for index, row in top_cities.iterrows():
    city_data = data[(data['state'] == row['state']) & (data['city'] == row['city'])]
    city_data.dropna(subset=['latitude', 'longitude'], inplace=True)  # Drop rows with missing latitude or longitude
    coordinates = city_data[['latitude', 'longitude']]
    if not coordinates.empty:
        kmeans.fit(coordinates)
        cluster_center = kmeans.cluster_centers_[0]

        # Create a string with city and its incidents
        city_info = f"<b>{row['city']}, {row['state']}</b><br>{row['count']} incidents"

        # Add custom HTML to display city and its incidents as labels next to marker
        html = f'<div style="font-family: Arial; font-size: 12px">{city_info}</div>'
        folium.Marker(
            location=cluster_center,
            icon=folium.DivIcon(html=html)
        ).add_to(map)

        print(f"Processing {row['city']}, {row['state']} - {row['count']} incidents")

        # Add marker for each city
        for _, city_row in city_data.iterrows():
            folium.Marker(
                location=[city_row['latitude'], city_row['longitude']],
                popup=city_row['city'],
                icon=folium.Icon(color='red', icon='info-sign')
            ).add_to(map)

# Save the map as an HTML file
map.save('clusters_map_overall.html')


Processing Los Angeles, CA - 44 incidents
Processing Phoenix, AZ - 32 incidents
Processing Las Vegas, NV - 32 incidents
Processing Houston, TX - 32 incidents
Processing Oklahoma City, OK - 23 incidents


In [21]:
import pandas as pd
import folium
from sklearn.cluster import KMeans

# Load data from CSV file into a DataFrame
data = pd.read_csv('fatal-police-shootings-data-continental_US.csv')

# Convert date column to datetime type
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Drop rows with missing or invalid dates
data.dropna(subset=['date'], inplace=True)

# Extract year from the date
data['year'] = data['date'].dt.year

# Group data by state and city, count the occurrences, and reset index
state_city_counts = data.groupby(['state', 'city']).size().reset_index(name='count')

# Find the top five cities overall
top_cities = state_city_counts.nlargest(5, columns='count')

# Perform K-means clustering
kmeans = KMeans(n_clusters=1, random_state=0)

# Initialize Folium map centered on the United States
map = folium.Map(location=[37.0902, -95.7129], zoom_start=4)

# Perform K-means clustering for the top cities
for index, row in top_cities.iterrows():
    city_data = data[(data['state'] == row['state']) & (data['city'] == row['city'])]
    city_data.dropna(subset=['latitude', 'longitude'], inplace=True)  # Drop rows with missing latitude or longitude
    coordinates = city_data[['latitude', 'longitude']]
    if not coordinates.empty:
        kmeans.fit(coordinates)
        cluster_center = kmeans.cluster_centers_[0]

        # Count race occurrences
        race_counts = city_data['race'].value_counts()
        race_info = ', '.join([f"{race}: {count}" for race, count in race_counts.items()])

        # Create a string with city, state, and race incidents
        city_info = f"<b>{row['city']}, {row['state']}</b><br>{row['count']} incidents ({race_info})"

        # Add custom HTML to display city, state, and race incidents as labels next to marker
        html = f'<div style="font-family: Arial; font-size: 12px">{city_info}</div>'
        folium.Marker(
            location=cluster_center,
            icon=folium.DivIcon(html=html)
        ).add_to(map)

        print(f"Processing {row['city']}, {row['state']} - {row['count']} incidents ({race_info})")

        # Add marker for each city
        for _, city_row in city_data.iterrows():
            folium.Marker(
                location=[city_row['latitude'], city_row['longitude']],
                popup=f"{city_row['city']}, {city_row['state']}",
                icon=folium.Icon(color='red', icon='info-sign')
            ).add_to(map)

# Save the map as an HTML file
map.save('clusters_map_overall.html')


Processing Los Angeles, CA - 44 incidents (H: 19, B: 12, W: 7, A: 1)
Processing Phoenix, AZ - 32 incidents (B: 7, H: 7, W: 7)
Processing Las Vegas, NV - 32 incidents (W: 9, H: 8, B: 3)
Processing Houston, TX - 32 incidents (B: 13, H: 5, A: 2, W: 1)
Processing Oklahoma City, OK - 23 incidents (B: 9, W: 7, N: 2, H: 1)
