---

# Splitting Label data from the entire feature data set

This lets us keep track of which events are the "target" events and which ones are only useful context for predicting the target events.

---

Load the cleaned data

---

In [9]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import folium
from matplotlib import colors
from geopy.distance import geodesic

#%pip install folium pandas matplotlib geopy

In [10]:
# Read the cleaned event table from disk.
# NOTE(review): the displayed frame carries an 'Unnamed: 0' column, which suggests the
# CSV was written with its index; index_col=0 may be appropriate here, but confirm that
# nothing downstream relies on that column before changing it.
data = pd.read_csv(filepath_or_buffer='csv/data_cleaned.csv')

# Preview the top of the frame as a quick sanity check
data.head(n=5)

Unnamed: 0,Date,EventCode,ActionGeo_FullName,ActionGeo_Lat,ActionGeo_Long,AvgTone
0,2024-08-23,145,"Union Park, Illinois, United States",41.8839,-87.6648,-3.046968
1,2024-08-22,145,"Union Park, Illinois, United States",41.8839,-87.6648,0.0
2,2024-08-20,145,"Union Park, Illinois, United States",41.8839,-87.6648,-4.319654
3,2024-08-20,145,"Union Park, Illinois, United States",41.8839,-87.6648,-4.319654
4,2024-06-27,145,"Buckingham Fountain, Illinois, United States",41.8756,-87.6189,-7.052186


---

# Location

Let's unpack the significance of the location of our data.

---

In [11]:
# Read the target-area corner coordinates from disk
target_location = pd.read_csv(filepath_or_buffer='csv/target_location.csv')

# Preview the top of the frame to check the corner columns loaded as expected
target_location.head(n=5)

Unnamed: 0,start_lat,start_lon,end_lat,end_lon
0,41.865756,-87.684843,41.918433,-87.617952


In [12]:
# Take the first row's value from each corner column of target_location.
# The start/end pair is later used as two opposite corners of a bounding box.
start_lat, start_lon = (target_location[col].iloc[0] for col in ('start_lat', 'start_lon'))
end_lat, end_lon = (target_location[col].iloc[0] for col in ('end_lat', 'end_lon'))

# Echo the four coordinates so they can be checked by eye
print(f"Start Latitude: {start_lat}")
print(f"Start Longitude: {start_lon}")
print(f"End Latitude: {end_lat}")
print(f"End Longitude: {end_lon}")

Start Latitude: 41.8657555523296
Start Longitude: -87.68484261360975
End Latitude: 41.91843294658821
End Longitude: -87.61795160281115


In [13]:
# Build a folium map centred on the mean latitude/longitude of all events
map_center = [data['ActionGeo_Lat'].mean(), data['ActionGeo_Long'].mean()]
m = folium.Map(location=map_center, zoom_start=12)

# Every event marker shares the same colour, so set it once instead of once per row
color = 'red'

# Add one circle per event. itertuples() is used rather than iterrows() because it
# avoids constructing a Series for every row; only four scalar fields are read here.
for row in data.itertuples(index=False):
    folium.Circle(
        location=[row.ActionGeo_Lat, row.ActionGeo_Long],
        radius=100,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        popup=f"Date: {row.Date}, EventCode: {row.EventCode}, AvgTone: {row.AvgTone}"
    ).add_to(m)

# Blue outline: the target area, given by its two corner points
folium.Rectangle(
    bounds=[[start_lat, start_lon], [end_lat, end_lon]],
    color='blue',
    fill=False
).add_to(m)

# Extent of the whole data set: [min lat, min long] to [max lat, max long]
data_bounds = [[data['ActionGeo_Lat'].min(), data['ActionGeo_Long'].min()],
               [data['ActionGeo_Lat'].max(), data['ActionGeo_Long'].max()]]

# Green outline: the extent of all events, for comparison with the target area
folium.Rectangle(
    bounds=data_bounds,
    color='green',
    fill=False
).add_to(m)

# Last expression of the cell, so the notebook renders the map
m


---

# Target Data vs Whole Dataset

Let's make sure we split our intended events from the entire data set.

---

In [14]:
# Per-axis masks; between() includes both endpoints by default, matching >= / <=
in_lat_range = data['ActionGeo_Lat'].between(start_lat, end_lat)
in_lon_range = data['ActionGeo_Long'].between(start_lon, end_lon)

# Keep only the events whose coordinates fall inside the target bounding box
target_data = data[in_lat_range & in_lon_range]

# Show the filtered frame
target_data

Unnamed: 0,Date,EventCode,ActionGeo_FullName,ActionGeo_Lat,ActionGeo_Long,AvgTone
0,2024-08-23,145,"Union Park, Illinois, United States",41.8839,-87.6648,-3.046968
1,2024-08-22,145,"Union Park, Illinois, United States",41.8839,-87.6648,0.000000
2,2024-08-20,145,"Union Park, Illinois, United States",41.8839,-87.6648,-4.319654
3,2024-08-20,145,"Union Park, Illinois, United States",41.8839,-87.6648,-4.319654
4,2024-06-27,145,"Buckingham Fountain, Illinois, United States",41.8756,-87.6189,-7.052186
...,...,...,...,...,...,...
3306,2016-03-20,145,"University Of Illinois At Chicago, Illinois, U...",41.8720,-87.6492,-7.417219
3308,2016-03-20,145,"University Of Illinois At Chicago, Illinois, U...",41.8720,-87.6492,-7.417219
3335,2016-03-13,145,"University Of Illinois At Chicago, Illinois, U...",41.8720,-87.6492,-8.571429
3338,2016-03-13,145,"University Of Illinois At Chicago, Illinois, U...",41.8720,-87.6492,-8.571429


---

Let's save the targeted data for reference later.

---

In [15]:
# Persist the filtered events to disk; index=False leaves the DataFrame index out of the file
target_data.to_csv(path_or_buf='csv/target_data.csv', index=False)

---

Quick comparison. I noticed later in this project that the target data stopped at the spike in mid-2024. I don't know why.

Let's pull this info.

---

In [16]:
# Smallest value in the Date column of the filtered events.
# NOTE(review): Date looks like ISO 'YYYY-MM-DD' text in the displayed output, in which
# case the minimum is also the earliest date; confirm the column's dtype.
first_date = target_data.loc[:, 'Date'].min()
print(f"The first date in the target data is: {first_date}")

The first date in the target data is: 2015-12-25
