In [1]:
import pandas as pd

# Locations of the two input tables, relative to the working directory.
MACHINE_ANNOTATION_CSV = 'data/news/all_articles_combined_machine_annotation.csv'
CRIME_POP_AGG_CSV = 'data/crime_pop_agg.csv'

# Load both inputs; later cells filter, aggregate and merge them.
df_machine_annotated = pd.read_csv(MACHINE_ANNOTATION_CSV)
df_crime_pop_agg = pd.read_csv(CRIME_POP_AGG_CSV)


In [2]:
# Location codes that are kept: the 50 US states plus DC.
states = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL',
    'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME',
    'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH',
    'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI',
    'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
]

# Discard every row whose location is not one of the codes above.
is_us_state = df_machine_annotated['location'].isin(states)
df_machine_annotated = df_machine_annotated.loc[is_us_state]

# Count the rows for each (location, date) pair.
df_frequency = (
    df_machine_annotated
    .groupby(['location', 'date'])
    .size()
    .rename('frequency')
    .reset_index()
)

# Parse the 'YYYY/MM' strings so the covered period can be determined.
df_frequency['date_dt'] = pd.to_datetime(df_frequency['date'], format='%Y/%m')

# Earliest and latest month present in the counts.
month_starts = df_frequency['date_dt']
min_date = month_starts.min()
max_date = month_starts.max()

# Every month start between the two bounds, rendered back to 'YYYY/MM'.
all_dates = pd.date_range(min_date, max_date, freq='MS')
all_dates_str = [month.strftime('%Y/%m') for month in all_dates]

# Full grid: one row per (location × date) combination.
grid_index = pd.MultiIndex.from_product(
    [states, all_dates_str],
    names=['location', 'date'],
)
complete_grid = grid_index.to_frame(index=False)

# Attach the observed counts; combinations with no rows come back as NaN.
df_frequency_filled = pd.merge(
    complete_grid,
    df_frequency[['location', 'date', 'frequency']],
    how='left',
    on=['location', 'date'],
)

# A missing count means zero rows for that pair, so store 0 as an integer.
df_frequency_filled = (
    df_frequency_filled
    .fillna({'frequency': 0})
    .astype({'frequency': int})
)


In [3]:
# Rewrite the crime/population dates as 'YYYY/MM' strings so they line up
# with the 'date' column of df_frequency_filled.
crime_dates = pd.to_datetime(df_crime_pop_agg['date'])
df_crime_pop_agg['date'] = crime_dates.dt.strftime('%Y/%m')

# Left join: every (location, date) row of the frequency grid is kept, and
# crime/population columns are attached where state_code and date match.
df_merged = pd.merge(
    df_frequency_filled,
    df_crime_pop_agg,
    how='left',
    left_on=['location', 'date'],
    right_on=['state_code', 'date'],
)
df_merged.head()

Unnamed: 0,location,date,frequency,state,state_code,crime_type,crime_count,Clearances,year,population,total_population_at_date,total_crime_count_at_date,crime_rate,crime_count_standardized,crime_count_weighted,crime_relative_intensity,crime_rate_normalized,crime_count_normalized
0,AL,2021/11,0,Alabama,AL,Aggravated Assault Reported by Population,1165.0,353.0,2021.0,5049.196,332099.76,103323.0,23072.980332,-0.140585,17.712489,0.741609,0.082262,0.009142
1,AL,2021/12,0,Alabama,AL,Aggravated Assault Reported by Population,1233.0,394.0,2021.0,5049.196,332099.76,106837.0,24419.729399,-0.133327,18.746351,0.75908,0.087063,0.009676
2,AL,2022/01,0,Alabama,AL,Aggravated Assault Reported by Population,1214.0,417.0,2022.0,5076.181,334017.321,128093.0,23915.616878,-0.135355,18.449593,0.623627,0.085266,0.009527
3,AL,2022/02,0,Alabama,AL,Aggravated Assault Reported by Population,1090.0,417.0,2022.0,5076.181,334017.321,120218.0,21472.835582,-0.148591,16.565121,0.596608,0.076557,0.008554
4,AL,2022/03,1,Alabama,AL,Aggravated Assault Reported by Population,1236.0,440.0,2022.0,5076.181,334017.321,144091.0,24349.01356,-0.133007,18.783935,0.564434,0.086811,0.009699


In [4]:
# Write the merged table to disk, leaving out the DataFrame index.
OUTPUT_CSV = 'data/crime_pop_freq.csv'
df_merged.to_csv(path_or_buf=OUTPUT_CSV, index=False)
