In [2]:
import sys

# Setup the environment if running in Google Colab
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    ! pip install -r /content/drive/MyDrive/Tübingen/Sem1/DLit/Project/requirements.txt


import os
import re
import json
import requests
import PyPDF2
import urllib.request
import tueplots
from tueplots import bundles
from tueplots.constants.color import rgb
from bs4 import BeautifulSoup
from scipy import stats

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from bayes_opt import BayesianOptimization

# Make the project's local modules importable from ../src/.
# NOTE(review): this is a relative path, so which directory it points to
# depends on the working directory at the time `maps` is imported.
sys.path.append('../src/')
import maps
from maps import CONCERNS_MAP, ACTIVITIES_MAP, AGE, SEX, STATECODE

# In Colab, switch the working directory to the project folder on the mounted Drive.
# NOTE(review): this chdir runs *after* `import maps`, so the '../src/' entry above
# was resolved against the pre-chdir working directory — confirm the Colab path
# actually finds `maps`, and that later '../...' paths are meant to be relative to
# this folder.
if IN_COLAB:
    os.chdir('/content/drive/MyDrive/Tübingen/Sem1/DLit/Project')

In [3]:
# Set the data directory.
# The CSV is expected at <DATA_DIR>/<year>/nov<yy>pub.csv, where <yy> is the
# two-digit year (e.g. ../dat/2021/nov21pub.csv).
DATA_DIR = '../dat/'
year = '2021'
CSV_FILE_PATH = os.path.join(DATA_DIR, f"{year}/nov{year[2:]}pub.csv")

# Read the data.
df21 = pd.read_csv(CSV_FILE_PATH)

# Keep only rows with AGE >= 10 and SEX != -1 (-1 presumably marks a missing
# answer — TODO confirm against the codebook).
# * `&` is the element-wise boolean AND; multiplying boolean masks with `*` only
#   works by accident of bool arithmetic and makes pandas warn/fall back when
#   numexpr is in use.
# * `.copy()` turns the filtered slice into an independent frame, so adding
#   columns to `df21` later does not raise SettingWithCopyWarning.
df21 = df21[(df21[AGE] >= 10) & (df21[SEX] != -1)].copy()
columns = df21.columns.to_list()
df21

Unnamed: 0,HRHHID,HRMONTH,HRYEAR4,HURESPLI,HUFINAL,FILLER,HETENURE,HEHOUSUT,HETELHHD,HETELAVL,...,PXEDTRAI,PXEGOVTS,PXUSESVC,PXESRVCS,PXECOMME,PXEGOODS,PXFINANC,PXVOICEA,PXHOMIOT,PWPRMWGT
0,610905110108708,11,2021,1,201,,1,1,1,-1,...,0,0,0,0,0,0,0,0,0,99445490
1,610905110108708,11,2021,1,201,,1,1,1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,0
5,721914005500521,11,2021,1,201,,2,1,1,-1,...,0,0,0,0,0,0,0,0,0,38703072
6,201967201670009,11,2021,1,201,,1,1,1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,0
7,201967201670009,11,2021,1,201,,1,1,1,-1,...,1,1,1,1,1,1,1,1,1,40232619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127369,156485901801505,11,2021,1,201,,1,1,1,-1,...,1,1,1,1,1,1,1,1,1,3958005
127370,156485901801505,11,2021,1,201,,1,1,1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,0
127372,167066394505701,11,2021,1,201,,1,1,1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,0
127373,167066394505701,11,2021,1,201,,1,1,1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,0


In [4]:
# Plotting libraries used by this cell.
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# Translate each row's state code into the label stored in maps.StatesMap.
# Mapping the single column avoids building a full row Series per record (as
# DataFrame.apply(axis=1) does); indexing inside the lambda keeps the original
# fail-loud behaviour (KeyError) for a code that is missing from the map.
df21['state'] = df21[STATECODE].map(lambda code: maps.StatesMap[code])

# Number of (filtered) rows per state; used as the denominator below.
state_population = df21['state'].value_counts().to_dict()

# Rows with HEPSCYBA == 1 per state, and their share of all rows in that state.
# These are raw row counts; no weighting is applied.
crime_per_state = df21.query('HEPSCYBA == 1')['state'].value_counts().to_dict()
crime_per_state_normalized = {key: crime_per_state[key]/state_population[key] for key in crime_per_state}

# End points of the colour scale. Plotly reads the channels of an 'rgb(...)'
# string on a 0-255 scale, so the colours are given as 0-255 integers; the
# previous 0-1 fractions (0.3137.., 0.6666.., ...) rendered as near-black.
crime_color_scale = [(0, 'rgb(80, 170, 200)'), (1.0, 'rgb(175, 110, 150)')]

fig = px.choropleth(locations=list(crime_per_state_normalized.keys()),
                    locationmode="USA-states",
                    color=list(crime_per_state_normalized.values()),
                    scope="usa",
                    color_continuous_scale=crime_color_scale,
                   )

fig.update_layout(
    coloraxis_colorbar=dict(
        title="Crime Rate",
        # The built-in colour-bar title is drawn in white; the visible label is
        # the manually positioned annotation below (presumably to control its
        # placement — confirm before removing either one).
        title_font_color='white',
    ),
    font=dict(
        family="Times, serif",
        color="Black",
        size=18
    ),
    margin=go.layout.Margin(l=0, r=0, b=0, t=0, pad=0.0),  # No margins
    annotations=[
        dict(
            text="Crime Rate",
            font_size=18,
            font_family='Times, serif',
            font_color='black',
            showarrow=False,
            # ^^ appearance
            xref="paper",
            yref="paper",
            x=1.145,
            y=0.99,
            # ^^ position (in paper coordinates)
        )
    ]
)

# Save the figure as a PDF file
pio.write_image(fig, '../res/figures/statewise_distribution.pdf')
fig.show()

In [5]:
# Build the tueplots `icml2022` style bundle (half-column width, 1x1 panel grid,
# LaTeX rendering off) and display the resulting settings dict.
# NOTE(review): the returned dict is only displayed here, not stored or applied —
# presumably it is meant to go into `plt.rcParams.update(...)`; confirm where the
# style is actually activated.
bundles.icml2022(column='half', nrows=1, ncols=1, usetex=False)

{'text.usetex': False,
 'font.serif': ['Times'],
 'mathtext.fontset': 'stix',
 'mathtext.rm': 'Times',
 'mathtext.it': 'Times:italic',
 'mathtext.bf': 'Times:bold',
 'font.family': 'serif',
 'figure.figsize': (3.25, 2.0086104634371584),
 'figure.constrained_layout.use': True,
 'figure.autolayout': False,
 'savefig.bbox': 'tight',
 'savefig.pad_inches': 0.015,
 'font.size': 8,
 'axes.labelsize': 8,
 'legend.fontsize': 6,
 'xtick.labelsize': 6,
 'ytick.labelsize': 6,
 'axes.titlesize': 8}

In [7]:
# Rows with HEINHOME == 1, counted per state.
has_home_internet = df21['HEINHOME'] == 1
connectivity_per_state = df21.loc[has_home_internet, 'state'].value_counts().to_dict()
conn = connectivity_per_state  # short alias, same dict object

# Share of each state's rows that have HEINHOME == 1.
conn_normalized = {
    name: count / state_population[name]
    for name, count in conn.items()
}

# Pair the two rates state by state, following the key order of
# crime_per_state_normalized so both lists are aligned.
crime_rates = [rate for rate in crime_per_state_normalized.values()]
conn_rates = [conn_normalized[name] for name in crime_per_state_normalized]

# check the pearson correlation between crime and connectivity
stats.pearsonr(crime_rates, conn_rates)

PearsonRResult(statistic=0.46858367239060483, pvalue=0.0005243866510034743)