# UCR Violent Crimes per Capita by State Cleaning
- acquired using the UCR Data Tool (www.ucrdatatool.gov)
- voluntarily reporting jurisdictions submit crime data to the FBI, along with the population inside the jurisdiction
    - this is used to calculate a violent crime rate for the jurisdiction (violent crimes per 100,000 people)
    - results represent the mean of all reporting jurisdictions in the state
- columns are states, rows are years, fields are violent crimes per 100,000 people
- the dataset is already clean; it only needs to be reshaped from wide format (one column per state) to long format (one row per year and state)

### Dependencies

In [13]:
import pandas as pd
import numpy as np

### Read in CSV and create Dataframe

In [4]:
# Load the wide-format table (one row per year, one column per state).
# pd.read_csv already returns a DataFrame, so it is assigned to df directly
# instead of being wrapped in a second pd.DataFrame(...) call.
filepath = 'Input/ucr_violent_crime_rate_by_state.csv'
df = pd.read_csv(filepath)

### View head

In [5]:
# Preview the first rows of the wide table: one row per year, one column per state.
df.head()

Unnamed: 0,Year,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,1990,708.6,524.5,652.4,532.2,1045.2,526.0,553.7,655.2,2458.2,...,162.8,670.4,761.4,283.9,127.2,350.6,501.6,169.3,264.7,301.4
1,1991,844.2,613.9,670.7,593.3,1089.9,559.3,539.7,714.3,2453.3,...,182.2,725.9,840.1,286.8,116.8,373.2,522.6,191.0,277.0,310.2
2,1992,871.7,660.5,670.8,576.5,1119.7,578.8,495.3,621.2,2832.8,...,194.5,746.2,806.3,290.5,109.5,374.9,534.5,211.5,275.7,319.5
3,1993,780.4,760.8,715.0,593.3,1077.8,567.3,456.2,685.9,2921.8,...,208.4,765.8,762.1,301.0,114.2,372.2,514.6,208.4,264.4,286.2
4,1994,683.7,766.3,703.1,595.1,1013.0,509.6,455.5,644.3,2662.6,...,227.6,747.9,706.5,304.5,96.9,357.7,511.3,215.8,270.5,272.5


### Reorganize DataFrame

In [23]:
# Number of columns in the wide table (the year column plus one per state).
df.shape[1]

52

In [27]:
# Indexing df.columns by position returns the column label at that position.
# Position 1 is 'Alabama' (see the output below); position 0 is presumably
# the Year column.
df.columns[1]

'Alabama'

In [57]:
# Target schema for the long-format table; start from a frame with no rows.
columns = ['year', 'state', 'rate']
newdf = pd.DataFrame(data=None, columns=columns)

In [58]:
# Scratch check: assigning through .loc with a row label that does not exist
# yet enlarges the empty frame instead of raising (see the output below).
# Row 0 created here is overwritten by the reshaping step that follows.
newdf.loc[0, 'year'] = 1990
newdf.head()

Unnamed: 0,year,state,rate
0,1990,,


In [59]:
# Reshape wide -> long in one vectorised step instead of filling newdf cell by
# cell. The first column is Year; every remaining column is one state.
#
# Why not iterrows(): it returns each row as a single-dtype Series, so the
# integer Year is upcast to float alongside the rates. Reading the Year column
# directly keeps its original dtype. It also avoids the hard-coded per-year
# offset and the repeated .loc enlargement of the frame.
states = df.columns[1:]

# Row-major flattening preserves the original ordering: every state for the
# first year, then every state for the next year, and so on.
newdf = pd.DataFrame(
    {
        # each year repeated once per state
        'year': np.repeat(df['Year'].values, len(states)),
        # the full state list repeated once per year
        'state': np.tile(states.values, len(df)),
        # rates read row by row, matching the two columns above
        'rate': df[states].values.ravel(),
    },
    columns=['year', 'state', 'rate'],
)
        

In [64]:
# Sanity check: the long table should hold one row per (state, year) pair.
n_states = df.shape[1] - 1  # every column except Year
n_years = df.shape[0]
newdf.shape[0] == n_states * n_years

True

In [63]:
# Eyeball the result: 120 rows cover more than two full years (51 state rows
# per year), so the transitions between years are visible.
newdf.head(120)

Unnamed: 0,year,state,rate
0,1990,Alabama,708.6
1,1990,Alaska,524.5
2,1990,Arizona,652.4
3,1990,Arkansas,532.2
4,1990,California,1045.2
5,1990,Colorado,526
6,1990,Connecticut,553.7
7,1990,Delaware,655.2
8,1990,District of Columbia,2458.2
9,1990,Florida,1244.3


### Export expanded dataframe


In [65]:
# Persist the long-format table. to_csv keeps its default index=True here, so
# the row index is written as an unnamed leading column.
output_path = 'Output/ucr_violent_crime_rate_by_state_expanded.csv'
newdf.to_csv(output_path)