# Initial Preprocessing of input data

Some of the dataset was available as individual .csv files.  1 per state.  Here we concatenate those files and save 1 file that can be loaded into GitHub

In [1]:
# Import dependencies
import pandas as pd
import numpy as np

## Load a file and see how it looks

In [2]:
# Test loading and processing 1 file
path = "Crime/"
filename = "Alabama_violent_crime_rate_1986-2019.csv"
path_filename = path + filename

state_df = pd.read_csv(path_filename, delimiter=",")
i = state_df[state_df['series']=='United States'].index
state_df.drop(i,inplace=True, axis=0)
state_df

Unnamed: 0,series,1986,1987,1988,1989,1990,1991,1992,1993,1994,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
1,Alabama,558.0,559.2,558.6,590.8,708.6,844.2,871.7,780.4,683.7,...,383.7,419.8,450.3,431.0,427.7,473.0,532.4,522.4,523.1,510.8


In [3]:
years = state_df.columns[1:].tolist()

In [4]:
state_df = pd.melt(state_df, id_vars = ["series"], value_vars = years, ignore_index= True)

In [5]:
state_df.columns = ['state','year','crime_rate']
state_df

Unnamed: 0,state,year,crime_rate
0,Alabama,1986,558.0
1,Alabama,1987,559.2
2,Alabama,1988,558.6
3,Alabama,1989,590.8
4,Alabama,1990,708.6
5,Alabama,1991,844.2
6,Alabama,1992,871.7
7,Alabama,1993,780.4
8,Alabama,1994,683.7
9,Alabama,1995,632.4


## Now get 1 file loaded and in the shape needed

In [6]:
# Test loading and processing 1 file
path = "Crime/"
filename = "Alabama_violent_crime_rate_1986-2019.csv"
path_filename = path + filename

state_df = pd.read_csv(path_filename, delimiter=",")
i = state_df[state_df['series']=='United States'].index
state_df.drop(i,inplace=True, axis=0)
state_df
years = state_df.columns[1:].tolist()
state_df = pd.melt(state_df, id_vars = ["series"], value_vars = years, ignore_index= True)
state_df.columns = ['state','year','crime_rate']
state_df

Unnamed: 0,state,year,crime_rate
0,Alabama,1986,558.0
1,Alabama,1987,559.2
2,Alabama,1988,558.6
3,Alabama,1989,590.8
4,Alabama,1990,708.6
5,Alabama,1991,844.2
6,Alabama,1992,871.7
7,Alabama,1993,780.4
8,Alabama,1994,683.7
9,Alabama,1995,632.4


## If happy make the above code into a function

In [7]:
# Take a path and filename, format to desired and return a DataFrame
def process_state(path, filename):
    path_filename = path + filename

    state_df = pd.read_csv(path_filename, delimiter=",")
    i = state_df[state_df['series']=='United States'].index
    state_df.drop(i,inplace=True, axis=0)
    
    years = state_df.columns[1:].tolist()
    state_df = pd.melt(state_df, id_vars = ["series"], value_vars = years, ignore_index= True)
    state_df.columns = ['state','year','crime_rate']
   
    return state_df

## Iterate through all files in a directory and concatenate

In [8]:
import os
os.listdir("Crime/")

['._Mississippi_violent_crime_rate_1986-2019.csv',
 'Maryland_violent_crime_rate_1986-2019.csv',
 '._Maryland_violent_crime_rate_1986-2019.csv',
 'Nebraska_violent_crime_rate_1986-2019.csv',
 '._Nebraska_violent_crime_rate_1986-2019.csv',
 '._Wyoming_violent_crime_rate_1986-2019.csv',
 'Mississippi_violent_crime_rate_1986-2019.csv',
 'Massachusetts_violent_crime_rate_1986-2019.csv',
 '._Massachusetts_violent_crime_rate_1986-2019.csv',
 'Oklahoma_violent_crime_rate_1986-2019.csv',
 '._Oklahoma_violent_crime_rate_1986-2019.csv',
 'Nevada_violent_crime_rate_1986-2019.csv',
 '._Nevada_violent_crime_rate_1986-2019.csv',
 '._New Jersey_violent_crime_rate_1986-2019.csv',
 'New Mexico_violent_crime_rate_1986-2019.csv',
 'Iowa_violent_crime_rate_1986-2019.csv',
 '._Iowa_violent_crime_rate_1986-2019.csv',
 'Kansas_violent_crime_rate_1986-2019.csv',
 '._Kansas_violent_crime_rate_1986-2019.csv',
 '.DS_Store',
 '._.DS_Store',
 'Oregon_violent_crime_rate_1986-2019.csv',
 '._Oregon_violent_crime_rate

In [9]:
import os
import sys

crime_df = pd.DataFrame()

path = "Crime/"

for filename in os.listdir(path):
    if filename[0] != ".":
    # print('Process state: '+ filename)
        state_df = process_state(path, filename)
        crime_df = pd.concat([crime_df, state_df], axis=0, ignore_index=True)

crime_df.head(20)

Unnamed: 0,state,year,crime_rate
0,Maryland,1986,833.0
1,Maryland,1987,767.8
2,Maryland,1988,806.8
3,Maryland,1989,855.4
4,Maryland,1990,919.0
5,Maryland,1991,956.2
6,Maryland,1992,1000.1
7,Maryland,1993,997.8
8,Maryland,1994,948.0
9,Maryland,1995,986.9


In [10]:
crime_df.loc[(crime_df["state"] == "Rhode Island")]

Unnamed: 0,state,year,crime_rate
782,Rhode Island,1986,335.5
783,Rhode Island,1987,359.7
784,Rhode Island,1988,396.7
785,Rhode Island,1989,378.0
786,Rhode Island,1990,431.9
787,Rhode Island,1991,462.0
788,Rhode Island,1992,394.5
789,Rhode Island,1993,401.7
790,Rhode Island,1994,375.5
791,Rhode Island,1995,368.0


## Check some statistics and details

In [11]:
print('---  STATISTICS  ---')
print(crime_df.describe())
print('---  TOTAL NAs  ---')
print(crime_df.isna().sum())
print('---  DTYPES  ---')
print(crime_df.dtypes)
print('---  LIST OF STATES  ---')
print(crime_df['state'].value_counts().count())
print(crime_df['state'].value_counts())

---  STATISTICS  ---
        crime_rate
count  1734.000000
mean    459.933391
std     286.924265
min      51.300000
25%     275.825000
50%     406.100000
75%     569.450000
max    2921.800000
---  TOTAL NAs  ---
state         0
year          0
crime_rate    0
dtype: int64
---  DTYPES  ---
state          object
year           object
crime_rate    float64
dtype: object
---  LIST OF STATES  ---
51
New Jersey              34
Missouri                34
Tennessee               34
Utah                    34
Alabama                 34
New Hampshire           34
Washington              34
Wisconsin               34
North Dakota            34
Massachusetts           34
Florida                 34
Alaska                  34
Mississippi             34
Hawaii                  34
Colorado                34
Montana                 34
Delaware                34
Rhode Island            34
Kentucky                34
Kansas                  34
Oregon                  34
North Carolina          34
New York

In [12]:
crime_df = crime_df.drop_duplicates()

In [13]:
with pd.option_context('display.max_rows', None):
        display(crime_df.loc[(crime_df["state"] == "Rhode Island")])

Unnamed: 0,state,year,crime_rate
782,Rhode Island,1986,335.5
783,Rhode Island,1987,359.7
784,Rhode Island,1988,396.7
785,Rhode Island,1989,378.0
786,Rhode Island,1990,431.9
787,Rhode Island,1991,462.0
788,Rhode Island,1992,394.5
789,Rhode Island,1993,401.7
790,Rhode Island,1994,375.5
791,Rhode Island,1995,368.0


In [14]:
print('---  LIST OF STATES  ---')
print(crime_df['state'].value_counts().count())
print(crime_df['state'].value_counts())

---  LIST OF STATES  ---
51
New Jersey              34
Missouri                34
Tennessee               34
Utah                    34
Alabama                 34
New Hampshire           34
Washington              34
Wisconsin               34
North Dakota            34
Massachusetts           34
Florida                 34
Alaska                  34
Mississippi             34
Hawaii                  34
Colorado                34
Montana                 34
Delaware                34
Rhode Island            34
Kentucky                34
Kansas                  34
Oregon                  34
North Carolina          34
New York                34
Georgia                 34
New Mexico              34
Oklahoma                34
Arkansas                34
Virginia                34
Nevada                  34
Texas                   34
Maryland                34
Vermont                 34
Michigan                34
South Carolina          34
West Virginia           34
District of Columbia    34


In [15]:
crime_df = crime_df[["year", "state", "crime_rate"]]
crime_df.head()

Unnamed: 0,year,state,crime_rate
0,1986,Maryland,833.0
1,1987,Maryland,767.8
2,1988,Maryland,806.8
3,1989,Maryland,855.4
4,1990,Maryland,919.0


In [16]:
# Load in crime dataset from 1970-1985
filepath = "crime_by_state_1970-1985.xlsx"
crime_70_85_df = pd.read_excel(filepath)
crime_70_85_df.head()

Unnamed: 0,Year,State,Violent
0,1970,Alabama,295.7
1,1970,Alaska,278.0
2,1970,Arizona,370.3
3,1970,Arkansas,222.3
4,1970,California,474.8


In [17]:
# Format column headers
crime_70_85_df.columns = ['year','state','crime_rate']
crime_70_85_df.head()


Unnamed: 0,year,state,crime_rate
0,1970,Alabama,295.7
1,1970,Alaska,278.0
2,1970,Arizona,370.3
3,1970,Arkansas,222.3
4,1970,California,474.8


In [18]:
print('---  LIST OF STATES  ---')
print(crime_70_85_df['state'].value_counts().count())
print(crime_70_85_df['state'].value_counts())

---  LIST OF STATES  ---
50
New Jersey        16
North Carolina    16
Minnesota         16
Maine             16
Kansas            16
Louisiana         16
South Dakota      16
Idaho             16
Oregon            16
Wyoming           16
Connecticut       16
Kentucky          16
Ohio              16
Arkansas          16
Illinois          16
Pennsylvania      16
Arizona           16
New York          16
Georgia           16
Texas             16
Hawaii            16
Montana           16
Nebraska          16
Utah              16
Iowa              16
Indiana           16
Tennessee         16
Rhode Island      16
New Hampshire     16
Washington        16
South Carolina    16
Nevada            16
Wisconsin         16
North Dakota      16
New Mexico        16
Massachusetts     16
Florida           16
Maryland          16
Vermont           16
Alaska            16
Missouri          16
Michigan          16
Mississippi       16
West Virginia     16
Oklahoma          16
California        16
Colora

In [19]:
#result = pd.concat(frames)
#result = pd.concat([df1, df4], axis=1, join="inner")
#result = df1.append(df2)
#frames = [crime_df, crime_70_85_df]
new_crime_df = crime_70_85_df.append(crime_df)
new_crime_df.head()

Unnamed: 0,year,state,crime_rate
0,1970,Alabama,295.7
1,1970,Alaska,278.0
2,1970,Arizona,370.3
3,1970,Arkansas,222.3
4,1970,California,474.8


In [20]:
new_crime_df['year']= new_crime_df.year.astype(int)

In [21]:
new_crime_df.dtypes

year            int64
state          object
crime_rate    float64
dtype: object

In [22]:
filepath= "National_crime_rate.xlsx"
national_crime_df = pd.read_excel(filepath)
national_crime_df

Unnamed: 0,Year,Violent
0,1970,363.5
1,1971,396.0
2,1972,401.0
3,1973,417.4
4,1974,461.1
5,1975,487.8
6,1976,467.8
7,1977,475.9
8,1978,497.8
9,1979,548.9


In [23]:
national_crime_df.columns = ['year', 'crime_rate']
national_crime_df['state'] = 'National'

In [24]:
national_crime_df = national_crime_df[['year', 'state', 'crime_rate']]
national_crime_df.head()

Unnamed: 0,year,state,crime_rate
0,1970,National,363.5
1,1971,National,396.0
2,1972,National,401.0
3,1973,National,417.4
4,1974,National,461.1


In [25]:
# Append national_crime to new_crime_df
us_crime_df = national_crime_df.append(new_crime_df)

In [26]:
us_crime_df.state.value_counts()

New Jersey              50
Nevada                  50
Maryland                50
Vermont                 50
South Carolina          50
California              50
South Dakota            50
Virginia                50
Tennessee               50
Indiana                 50
Maine                   50
Connecticut             50
Kentucky                50
Illinois                50
Pennsylvania            50
Arizona                 50
Oklahoma                50
Hawaii                  50
Alabama                 50
New Hampshire           50
Wisconsin               50
Florida                 50
Alaska                  50
New York                50
Georgia                 50
Arkansas                50
North Carolina          50
Oregon                  50
Kansas                  50
Texas                   50
Michigan                50
West Virginia           50
Iowa                    50
Minnesota               50
Idaho                   50
Louisiana               50
Wyoming                 50
O

In [27]:
with pd.option_context('display.max_rows', None):
        display(us_crime_df.loc[(us_crime_df["state"] == "National")])

Unnamed: 0,year,state,crime_rate
0,1970,National,363.5
1,1971,National,396.0
2,1972,National,401.0
3,1973,National,417.4
4,1974,National,461.1
5,1975,National,487.8
6,1976,National,467.8
7,1977,National,475.9
8,1978,National,497.8
9,1979,National,548.9


## Store into a csv,  drop the index when saving

In [28]:
us_crime_df.to_csv('crime_rate.csv', index = False, index_label=False)