Research Question:
What set of factors is most important in determining adult violent crime rates across U.S. states from 2013-2018? How could we use these factors to accurately predict future violent crime rates?

sources:
using inplace for the rename function: https://www.geeksforgeeks.org/what-does-inplace-mean-in-pandas/

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

Data Overview: 
Jasmine

Data Cleaning

In [2]:
#loading crime data
crime_df = pd.read_csv('data/crime.csv')
print(crime_df)

     estimated_crimes_1979_2023  Unnamed: 1  Unnamed: 2  Unnamed: 3  \
0                          year  state_abbr  state_name  population   
1                          1979         NaN         NaN   220099000   
2                          1979          AK      Alaska      406000   
3                          1979          AL     Alabama     3769000   
4                          1979          AR    Arkansas     2180000   
...                         ...         ...         ...         ...   
2332                       2023          VA         NaN   8,715,698   
2333                       2023          WA         NaN   7,812,880   
2334                       2023          WV         NaN   1,770,071   
2335                       2023          WI         NaN   5,910,955   
2336                       2023          WY         NaN     584,057   

         Unnamed: 4 Unnamed: 5   Unnamed: 6    Unnamed: 7 Unnamed: 8  \
0     violent_crime   homicide  rape_legacy  rape_revised    robbery   
1  

In [3]:
#see what the names of each column are without any edits to the csv
print(crime_df.columns)

#rename the columns we want with proper names
crime_df.rename(columns={
    'estimated_crimes_1979_2023': 'Year', 
    'Unnamed: 1': 'State_Abbreviation', 
    'Unnamed: 2': 'State_Name', 
    'Unnamed: 3': 'Population',
    'Unnamed: 4': 'Violent_Crime', 
    'Unnamed: 5': 'Homicide',
    'Unnamed: 6': 'Rape', 
    'Unnamed: 7': 'Rape_Revised',
    'Unnamed: 8': 'Robbery', 
    'Unnamed: 9': 'Aggravated_Assault'
},inplace=True)

#drop columns that will not be used during the analysis
crime_df = crime_df.drop(columns=['Rape_Revised','Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12',
       'Unnamed: 13', 'Unnamed: 14'])
print(crime_df)


Index(['estimated_crimes_1979_2023', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3',
       'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8',
       'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12',
       'Unnamed: 13', 'Unnamed: 14'],
      dtype='object')
      Year State_Abbreviation  State_Name  Population  Violent_Crime  \
0     year         state_abbr  state_name  population  violent_crime   
1     1979                NaN         NaN   220099000        1208030   
2     1979                 AK      Alaska      406000           1994   
3     1979                 AL     Alabama     3769000          15578   
4     1979                 AR    Arkansas     2180000           7984   
...    ...                ...         ...         ...            ...   
2332  2023                 VA         NaN   8,715,698         20,589   
2333  2023                 WA         NaN   7,812,880         27,909   
2334  2023                 WV         NaN   1,770,071          4,699  

In [4]:
#dropping the first row (index 0) and reset the indexes. This way, I can turn the year from object to integer.
crime_df = crime_df.drop(index=0)

# Reset the index 
crime_df.reset_index(drop=True, inplace=True)

#making the year into an interger from an object
crime_df['Year']=crime_df['Year'].astype(int)

#shaping the data set so that we are only looking at years 2013 to 2018
crime_filtered_years = crime_df[(crime_df['Year'] >= 2013) & (crime_df['Year'] <= 2018)]

# Drop rows where 'State_Abbreviation' or 'State_Name' is NaN 
crime_filtered_years = crime_filtered_years.dropna(subset=['State_Abbreviation', 'State_Name'])
print(crime_filtered_years)


      Year State_Abbreviation     State_Name Population Violent_Crime  \
1769  2013                 AK         Alaska     737259          4709   
1770  2013                 AL        Alabama    4833996         20834   
1771  2013                 AR       Arkansas    2958765         13705   
1772  2013                 AZ        Arizona    6634997         27576   
1773  2013                 CA     California   38431393        154739   
...    ...                ...            ...        ...           ...   
2073  2018                 VT        Vermont     624358          1155   
2074  2018                 WA     Washington    7523869         23719   
2075  2018                 WI      Wisconsin    5807406         17365   
2076  2018                 WV  West Virginia    1804291          5411   
2077  2018                 WY        Wyoming     577601          1235   

     Homicide  Rape Robbery Aggravated_Assault  
1769       34   657     623               3127  
1770      346  1449    46

In [5]:
#loading poverty data
poverty_df = pd.read_csv('data/poverty.csv')
print(poverty_df)

                                             Unnamed: 0        Unnamed: 1  \
0     Table 19. Number of Poor and Poverty Rate by S...               NaN   
1     (Population in thousands. Population as of Mar...               NaN   
2                                                  2023               NaN   
3                                                 State  Total population   
4                                               Alabama             4,995   
...                                                 ...               ...   
2437                                      West Virginia             1,952   
2438                                          Wisconsin             4,724   
2439                                            Wyoming               468   
2440  Source: U.S. Census Bureau, Current Population...               NaN   
2441  Footnotes are available at <www.census.gov/top...               NaN   

             Unnamed: 2                Unnamed: 3          Unnamed: 4  \
0 

In [6]:
#cleaning the first few rows that are just words 
poverty_df = poverty_df.drop([0, 1])

#cutting off any row that is not between 2013 and 2018
indices_2018 = poverty_df[poverty_df['Unnamed: 0'] == '2018'].index
indices_2013 = poverty_df[poverty_df['Unnamed: 0'] == '2013 (4)'].index
print(indices_2018)
print(indices_2013)

# Get the index of 2013 and 2018
index_2013 = poverty_df[poverty_df['Unnamed: 0'] == '2013 (4)'].index[0]
print(index_2013)

index_2018 = poverty_df[poverty_df['Unnamed: 0'] == '2018'].index[0]
print(index_2018)

#cutting off one indices above this number so the year does not get cut off
poverty_df_filtered = poverty_df[index_2018:index_2013]
print(poverty_df)

Index([267], dtype='int64')
Index([585], dtype='int64')
585
267
                                             Unnamed: 0        Unnamed: 1  \
2                                                  2023               NaN   
3                                                 State  Total population   
4                                               Alabama             4,995   
5                                                Alaska               717   
6                                               Arizona             7,210   
...                                                 ...               ...   
2437                                      West Virginia             1,952   
2438                                          Wisconsin             4,724   
2439                                            Wyoming               468   
2440  Source: U.S. Census Bureau, Current Population...               NaN   
2441  Footnotes are available at <www.census.gov/top...               NaN   

           