## 2. Data Wrangling

### This script contains the following: 
1. Importing libraries and data
2. Renaming columns
3. Data consistency check
4. Data type check
5. Exporting data to pickle

### 1. Importing libraries and data

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
path = r'C:\Users\Neena Tilton\Dropbox\Projects\MinWage_Crime'

In [3]:
# Import dataframe of Minimum Wage

df_mw = pd.read_pickle(os.path.join(path, '02_Data', 'PreparedData', 'MinWage_clean.pkl'))

In [4]:
# Import dataframe of Crimes

df_crime = pd.read_pickle(os.path.join(path, '02_Data', 'PreparedData', 'crimes_clean.pkl'))

In [5]:
df_mw.head()

Unnamed: 0,Year,State,State.Minimum.Wage,State.Minimum.Wage.2020.Dollars,Federal.Minimum.Wage,Federal.Minimum.Wage.2020.Dollars,Effective.Minimum.Wage,Effective.Minimum.Wage.2020.Dollars
0,1968,Alabama,0.0,0.0,1.15,8.55,1.15,8.55
1,1968,Alaska,2.1,15.61,1.15,8.55,2.1,15.61
2,1968,Arizona,0.468,3.48,1.15,8.55,1.15,8.55
3,1968,Arkansas,0.15625,1.16,1.15,8.55,1.15,8.55
4,1968,California,1.65,12.26,1.15,8.55,1.65,12.26


In [6]:
df_crime.head()

Unnamed: 0,jurisdiction,year,prisoner_count,state_population,violent_crime_total,murder_manslaughter,robbery,burglary
0,FEDERAL,2001,149852,,,,,
1,ALABAMA,2001,24741,4468912.0,19582.0,379.0,5584.0,40642.0
2,ALASKA,2001,4570,633630.0,3735.0,39.0,514.0,3847.0
3,ARIZONA,2001,27710,5306966.0,28675.0,400.0,8868.0,54821.0
4,ARKANSAS,2001,11489,2694698.0,12190.0,148.0,2181.0,22196.0


In [7]:
df_mw.columns

Index(['Year', 'State', 'State.Minimum.Wage',
       'State.Minimum.Wage.2020.Dollars', 'Federal.Minimum.Wage',
       'Federal.Minimum.Wage.2020.Dollars', 'Effective.Minimum.Wage',
       'Effective.Minimum.Wage.2020.Dollars'],
      dtype='object')

In [8]:
df_crime.columns

Index(['jurisdiction', 'year', 'prisoner_count', 'state_population',
       'violent_crime_total', 'murder_manslaughter', 'robbery', 'burglary'],
      dtype='object')

In [9]:
# Create a dictionary of old name to new name in mw_dict.

mw_dict = {'State.Minimum.Wage':'state_mw',
       'State.Minimum.Wage.2020.Dollars':'state_mw_2020',
        'Federal.Minimum.Wage':'fed_mw',
       'Federal.Minimum.Wage.2020.Dollars': 'fed_mw_2020',
        'Effective.Minimum.Wage': 'effective_mw',
       'Effective.Minimum.Wage.2020.Dollars':'effective_mw_2020'}

In [10]:
# Rename columns for Min Wage df using mw_dict.

df_mw.rename(columns = mw_dict, inplace = True)

In [11]:
df_mw.head()

Unnamed: 0,Year,State,state_mw,state_mw_2020,fed_mw,fed_mw_2020,effective_mw,effective_mw_2020
0,1968,Alabama,0.0,0.0,1.15,8.55,1.15,8.55
1,1968,Alaska,2.1,15.61,1.15,8.55,2.1,15.61
2,1968,Arizona,0.468,3.48,1.15,8.55,1.15,8.55
3,1968,Arkansas,0.15625,1.16,1.15,8.55,1.15,8.55
4,1968,California,1.65,12.26,1.15,8.55,1.65,12.26


In [12]:
# Create a dictionary of old name to new name in crime_dict.

crime_dict = {'jurisdiction':'state','violent_crime_total':'violen_crime', 
              'murder_manslaughter':'murder'}

In [13]:
df_crime.rename(columns = crime_dict, inplace = True)

In [14]:
df_crime.head()

Unnamed: 0,state,year,prisoner_count,state_population,violen_crime,murder,robbery,burglary
0,FEDERAL,2001,149852,,,,,
1,ALABAMA,2001,24741,4468912.0,19582.0,379.0,5584.0,40642.0
2,ALASKA,2001,4570,633630.0,3735.0,39.0,514.0,3847.0
3,ARIZONA,2001,27710,5306966.0,28675.0,400.0,8868.0,54821.0
4,ARKANSAS,2001,11489,2694698.0,12190.0,148.0,2181.0,22196.0


### 3. Data consistency check

In [15]:
df_mw['State'].value_counts(dropna = False)

Alabama                 53
Puerto Rico             53
Nevada                  53
New Hampshire           53
New Jersey              53
New Mexico              53
New York                53
North Carolina          53
North Dakota            53
Ohio                    53
Oklahoma                53
Oregon                  53
Pennsylvania            53
Rhode Island            53
Alaska                  53
South Carolina          53
South Dakota            53
Tennessee               53
Texas                   53
U.S. Virgin Islands     53
Utah                    53
Vermont                 53
Virginia                53
Washington              53
West Virginia           53
Wisconsin               53
Nebraska                53
Montana                 53
Missouri                53
Mississippi             53
Arizona                 53
Arkansas                53
California              53
Colorado                53
Connecticut             53
Delaware                53
District of Columbia    53
F

In [16]:
df_mw['Year'].value_counts(dropna = False)

1968    54
1995    54
1997    54
1998    54
1999    54
2000    54
2001    54
2002    54
2003    54
2004    54
2005    54
2006    54
2007    54
2008    54
2009    54
2010    54
2011    54
2012    54
2013    54
2014    54
2015    54
2016    54
2017    54
2018    54
2019    54
1996    54
1994    54
1969    54
1993    54
1970    54
1971    54
1972    54
1973    54
1974    54
1975    54
1976    54
1977    54
1978    54
1979    54
1980    54
1981    54
1982    54
1983    54
1984    54
1985    54
1986    54
1987    54
1988    54
1989    54
1990    54
1991    54
1992    54
2020    54
Name: Year, dtype: int64

In [17]:
df_crime['state'].value_counts(dropna = False)

FEDERAL           16
PENNSYLVANIA      16
NEVADA            16
NEW HAMPSHIRE     16
NEW JERSEY        16
NEW MEXICO        16
NEW YORK          16
NORTH CAROLINA    16
NORTH DAKOTA      16
OHIO              16
OKLAHOMA          16
OREGON            16
RHODE ISLAND      16
MONTANA           16
SOUTH CAROLINA    16
SOUTH DAKOTA      16
TENNESSEE         16
TEXAS             16
UTAH              16
VERMONT           16
VIRGINIA          16
WASHINGTON        16
WEST VIRGINIA     16
WISCONSIN         16
NEBRASKA          16
MISSOURI          16
ALABAMA           16
IDAHO             16
ALASKA            16
ARIZONA           16
ARKANSAS          16
CALIFORNIA        16
COLORADO          16
CONNECTICUT       16
DELAWARE          16
FLORIDA           16
GEORGIA           16
HAWAII            16
ILLINOIS          16
MISSISSIPPI       16
INDIANA           16
IOWA              16
KANSAS            16
KENTUCKY          16
LOUISIANA         16
MAINE             16
MARYLAND          16
MASSACHUSETTS

In [18]:
df_crime['year'].value_counts(dropna = False)

2001    51
2002    51
2003    51
2004    51
2005    51
2006    51
2007    51
2008    51
2009    51
2010    51
2011    51
2012    51
2013    51
2014    51
2015    51
2016    51
Name: year, dtype: int64

Nothing out of ordinary as for data formatting for Year and State for both dataframes. 

### 4. Data type check

Here, any data type that take up too much memory will be updated to take up less space. 

In [22]:
df_mw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2862 entries, 0 to 2861
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               2862 non-null   int64  
 1   State              2862 non-null   object 
 2   state_mw           2862 non-null   float64
 3   state_mw_2020      2862 non-null   float64
 4   fed_mw             2862 non-null   float64
 5   fed_mw_2020        2862 non-null   float64
 6   effective_mw       2862 non-null   float64
 7   effective_mw_2020  2862 non-null   float64
dtypes: float64(6), int64(1), object(1)
memory usage: 179.0+ KB


In [23]:
df_mw.columns

Index(['Year', 'State', 'state_mw', 'state_mw_2020', 'fed_mw', 'fed_mw_2020',
       'effective_mw', 'effective_mw_2020'],
      dtype='object')

In [25]:
# Change data type of Min Wage dataframe.

df_mw = df_mw.astype({'Year': 'int32', 
                'State': 'category',
                'state_mw': 'float32', 
                'state_mw_2020': 'float32', 
                'fed_mw': 'float32', 
                'fed_mw_2020': 'float32',
                'effective_mw': 'float32', 
                'effective_mw_2020': 'float32'})

In [27]:
df_mw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2862 entries, 0 to 2861
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Year               2862 non-null   int32   
 1   State              2862 non-null   category
 2   state_mw           2862 non-null   float32 
 3   state_mw_2020      2862 non-null   float32 
 4   fed_mw             2862 non-null   float32 
 5   fed_mw_2020        2862 non-null   float32 
 6   effective_mw       2862 non-null   float32 
 7   effective_mw_2020  2862 non-null   float32 
dtypes: category(1), float32(6), int32(1)
memory usage: 83.7 KB


Decreased memory usage from 179.0+ KB to 83.7 KB.

In [28]:
df_crime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   state             816 non-null    object 
 1   year              816 non-null    int64  
 2   prisoner_count    816 non-null    int64  
 3   state_population  800 non-null    float64
 4   violen_crime      800 non-null    float64
 5   murder            800 non-null    float64
 6   robbery           800 non-null    float64
 7   burglary          800 non-null    float64
dtypes: float64(5), int64(2), object(1)
memory usage: 51.1+ KB


In [29]:
df_crime.columns

Index(['state', 'year', 'prisoner_count', 'state_population', 'violen_crime',
       'murder', 'robbery', 'burglary'],
      dtype='object')

In [30]:
# Change data types of Crimes df.

df_crime = df_crime.astype({'state':'category',
                            'year':'int32', 
                            'prisoner_count':'float32', 
                            'state_population':'float32', 
                            'violen_crime':'float32',
                            'murder':'float32',  
                            'robbery':'float32', 
                            'burglary':'float32'})

In [31]:
df_crime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   state             816 non-null    category
 1   year              816 non-null    int32   
 2   prisoner_count    816 non-null    float32 
 3   state_population  800 non-null    float32 
 4   violen_crime      800 non-null    float32 
 5   murder            800 non-null    float32 
 6   robbery           800 non-null    float32 
 7   burglary          800 non-null    float32 
dtypes: category(1), float32(6), int32(1)
memory usage: 25.7 KB


Decreased memory usage from 51.1 KB to 25.7 KB.

Found typo in column name, change 'violen_crime' to 'violent_crime':

In [32]:
df_crime.rename(columns = {'violen_crime': 'violent_crime'}, inplace = True)

In [33]:
df_crime.columns

Index(['state', 'year', 'prisoner_count', 'state_population', 'violent_crime',
       'murder', 'robbery', 'burglary'],
      dtype='object')

### 5. Export dataframes to pickle

In [34]:
df_mw.to_pickle(os.path.join(path, '02_Data', 'PreparedData', 'MinWage_wrangled.pkl'))

In [35]:
df_crime.to_pickle(os.path.join(path, '02_Data', 'PreparedData', 'crimes_wrangled.pkl'))