<a href="https://colab.research.google.com/github/sunc-dev/Tensorflow-ml-learning/blob/master/Prison_Population.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from pandas.plotting import scatter_matrix


# Google Colab libraries
from google.colab import drive

#sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from scipy import stats

In [30]:
# Attach the user's Google Drive to the Colab VM filesystem so that the
# dataset files become reachable under /content/drive.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def load_data(path, options):
  """Read a CSV file into a pandas DataFrame.

  Args:
    path: Location of the CSV file (anything accepted by ``pd.read_csv``).
    options: Truthy to raise a parser error when a malformed row (a row with
      too many fields) is encountered; falsy to skip such rows and emit a
      warning instead.

  Returns:
    The parsed ``pandas.DataFrame``.
  """
  # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, so
  # passing it now raises TypeError. `on_bad_lines` is its replacement:
  # 'error' == error_bad_lines=True, 'warn' == error_bad_lines=False
  # (skip the row and warn, which was the old default warn_bad_lines=True).
  on_bad_lines = 'error' if options else 'warn'
  return pd.read_csv(path, on_bad_lines=on_bad_lines)

In [33]:
# Location of the crime dataset on the mounted Google Drive
root_path = '/content/drive/My Drive/Datasets/Crime/'
path = os.path.join(root_path, 'crime_and_incarceration_by_state.csv')

# Echo the resolved path so a wrong mount point is easy to spot
print(path)

# Load the CSV; options=False means malformed rows are not treated as errors
crime = load_data(path=path, options=False)


/content/drive/My Drive/Datasets/Crime/crime_and_incarceration_by_state.csv
      jurisdiction  includes_jails  year  ...  burglary   larceny vehicle_theft
0          FEDERAL           False  2001  ...       NaN       NaN           NaN
1          ALABAMA           False  2001  ...   40642.0  119992.0       12619.0
2           ALASKA            True  2001  ...    3847.0   16695.0        2618.0
3          ARIZONA           False  2001  ...   54821.0  186850.0       52203.0
4         ARKANSAS           False  2001  ...   22196.0   69590.0        7320.0
..             ...             ...   ...  ...       ...       ...           ...
811       VIRGINIA           False  2016  ...   20159.0  127285.0        9848.0
812     WASHINGTON           False  2016  ...   49249.0  173423.0       32322.0
813  WEST VIRGINIA           False  2016  ...    9127.0   25657.0        2498.0
814      WISCONSIN           False  2016  ...   19498.0   82455.0        9958.0
815        WYOMING           False  2016  ..

In [36]:
# Explore dataset

# Look at top 5 elements. Only the last expression of a cell is rendered
# automatically, so an explicit display() is required here; a bare
# `crime.head(5)` in the middle of the cell is evaluated and thrown away.
display(crime.head(5))

# Look at column and data types (info() prints its report directly)
crime.info()

# Summary statistics; as the last expression this is rendered as the
# cell's output
crime.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 17 columns):
jurisdiction              816 non-null object
includes_jails            816 non-null bool
year                      816 non-null int64
prisoner_count            816 non-null int64
crime_reporting_change    799 non-null object
crimes_estimated          799 non-null object
state_population          799 non-null float64
violent_crime_total       799 non-null float64
murder_manslaughter       799 non-null float64
rape_legacy               749 non-null float64
rape_revised              199 non-null float64
robbery                   799 non-null float64
agg_assault               799 non-null float64
property_crime_total      799 non-null float64
burglary                  799 non-null float64
larceny                   799 non-null float64
vehicle_theft             799 non-null float64
dtypes: bool(1), float64(11), int64(2), object(3)
memory usage: 102.9+ KB


Unnamed: 0,year,prisoner_count,state_population,violent_crime_total,murder_manslaughter,rape_legacy,rape_revised,robbery,agg_assault,property_crime_total,burglary,larceny,vehicle_theft
count,816.0,816.0,799.0,799.0,799.0,749.0,199.0,799.0,799.0,799.0,799.0,799.0,799.0
mean,2008.5,28606.033088,6072322.0,26228.459324,313.702128,1788.339119,2406.19598,7696.824781,16256.270338,187800.6,40870.40801,127912.28786,19017.904881
std,4.612599,39556.940699,6725500.0,33866.838388,386.019821,1865.443299,2550.486639,11107.478615,20849.515589,213850.3,47829.948836,139434.591858,30780.350362
min,2001.0,1088.0,493754.0,496.0,5.0,99.0,110.0,43.0,270.0,8806.0,1689.0,6660.0,178.0
25%,2004.75,5698.0,1790026.0,5213.0,48.5,571.0,780.0,1106.0,3529.0,47497.5,9406.0,32765.5,4191.0
50%,2008.5,16915.0,4314113.0,15744.0,179.0,1238.0,1723.0,3933.0,10083.0,132773.0,27698.0,95079.0,10583.0
75%,2012.25,30920.5,6808844.0,31843.0,429.0,2092.0,2680.0,8702.0,20308.0,225957.5,47941.0,155688.0,20872.5
max,2016.0,216915.0,39296480.0,212867.0,2503.0,10198.0,13702.0,71142.0,136087.0,1227194.0,250521.0,731486.0,257543.0


In [48]:
#Preparing the data for visualisations

# Identifier columns that are repeated on every melted row
ids = ['jurisdiction', 'year']

# Measure columns that are unpivoted into (type, count) pairs
dims = [
    'prisoner_count',
    'state_population',
    'violent_crime_total',
    'murder_manslaughter',
    'rape_legacy',
    'rape_revised',
    'robbery',
    'agg_assault',
    'property_crime_total',
    'burglary',
    'larceny',
    'vehicle_theft',
]

# Fixed: this used to print `aggregates`, a name no earlier cell defines, so
# the cell raised NameError on a fresh kernel. The list being inspected is
# `dims`.
print(dims)

# Wide -> long: one row per (jurisdiction, year, type) with its count
crime_melt = crime.melt(
    id_vars=ids,
    value_vars=dims,
    var_name="type",
    value_name="count",
)

print(crime_melt)

['prisoner_count', 'state_population', 'violent_crime_total', 'murder_manslaughter', 'rape_legacy', 'rape_revised', 'robbery', 'agg_assault', 'property_crime_total', 'burglary', 'larceny', 'vehicle_theft']
       jurisdiction  year            type     count
0           FEDERAL  2001  prisoner_count  149852.0
1           ALABAMA  2001  prisoner_count   24741.0
2            ALASKA  2001  prisoner_count    4570.0
3           ARIZONA  2001  prisoner_count   27710.0
4          ARKANSAS  2001  prisoner_count   11489.0
...             ...   ...             ...       ...
9787       VIRGINIA  2016   vehicle_theft    9848.0
9788     WASHINGTON  2016   vehicle_theft   32322.0
9789  WEST VIRGINIA  2016   vehicle_theft    2498.0
9790      WISCONSIN  2016   vehicle_theft    9958.0
9791        WYOMING  2016   vehicle_theft     800.0

[9792 rows x 4 columns]


In [0]:
#Prisoner counts per year