# 1.1 - Stanford Open Policing Project dataset

#### > Preparing the data

In [None]:
import pandas as pd
ri = pd.read_csv('police.csv')
ri.head(3)

#### > Locating missing values

In [None]:
ri.isnull()

In [None]:
ri.isnull().sum()

#### > Dropping a column

In [None]:
# Remove the county_name column from ri in place.
ri.drop(columns=['county_name'], inplace=True)

#### > Dropping rows

In [None]:
# Discard, in place, every row that is missing stop_date or stop_time.
ri.dropna(
    subset=['stop_date', 'stop_time'],
    inplace=True,
)

# 1.2 - Using proper data types

#### > Examining the data types

In [None]:
ri.dtypes

#### > Fixing a data type

In [None]:
apple

# Cast the price column to float and store it back on the DataFrame.
apple['price'] = apple['price'].astype('float')

In [None]:
apple.price.dtype

# 1.3 - Creating a DatetimeIndex

#### > Using datetime format

In [None]:
ri.dtypes

In [None]:
apple

In [None]:
apple.date.str.replace('/', '-')

In [None]:
# Join each row's date and time strings with a single space between them.
combined = apple['date'].str.cat(apple['time'], sep=' ')
combined

#### > Converting to datetime format

In [None]:
# Parse the combined date/time strings and store them as a new column.
parsed = pd.to_datetime(combined)
apple['date_and_time'] = parsed
apple

In [None]:
apple.dtypes

#### > Setting the index

In [None]:
apple.set_index('date_and_time', inplace=True)
apple

# 2.1 - Do the genders commit different violations?

#### > Counting unique values

In [None]:
ri.stop_outcome.value_counts()

In [None]:
ri.stop_outcome.value_counts().sum()

#### > Expressing counts as proportions

In [None]:
ri.stop_outcome.value_counts()

In [None]:
ri.stop_outcome.value_counts(normalize=True)

#### > Filtering DataFrame rows

In [None]:
ri.driver_race.value_counts()

In [None]:
# Keep only the rows whose driver_race is 'White', then report the size.
white = ri.loc[ri['driver_race'] == 'White']
white.shape

#### > Comparing stop outcomes for two groups

In [None]:
white.stop_outcome.value_counts(normalize=True)

In [None]:
# Keep only the rows whose driver_race is 'Asian' and show the share of
# each stop outcome within that group.
asian = ri.loc[ri['driver_race'] == 'Asian']
asian['stop_outcome'].value_counts(normalize=True)

# 2.2 - Does gender affect who gets a ticket for speeding?

#### > Filtering by multiple conditions

In [None]:
# Keep only the rows whose driver_gender is 'F', then report the size.
female = ri.loc[ri['driver_gender'] == 'F']
female.shape

In [None]:
# Rows that satisfy BOTH conditions: driver_gender is 'F' and is_arrested is True.
female_and_arrested = ri.loc[
    (ri['driver_gender'] == 'F') & (ri['is_arrested'] == True)
]

In [None]:
female_and_arrested.shape

In [None]:
# Rows that satisfy EITHER condition: driver_gender is 'F' or is_arrested is True.
female_or_arrested = ri.loc[
    (ri['driver_gender'] == 'F') | (ri['is_arrested'] == True)
]

# 2.3 - Does gender affect whose vehicle is searched?

#### > Math with Boolean values

In [None]:
ri.isnull().sum()

In [None]:
import numpy as np

# Mean of a 0/1 sequence: the fraction of entries equal to 1.
np.array([0, 1, 0, 0]).mean()

In [None]:
np.mean([False, True,
False, False])

#### > Taking the mean of a Boolean Series

In [None]:
ri.is_arrested.value_counts(normalize=True)

In [None]:
ri.is_arrested.mean()

In [None]:
ri.is_arrested.dtype

#### > Comparing groups using groupby

In [None]:
ri.district.unique()

In [None]:
ri[ri.district == 'Zone K1'].is_arrested.mean()

In [None]:
ri[ri.district == 'Zone K2'].is_arrested.mean()

In [None]:
ri.groupby('district').is_arrested.mean()

#### > Grouping by multiple categories

In [None]:
ri.groupby(['district', 'driver_gender']).is_arrested.mean()

In [None]:
ri.groupby(['driver_gender', 'district']).is_arrested.mean()

# 2.4 - Does gender affect who is frisked during a search?

In [None]:
ri.search_conducted.value_counts()

In [None]:
ri.search_type.value_counts(dropna=False)

#### > Examining the search types

In [None]:
ri.search_type.value_counts()

#### > Searching for a string

In [None]:
# Flag rows whose search_type text contains 'Inventory'; na=False makes
# rows with a missing search_type evaluate to False.
ri['inventory'] = ri['search_type'].str.contains('Inventory', na=False)

In [None]:
ri.inventory.dtype

In [None]:
ri.inventory.sum()

#### > Calculating the inventory rate

In [None]:
ri.inventory.mean()

In [None]:
# Restrict to rows where search_conducted is True, then take the mean of
# the inventory flag within that subset.
searched = ri.loc[ri['search_conducted'] == True]
searched['inventory'].mean()

# 3.1 - Does time of day affect arrest rate?

#### > Accessing datetime attributes

In [None]:
apple.dtypes

In [None]:
# The .dt accessor needs date_and_time as a regular column. If it is
# currently the index (an earlier cell calls set_index on it in place),
# move it back to a column first so the attribute lookup does not fail.
if apple.index.name == 'date_and_time':
    apple.reset_index(inplace=True)
apple.date_and_time.dt.month

In [None]:
# set_index consumes the column, so calling it again once date_and_time is
# already the index would raise a KeyError. Only promote it when it is
# still a regular column.
if 'date_and_time' in apple.columns:
    apple.set_index('date_and_time', inplace=True)
apple.index

In [None]:
apple.index.month

#### > Calculating the monthly mean price

In [None]:
apple.price.mean()

In [None]:
apple.groupby(apple.index.month).price.mean()

In [None]:
monthly_price = apple.groupby(apple.index.month).price.mean()

#### > Plotting the monthly mean price

In [None]:
import matplotlib.pyplot as plt
monthly_price.plot()

In [None]:
plt.xlabel('Month')
plt.ylabel('Price')
plt.title('Monthly mean stock price for Apple')

In [None]:
plt.show()

# 3.2 - Are drug-related stops on the rise?

#### > Resampling the price

In [None]:
apple.groupby(apple.index.month).price.mean()

In [None]:
apple.price.resample('M').mean()

In [None]:
apple.volume.resample('M').mean()

#### > Concatenating price and volume

In [None]:
# Resample price and volume with the 'M' rule and take the mean of each bin.
monthly_price = apple['price'].resample('M').mean()
monthly_volume = apple['volume'].resample('M').mean()

In [None]:
pd.concat([monthly_price, monthly_volume], axis='columns')

In [None]:
monthly = pd.concat([monthly_price, monthly_volume],axis='columns')

#### > Plotting price and volume

In [None]:
monthly.plot()
plt.show()

In [None]:
monthly.plot(subplots=True)
plt.show()

# 3.3 - What violations are caught in each district?

#### > Computing a frequency table

In [None]:
pd.crosstab(ri.driver_race, ri.driver_gender)

In [None]:
ri[(ri.driver_race == 'Asian') & (ri.driver_gender == 'F')].shape

In [None]:
table = pd.crosstab(ri.driver_race,ri.driver_gender)

#### > Selecting a DataFrame slice

In [None]:
# Narrow the frequency table to the rows labelled 'Asian' through
# 'Hispanic', then display it. Only the last expression of a cell is
# shown, so the slice is assigned first and echoed afterwards.
table = table.loc['Asian':'Hispanic']
table

#### > Creating a line plot

In [None]:
table.plot()
plt.show()

#### > Creating a bar plot

In [None]:
table.plot(kind='bar')
plt.show()

#### > Stacking the bars

In [None]:
table.plot(kind='bar', stacked=True)
plt.show()

# 3.4 - How long might you be stopped for a violation?

#### > Mapping one set of values to another

In [None]:
# Translate the 'up'/'down' labels in the change column into booleans.
mapping = {
    'up': True,
    'down': False,
}
apple['is_up'] = apple['change'].map(mapping)
apple

In [None]:
apple.is_up.mean()

#### > Calculating the search rate

In [None]:
ri.groupby('violation').search_conducted.mean()

In [None]:
# mean must be *called*: without the parentheses search_rate would be a
# bound method rather than a Series, and the .plot() / .sort_values()
# calls in the following cells would fail.
search_rate = ri.groupby('violation').search_conducted.mean()

#### > Creating a bar plot

In [None]:
search_rate.plot(kind='bar')
plt.show()

#### > Ordering the bars

In [None]:
search_rate.sort_values()

In [None]:
search_rate.sort_values().plot(kind='bar')
plt.show()

#### > Rotating the bars

In [None]:
search_rate.sort_values().plot(kind='barh')
plt.show()

# 4.1 - Exploring the weather dataset

In [None]:
weather = pd.read_csv('weather.csv')
weather.head(3)

#### > Examining the wind speed

In [None]:
weather[['AWND', 'WSF2']].head()

In [None]:
weather[['AWND', 'WSF2']].describe()

#### > Creating a box plot

In [None]:
weather[['AWND','WSF2']].plot(kind='box')
plt.show()

#### > Creating a histogram

In [None]:
# Store the row-wise difference WSF2 - AWND as WDIFF and plot its histogram.
weather['WDIFF'] = weather['WSF2'] - weather['AWND']
weather['WDIFF'].plot(kind='hist')
plt.show()

In [None]:
weather.WDIFF.plot(kind='hist', bins=20)
plt.show()

# 4.2 - Categorizing the weather

#### > Selecting a DataFrame slice

In [None]:
weather.shape

In [None]:
weather.columns

In [None]:
temp = weather.loc[:,'TAVG':'TMAX']

#### > DataFrame operations

In [None]:
temp.sum(axis='columns').head()

#### > Mapping one set of values to another

In [None]:
ri.stop_duration.unique()

In [None]:
# Translate each stop_duration label into a short/medium/long label.
mapping = {
    '0-15 Min': 'short',
    '16-30 Min': 'medium',
    '30+ Min': 'long',
}
ri['stop_length'] = ri['stop_duration'].map(mapping)
ri['stop_length'].dtype

#### > Changing data type from object to category

In [None]:
ri.stop_length.unique()

In [None]:
ri.stop_length.memory_usage(deep=True)

In [None]:
cats = ['short', 'medium', 'long']
# astype() does not accept `ordered=` / `categories=` keywords in current
# pandas; the ordered category set is described by a CategoricalDtype and
# passed as the target dtype instead.
stop_length_dtype = pd.api.types.CategoricalDtype(categories=cats, ordered=True)
ri['stop_length'] = ri.stop_length.astype(stop_length_dtype)
ri.stop_length.memory_usage(deep=True)

In [None]:
ri.stop_length.head()

#### > Using ordered categories

In [None]:
ri[ri.stop_length > 'short'].shape

In [None]:
ri.groupby('stop_length').is_arrested.mean()

# 4.3 - Merging datasets

In [None]:
apple

In [None]:
apple.reset_index(inplace=True)
apple

#### > Preparing the second DataFrame

In [None]:
high_low

In [None]:
high = high_low[['DATE', 'HIGH']]
high

#### > Merging the DataFrames

In [None]:
# Left-join high onto apple, matching apple's date against high's DATE,
# so every row of apple is kept.
apple_high = apple.merge(
    high,
    how='left',
    left_on='date',
    right_on='DATE',
)

In [None]:
apple_high

In [None]:
apple

In [None]:
high

In [None]:
apple_high.set_index('date_and_time', inplace=True)
apple_high

# 4.4 - Does weather affect the arrest rate?

In [None]:
ri.search_conducted.mean()

In [None]:
ri.groupby('driver_gender').search_conducted.mean()

In [None]:
ri.groupby(['violation','driver_gender']).search_conducted.mean()

In [None]:
# Mean of search_conducted for every (violation, driver_gender) pair.
search_rate = (
    ri.groupby(['violation', 'driver_gender'])['search_conducted']
    .mean()
)
search_rate

In [None]:
# Only the last expression of a cell is displayed, so both types are
# returned together instead of silently discarding the first one.
type(search_rate), type(search_rate.index)

In [None]:
search_rate.loc['Equipment']

In [None]:
search_rate.loc['Equipment', 'M']

#### > Converting a multi-indexed Series to a DataFrame

In [None]:
search_rate.unstack()

In [None]:
type(search_rate.unstack())

In [None]:
# Tabulate the mean of search_conducted with violations as rows and
# driver genders as columns ('mean' is pivot_table's default aggregation).
ri.pivot_table(
    values='search_conducted',
    index='violation',
    columns='driver_gender',
    aggfunc='mean',
)