In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

# Read the Boston crime incident export into a DataFrame. low_memory=False makes
# pandas parse the file in one pass instead of inferring dtypes chunk by chunk.
crime_csv_path = 'D:/data/crime_boston.csv'
listings = pd.read_csv(crime_csv_path, low_memory=False)
# Preview the first five rows.
listings.head()

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,TESTTEST2,423,,ASSAULT - AGGRAVATED,External,,0,2019-10-16 00:00:00,2019,10,Wednesday,0,,RIVERVIEW DR,,,"(0.00000000, 0.00000000)"
1,S97333701,3301,,VERBAL DISPUTE,C6,915.0,0,2020-07-18 14:34:00,2020,7,Saturday,14,,MARY BOYLE WAY,42.330813,-71.051368,"(42.33081300, -71.05136800)"
2,S47513131,2647,,THREATS TO DO BODILY HARM,E18,530.0,0,2020-06-24 10:15:00,2020,6,Wednesday,10,,READVILLE ST,42.239491,-71.135954,"(42.23949100, -71.13595400)"
3,I92102201,3301,,VERBAL DISPUTE,E13,583.0,0,2019-12-20 03:08:00,2019,12,Friday,3,,DAY ST,42.325122,-71.107779,"(42.32512200, -71.10777900)"
4,I92097173,3115,,INVESTIGATE PERSON,C11,355.0,0,2019-10-23 00:00:00,2019,10,Wednesday,0,,GIBSON ST,42.297555,-71.059709,"(42.29755500, -71.05970900)"


In [9]:
# Count the missing values in every column.
listings.isna().sum()

INCIDENT_NUMBER             0
OFFENSE_CODE                0
OFFENSE_CODE_GROUP     106613
OFFENSE_DESCRIPTION         0
DISTRICT                 2468
REPORTING_AREA              0
SHOOTING               425093
OCCURRED_ON_DATE            0
YEAR                        0
MONTH                       0
DAY_OF_WEEK                 0
HOUR                        0
UCR_PART               106723
STREET                  21792
Lat                     29826
Long                    29826
Location                    0
dtype: int64

In [10]:
# List the distinct raw values stored in the SHOOTING column.
listings['SHOOTING'].unique()

array(['0', nan, 'Y', '1'], dtype=object)

### SHOOTING contains both 'Y' and '1', which mean the same thing; standardize them to '1' and fill NaN with '0'

In [12]:
# Map the alternative flag 'Y' onto '1', then mark records with no value as '0'.
# The column keeps its string values.
listings['SHOOTING'] = listings['SHOOTING'].replace({'Y': '1'}).fillna('0')
# Show the distinct values that remain after the clean-up.
listings['SHOOTING'].unique()

array(['0', '1'], dtype=object)

### 'Location' duplicates the information already in 'Lat' and 'Long', so let's drop this column.

In [21]:
# Drop the 'Location' column (the notebook treats it as a duplicate of
# 'Lat'/'Long').
# errors='ignore' keeps this cell re-runnable: `listings` is rebound to the
# result, so a second execution would otherwise raise KeyError because the
# column is already gone.
listings = listings.drop(columns='Location', errors='ignore')
listings.head()

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long
0,TESTTEST2,423,,ASSAULT - AGGRAVATED,External,,0,2019-10-16 00:00:00,2019,10,Wednesday,0,,RIVERVIEW DR,,
1,S97333701,3301,,VERBAL DISPUTE,C6,915.0,0,2020-07-18 14:34:00,2020,7,Saturday,14,,MARY BOYLE WAY,42.330813,-71.051368
2,S47513131,2647,,THREATS TO DO BODILY HARM,E18,530.0,0,2020-06-24 10:15:00,2020,6,Wednesday,10,,READVILLE ST,42.239491,-71.135954
3,I92102201,3301,,VERBAL DISPUTE,E13,583.0,0,2019-12-20 03:08:00,2019,12,Friday,3,,DAY ST,42.325122,-71.107779
4,I92097173,3115,,INVESTIGATE PERSON,C11,355.0,0,2019-10-23 00:00:00,2019,10,Wednesday,0,,GIBSON ST,42.297555,-71.059709


### Check the categorical columns for misspellings

In [24]:
# Distinct day names; any misspelled variant would show up as an extra entry.
listings['DAY_OF_WEEK'].unique()

array(['Wednesday', 'Saturday', 'Friday', 'Tuesday', 'Thursday', 'Sunday',
       'Monday'], dtype=object)

In [25]:
# Distinct offense groups; scan the result for near-duplicate spellings.
listings['OFFENSE_CODE_GROUP'].unique()

array([nan, 'Auto Theft', 'Investigate Property', 'Investigate Person',
       'Vandalism', 'Verbal Disputes', 'Motor Vehicle Accident Response',
       'Aggravated Assault', 'Residential Burglary', 'Larceny',
       'Firearm Violations', 'Medical Assistance', 'Simple Assault',
       'Missing Person Reported', 'Robbery', 'Property Lost',
       'Violations', 'Warrant Arrests', 'Firearm Discovery', 'Other',
       'Ballistics', 'Towed', 'Drug Violation', 'Fire Related Reports',
       'Fraud', 'Disorderly Conduct', 'Larceny From Motor Vehicle',
       'Police Service Incidents', 'Missing Person Located', 'Harassment',
       'Property Found', 'Liquor Violation', 'Property Related Damage',
       'Confidence Games', 'Commercial Burglary',
       'Recovered Stolen Property', 'Homicide', 'Other Burglary',
       'Assembly or Gathering Violations', 'Counterfeiting',
       'Prisoner Related Incidents', 'License Plate Related Incidents',
       'Restraining Order Violations', 'Search Warran

In [None]:
# NOTE(review): 'neighbourhood_group' and 'price' are not among the columns of
# the crime_boston frame shown by listings.head() earlier in this notebook, and
# this cell has no execution count. It appears to target an Airbnb listings
# dataset -- confirm which DataFrame `listings` is meant to be here.

# Number of rows in each neighbourhood group.
sn.countplot(x='neighbourhood_group', data=listings)
plt.show()

# Price per neighbourhood group, aggregated by seaborn's barplot estimator and
# drawn with its confidence-interval bars.
sn.barplot(x='neighbourhood_group', y='price', data=listings)
plt.show()

# Now, let's say we want to display the average price of Airbnb listings
# in each neighborhood group of NY from the listings DataFrame
# without the black lines (confidence intervals) at the middle of each bar.
# The documented way to skip the interval is ci=None; False is not a documented
# value for `ci`. (seaborn >= 0.12 deprecates `ci` in favour of errorbar=None.)
sn.barplot(x='neighbourhood_group', y='price', data=listings, ci=None)
plt.show()

# Histogram of the quantitative 'price' column, drawn twice: first with
# matplotlib's default binning (bins=None), then with explicit bin edges every
# 40 units starting at 0 (np.arange(0, 1100, 40)) for a more readable picture.
price_values = listings['price']
for bin_edges in (None, np.arange(0, 1100, 40)):
    plt.hist(price_values, bins=bin_edges)
    plt.xlabel('price(in US dollars)')
    plt.show()

def plot_price_vs_reviews(data, max_price=None, point_size=None):
    """Draw a scatter plot of 'number_of_reviews' against 'price'.

    Parameters
    ----------
    data : DataFrame
        Must provide the 'price' and 'number_of_reviews' columns.
    max_price : number, optional
        When given, the x-axis is limited to the range 0..max_price.
    point_size : number, optional
        Passed to ``plt.scatter`` as ``s``; None keeps matplotlib's default
        marker size.
    """
    plt.scatter(x=data['price'], y=data['number_of_reviews'], s=point_size)
    plt.xlabel('price')
    plt.ylabel('number of reviews')
    if max_price is not None:
        plt.xlim(0, max_price)
    plt.show()


# NOTE(review): 'price' and 'number_of_reviews' are not among the columns of
# the crime_boston frame shown by listings.head() earlier in this notebook --
# confirm which DataFrame `listings` is meant to be here.

# Scatter plot of the number of reviews against the price.
plot_price_vs_reviews(listings)

# Let's say I want to restrict the x-axis, such that the scatter plot only goes
# up to a price of 1100.
plot_price_vs_reviews(listings, max_price=1100)

# Let's say I want to decrease the size of the points on the scatter plot.
# To do this, the marker-size parameter `s` is set to 5.
plot_price_vs_reviews(listings, max_price=1100, point_size=5)