# Data Types

In [1]:
import numpy as np
import pandas as pd

## 1. List
extend, index, pop

In [2]:
baby_names = ['Ximena', 'Aliza', 'Ayden', 'Calvin']

baby_names.extend(['Rowen','Sandeep'])
print(baby_names)

# Find the position & pop
position = baby_names.index('Aliza')
baby_names.pop(position)
print(baby_names)

['Ximena', 'Aliza', 'Ayden', 'Calvin', 'Rowen', 'Sandeep']
['Ximena', 'Ayden', 'Calvin', 'Rowen', 'Sandeep']


In [3]:
sorted(baby_names)

['Ayden', 'Calvin', 'Rowen', 'Sandeep', 'Ximena']

## 2. Tuple

In [4]:
girl_names = ['Olivia','RACHEL','HAILEY','Brielle','Samantha']
boy_names = ['Ryan','Joshua','ETHAN','KEVIN','Samuel']

pairs = zip(girl_names, boy_names)

print(pairs)

<zip object at 0x0000026A60D54F48>


In [5]:
for idx, pair in enumerate(pairs):
    # Unpack pair
    girl_name, boy_name = pair
    print('{}: {} and {}'.format(idx, girl_name, boy_name))

0: Olivia and Ryan
1: RACHEL and Joshua
2: HAILEY and ETHAN
3: Brielle and KEVIN
4: Samantha and Samuel


## 3. Set

In [6]:
df_name = pd.read_csv('data/baby_names.csv')

df_name_2011 = df_name[(df_name.BRITH_YEAR == 2011) & (df_name.GENDER == 'FEMALE')]
df_name_2012 = df_name[(df_name.BRITH_YEAR == 2012) & (df_name.GENDER == 'FEMALE')]

print(df_name_2011.head())

print(df_name_2011.shape)
print(df_name_2012.shape)

   BRITH_YEAR  GENDER  ETHNICTY       NAME  COUNT  RANK
0        2011  FEMALE  HISPANIC  GERALDINE     13    75
1        2011  FEMALE  HISPANIC        GIA     21    67
2        2011  FEMALE  HISPANIC     GIANNA     49    42
3        2011  FEMALE  HISPANIC    GISELLE     38    51
4        2011  FEMALE  HISPANIC      GRACE     36    53
(4016, 6)
(1023, 6)


In [7]:
baby_names_2011 = set()
for row in np.array(df_name_2011):
    baby_names_2011.add((row[3], row[5]))   # name, rank
    
baby_names_2012 = set()
for row in np.array(df_name_2012):
    baby_names_2012.add((row[3], row[5]))
    
all_names = baby_names_2011.union(baby_names_2012)

diff_names = baby_names_2011.difference(baby_names_2012)

overlap_names = baby_names_2011.intersection(baby_names_2012)
print(overlap_names)

{('KIRA', 80), ('EDEN', 67), ('GABRIELLE', 76), ('SOPHIA', 4), ('STELLA', 32), ('AVA', 8), ('HARMONY', 35), ('AUBREY', 63), ('HAILEY', 57), ('SAVANNA', 78), ('AVIGAIL', 80), ('ANGELINA', 44), ('JAZMINE', 74), ('ISABELLA', 1), ('ABIGAIL', 19), ('MARIAMA', 33), ('ALEXA', 40), ('EMMA', 5), ('ALISSON', 73), ('ANGIE', 69), ('AUTUMN', 29), ('IVY', 76), ('OLIVIA', 4), ('IRIS', 78), ('MARILYN', 78), ('DESTINY', 20), ('ISABELLA', 6), ('MADISON', 1), ('JULIA', 45), ('ELIANA', 55), ('YAEL', 70), ('SLOANE', 74), ('MELANIE', 12), ('LILY', 33), ('AMY', 80), ('ALONDRA', 72), ('ROSA', 78), ('LAILA', 59), ('STEPHANIE', 41), ('ARIANA', 64), ('JULIETTE', 68), ('LONDON', 2), ('AMANDA', 76), ('ZOE', 23), ('CHARLOTTE', 77), ('CHLOE', 4), ('KATIE', 76), ('KATE', 38), ('JULIET', 55), ('KAYLEE', 28), ('MORGAN', 71), ('VICTORIA', 19), ('ANAYA', 43), ('VIVIANA', 77), ('ALESSIA', 77), ('AMIRA', 80), ('ARIELLA', 65), ('ALEXANDRA', 43), ('MADELYN', 72)}


## 4. Dictionary

In [8]:
names_2011 = {}
names_2012 = {}

# Loop over the girl names
for name, rank in baby_names_2011:
    # Add each name to the names dictionary using rank as the key
    names_2011[rank] = name

for name, rank in baby_names_2012:
    names_2012[rank] = name

# Sort the names list by rank in descending order and slice the first 10 items
for rank in sorted(names_2012, reverse=True)[:10]:
    print(rank, names_2012[rank])

83 MARYAM
82 ARIEL
81 ATHENA
80 LEORA
79 ADA
78 ELISA
77 ALESSIA
76 NOEMI
75 ALANI
74 ANASTASIA


In [9]:
# Safely finding by key

print(names_2012.get(1))

print(type(names_2012.get(100)))  # None
print(names_2012.get(100, 'Not Found'))

CHLOE
<class 'NoneType'>
Not Found


In [10]:
# nested dictionary
girl_names = {}

girl_names[2011] = names_2011
girl_names[2012] = names_2012
girl_names[2013] = {}

print(girl_names.keys())
print(girl_names[2011].keys())

dict_keys([2011, 2012, 2013])
dict_keys([76, 52, 4, 10, 11, 67, 78, 72, 77, 36, 37, 71, 14, 58, 57, 61, 68, 35, 74, 79, 41, 13, 17, 81, 34, 45, 56, 62, 59, 46, 29, 39, 43, 65, 49, 75, 2, 42, 80, 38, 28, 63, 66, 30, 60, 40, 73, 53, 3, 18, 5, 16, 32, 50, 25, 7, 21, 69, 1, 12, 27, 31, 26, 44, 24, 23, 6, 19, 33, 64, 15, 9, 55, 20, 22, 48, 8, 54, 70, 47, 51])


In [11]:
girl_names[2013].update([(1, 'Kate'), (2, 'Elsa'), (3, 'Jessica')])
print(girl_names[2013])

{1: 'Kate', 2: 'Elsa', 3: 'Jessica'}


In [12]:
for year in girl_names:
    for rank in sorted(girl_names[year], reverse=True)[:1]:
        # Check that you have a rank
        if not rank:
            print(year, 'No Data Available')
            
        # Safely print the year and the least popular name or 'Not Available'
        print(year, girl_names[year].get(rank,'Not Available'))

2011 ANGELICA
2012 MARYAM
2013 Jessica


In [13]:
# Working with dictionaries more pythonically
for rank, name in girl_names[2013].items():
    print(rank, name)

1 Kate
2 Elsa
3 Jessica


In [14]:
# Checking dictionaries for data

if 2011 in girl_names:
    print('Found 2011')
    
# Check to see if rank 1 is in 2012
if 1 in girl_names[2012]:
    print('Found Rank 1 in 2012')
else:
    print('Rank 1 missing from 2012')

Found 2011
Found Rank 1 in 2012


### CSV reader / DictReader

In [15]:
import csv

baby_names = {}
csvfile = open('data/baby_names.csv','r')

In [16]:
# CSV reader
reader = csv.reader(csvfile)
next(reader, None)  # skip the headers

for row in reader:
    baby_names[row[5]] = row[3]    # row = ['2011', 'FEMALE', 'HISPANIC', 'GIA', '21', '67']

print(baby_names.keys())

dict_keys(['75', '67', '42', '51', '53', '62', '8', '74', '71', '78', '73', '72', '77', '60', '1', '70', '61', '65', '58', '41', '76', '68', '48', '55', '40', '66', '64', '34', '44', '57', '63', '33', '31', '20', '69', '35', '13', '52', '59', '39', '9', '27', '10', '56', '12', '2', '25', '18', '14', '38', '28', '6', '3', '19', '45', '47', '11', '79', '17', '43', '80', '37', '81', '46', '5', '22', '50', '21', '30', '24', '54', '15', '36', '23', '7', '16', '49', '29', '4', '32', '26', '92', '90', '82', '91', '88', '89', '94', '83', '93', '84', '87', '85', '86', '96', '97', '95', '99', '98', '100', '101', '102'])


In [17]:
# CSV DictReader
for row in csv.DictReader(csvfile):
    baby_names[row['RANK']] = row['NAME']    # row = dictionary

print(baby_names.keys())

dict_keys(['75', '67', '42', '51', '53', '62', '8', '74', '71', '78', '73', '72', '77', '60', '1', '70', '61', '65', '58', '41', '76', '68', '48', '55', '40', '66', '64', '34', '44', '57', '63', '33', '31', '20', '69', '35', '13', '52', '59', '39', '9', '27', '10', '56', '12', '2', '25', '18', '14', '38', '28', '6', '3', '19', '45', '47', '11', '79', '17', '43', '80', '37', '81', '46', '5', '22', '50', '21', '30', '24', '54', '15', '36', '23', '7', '16', '49', '29', '4', '32', '26', '92', '90', '82', '91', '88', '89', '94', '83', '93', '84', '87', '85', '86', '96', '97', '95', '99', '98', '100', '101', '102'])


# 5. collections module

In [18]:
from collections import Counter

In [19]:
df_sta = pd.read_csv('data/cta_daily_station_totals.csv')
df_sta.head()

Unnamed: 0,station_id,stationname,date,daytype,rides
0,40010,Austin-Forest Park,01/01/2015,SUNDAY/HOLIDAY,587
1,40010,Austin-Forest Park,01/02/2015,WEEKDAY,1386
2,40010,Austin-Forest Park,01/03/2015,SATURDAY,785
3,40010,Austin-Forest Park,01/04/2015,SUNDAY/HOLIDAY,625
4,40010,Austin-Forest Park,01/05/2015,WEEKDAY,1752


In [20]:
stations = df_sta.stationname.unique()   # unique of column items
len(stations)

144

In [21]:
station_count = Counter(df_sta.stationname)
print(station_count)

Counter({'Austin-Forest Park': 700, 'Harlem-Lake': 700, 'Pulaski-Lake': 700, 'Quincy/Wells': 700, 'Davis': 700, "Belmont-O'Hare": 700, 'Jackson/Dearborn': 700, 'Sheridan': 700, 'Damen-Brown': 700, 'Morse': 700, '35th/Archer': 700, '51st': 700, 'Dempster-Skokie': 700, 'Pulaski-Cermak': 700, 'LaSalle/Van Buren': 700, 'Ashland-Lake': 700, 'Oak Park-Forest Park': 700, 'Sox-35th-Dan Ryan': 700, 'Randolph/Wabash': 700, 'Damen-Cermak': 700, 'Western-Forest Park': 700, 'Cumberland': 700, '79th': 700, 'Kedzie-Homan-Forest Park': 700, 'State/Lake': 700, 'Main': 700, 'Central-Lake': 700, 'Ashland/63rd': 700, 'Indiana': 700, 'Western-Orange': 700, 'Division/Milwaukee': 700, 'Grand/State': 700, 'Berwyn': 700, 'UIC-Halsted': 700, 'Southport': 700, 'Washington/Dearborn': 700, 'Clark/Lake': 700, 'Forest Park': 700, 'Noyes': 700, 'Cicero-Cermak': 700, 'Clinton-Forest Park': 700, 'California-Cermak': 700, '95th/Dan Ryan': 700, 'Merchandise Mart': 700, 'Racine': 700, 'Cicero-Lake': 700, 'Grand/Milwaukee'

In [22]:
# Finding most common elements
print(station_count.most_common(5))

[('Austin-Forest Park', 700), ('Harlem-Lake', 700), ('Pulaski-Lake', 700), ('Quincy/Wells', 700), ('Davis', 700)]


In [23]:
entries = np.array(df_sta[['date','stationname','rides']])[:1000]
entries.shape

(1000, 3)

In [24]:
# defaultdict()
# dictionary에 기본값을 정의해 키값이 없더라도 에러를 출력하지않고 기본값을 출력한다.
from collections import defaultdict

In [25]:
# Create a defaultdict with a default type of list: ridership
ridership = defaultdict(list)

for date, stop, riders in entries:
    # Use the stop as the key of ridership and append the riders to its value
    ridership[date].append((stop, riders))

In [26]:
print(list(ridership.items())[:1])

[('01/01/2015', [('Austin-Forest Park', 587), ('Harlem-Lake', 1106), ('Pulaski-Lake', 811), ('Quincy/Wells', 1117), ('Davis', 1400), ("Belmont-O'Hare", 2023), ('Jackson/Dearborn', 1730), ('Sheridan', 2616), ('Damen-Brown', 751), ('Morse', 2433), ('35th/Archer', 862), ('51st', 430), ('Dempster-Skokie', 542), ('Pulaski-Cermak', 491), ('LaSalle/Van Buren', 270), ('Ashland-Lake', 833), ('Oak Park-Forest Park', 416), ('Sox-35th-Dan Ryan', 1862), ('Randolph/Wabash', 2267), ('Damen-Cermak', 451), ('Western-Forest Park', 673), ('Cumberland', 1053), ('79th', 3641), ('Kedzie-Homan-Forest Park', 1151), ('State/Lake', 3566), ('Main', 468), ('Central-Lake', 985), ('Ashland/63rd', 547), ('Indiana', 319), ('Western-Orange', 1029), ('Division/Milwaukee', 2714), ('Grand/State', 12152), ('Berwyn', 1744)])]


In [27]:
# OrderedDictionaries
from collections import OrderedDict

In [28]:
ridership_date = OrderedDict()

for date, stop, riders in entries:
    # If a key does not exist in ridership_date, set it to 0
    if not date in ridership_date:
        ridership_date[date] = 0
        continue
        
    # Add riders to the date key in ridership_date
    ridership_date[date] += riders
    
print(list(ridership_date.items())[:20])

[('01/01/2015', 52453), ('01/02/2015', 94310), ('01/03/2015', 58621), ('01/04/2015', 46778), ('01/05/2015', 113876), ('01/06/2015', 117591), ('01/07/2015', 87687), ('01/08/2015', 100296), ('01/09/2015', 108958), ('01/10/2015', 57727), ('01/11/2015', 46049), ('01/12/2015', 123425), ('01/13/2015', 124727), ('01/14/2015', 124723), ('01/15/2015', 127371), ('01/16/2015', 125892), ('01/17/2015', 69142), ('01/18/2015', 50461), ('01/19/2015', 83803), ('01/20/2015', 126402)]


# 6. DateTimes

In [29]:
from datetime import datetime
from datetime import timedelta

In [30]:
local_dt = datetime.now()
print(local_dt)

# Compute the UTC datetime
utc_dt = datetime.utcnow()
print(utc_dt)

# Finding a time in the future and from the past
glanceback = timedelta(days=30)
print(local_dt - glanceback)
print(local_dt + glanceback)

2019-04-01 16:49:57.465794
2019-04-01 07:49:57.465794
2019-03-02 16:49:57.465794
2019-05-01 16:49:57.465794


In [31]:
for date_str in entries[:5]:
    # Convert each date to a datetime object
    date_obj  = datetime.strptime(date_str[0], '%m/%d/%Y')
    print(date_obj)
    
    # Convert to string
    print(datetime.strftime(date_obj, '%m/%d/%Y'))
    
    print(datetime.isoformat(date_obj))    # ISO standard string
    print()

2015-01-01 00:00:00
01/01/2015
2015-01-01T00:00:00

2015-01-02 00:00:00
01/02/2015
2015-01-02T00:00:00

2015-01-03 00:00:00
01/03/2015
2015-01-03T00:00:00

2015-01-04 00:00:00
01/04/2015
2015-01-04T00:00:00

2015-01-05 00:00:00
01/05/2015
2015-01-05T00:00:00



In [32]:
# Summary as the month, year, day

monthly_total_rides = defaultdict(int)

for date, stop, riders in entries:
    service_datetime = datetime.strptime(date, '%m/%d/%Y')

    # Add the total rides to the current amount for the month
    monthly_total_rides[service_datetime.month] += int(riders)
    
print(monthly_total_rides)

defaultdict(<class 'int'>, {1: 3079348})


# Excercise

In [33]:
csvfile = open('data/chicago_crime.csv', 'r')

crime_data = []

for row in csv.reader(csvfile):
    # Append the date, type of crime, location description, and arrest
    crime_data.append((row[0], row[2], row[4], row[5]))
    
# Remove the first element from crime_data
crime_data.pop(0)

('Date', 'Primary Type', 'Location Description', 'Arrest')

In [34]:
crimes_by_month = Counter()

for data in crime_data:
    
    # Convert to datetimes
    date = datetime.strptime(data[0], '%m/%d/%Y %I:%M:%S %p')
    
    # count by month
    crimes_by_month[date.month] += 1
    
# most common months for crime
print(crimes_by_month.most_common(3))

[(1, 1948), (2, 1862), (7, 1257)]


In [35]:
# locations by month
locations_by_month = defaultdict(list)

for row in crime_data:
    date = datetime.strptime(row[0], '%m/%d/%Y %I:%M:%S %p')
    
    # If the year is 2016 
    if date.year == 2016:
        # Set the dictionary key to the month and add the location
        locations_by_month[date.month].append(row[2])
    
print(locations_by_month[12][:10])

['APARTMENT', 'STREET', 'RESIDENCE', 'CONSTRUCTION SITE', 'APARTMENT', 'STREET', 'VEHICLE NON-COMMERCIAL', 'STREET', 'APARTMENT', 'STREET']


In [36]:
# Find the Most Common Crimes by Location Type by Month in 2016
for month, locations in locations_by_month.items():    
    location_count = Counter(locations)
    print(month, location_count.most_common(5))

5 [('STREET', 241), ('RESIDENCE', 175), ('APARTMENT', 128), ('SIDEWALK', 111), ('OTHER', 41)]
3 [('STREET', 240), ('RESIDENCE', 190), ('APARTMENT', 139), ('SIDEWALK', 99), ('OTHER', 52)]
4 [('STREET', 213), ('RESIDENCE', 171), ('APARTMENT', 152), ('SIDEWALK', 96), ('OTHER', 40)]
6 [('STREET', 245), ('RESIDENCE', 164), ('APARTMENT', 159), ('SIDEWALK', 123), ('PARKING LOT/GARAGE(NON.RESID.)', 44)]
7 [('STREET', 309), ('RESIDENCE', 177), ('APARTMENT', 166), ('SIDEWALK', 125), ('OTHER', 47)]
10 [('STREET', 248), ('RESIDENCE', 206), ('APARTMENT', 122), ('SIDEWALK', 92), ('OTHER', 62)]
12 [('STREET', 207), ('RESIDENCE', 158), ('APARTMENT', 136), ('OTHER', 47), ('SIDEWALK', 46)]
1 [('STREET', 196), ('RESIDENCE', 160), ('APARTMENT', 153), ('SIDEWALK', 72), ('PARKING LOT/GARAGE(NON.RESID.)', 43)]
9 [('STREET', 279), ('RESIDENCE', 183), ('APARTMENT', 144), ('SIDEWALK', 121), ('OTHER', 39)]
11 [('STREET', 236), ('RESIDENCE', 182), ('APARTMENT', 154), ('SIDEWALK', 75), ('OTHER', 41)]
8 [('STREET',

In [37]:
# crimes by district
csvfile = open('data/chicago_crime.csv', 'r')

crimes_by_district = defaultdict(list)

for row in csv.DictReader(csvfile):
    # Pop the district from each row
    district = row.pop('District')
    
    # Append the rest of the data to the list for proper district in crimes_by_district
    crimes_by_district[district].append(row)
    
print(crimes_by_district.keys())

dict_keys(['14', '24', '6', '15', '12', '7', '1', '11', '18', '22', '5', '16', '9', '8', '3', '2', '19', '10', '4', '17', '20', '25', '31'])


In [38]:
print(crimes_by_district['25'][:5])

[OrderedDict([('Date', '02/15/2016 04:45:00 AM'), ('Block', '029XX N LOTUS AVE'), ('Primary Type', 'BURGLARY'), ('Description', 'FORCIBLE ENTRY'), ('Location Description', 'RESIDENCE-GARAGE'), ('Arrest', 'false'), ('Domestic', 'false')]), OrderedDict([('Date', '02/13/2017 10:45:00 AM'), ('Block', '014XX N LEAMINGTON AVE'), ('Primary Type', 'CRIMINAL DAMAGE'), ('Description', 'TO PROPERTY'), ('Location Description', 'SCHOOL, PUBLIC, BUILDING'), ('Arrest', 'false'), ('Domestic', 'false')]), OrderedDict([('Date', '07/17/2016 11:25:00 AM'), ('Block', '049XX W ST PAUL AVE'), ('Primary Type', 'CRIMINAL DAMAGE'), ('Description', 'TO PROPERTY'), ('Location Description', 'APARTMENT'), ('Arrest', 'false'), ('Domestic', 'false')]), OrderedDict([('Date', '02/05/2017 05:30:00 AM'), ('Block', '051XX W DIVERSEY AVE'), ('Primary Type', 'BATTERY'), ('Description', 'DOMESTIC BATTERY SIMPLE'), ('Location Description', 'APARTMENT'), ('Arrest', 'true'), ('Domestic', 'true')]), OrderedDict([('Date', '11/07/

In [39]:
# Arrest count by District by Year
for district, crimes in crimes_by_district.items():
    year_count = Counter()

    for crime in crimes:
        # If there was an arrest
        if crime['Arrest'] == 'true':
            year = datetime.strptime(crime['Date'], '%m/%d/%Y %I:%M:%S %p').year
            year_count[year] += 1
            
    print(district, year_count)

14 Counter({2016: 59, 2017: 8})
24 Counter({2016: 51, 2017: 10})
6 Counter({2016: 157, 2017: 32})
15 Counter({2016: 154, 2017: 16})
12 Counter({2016: 72, 2017: 9})
7 Counter({2016: 181, 2017: 27})
1 Counter({2016: 124, 2017: 15})
11 Counter({2016: 275, 2017: 53})
18 Counter({2016: 92, 2017: 17})
22 Counter({2016: 78, 2017: 12})
5 Counter({2016: 149, 2017: 30})
16 Counter({2016: 66, 2017: 9})
9 Counter({2016: 116, 2017: 17})
8 Counter({2016: 124, 2017: 26})
3 Counter({2016: 98, 2017: 18})
2 Counter({2016: 84, 2017: 15})
19 Counter({2016: 88, 2017: 11})
10 Counter({2016: 144, 2017: 20})
4 Counter({2016: 134, 2017: 15})
17 Counter({2016: 38, 2017: 5})
20 Counter({2016: 27, 2017: 8})
25 Counter({2016: 150, 2017: 26})
31 Counter({2016: 1})


In [40]:
csvfile = open('data/chicago_crime.csv', 'r')

crimes_by_block = defaultdict(list)

for row in csv.DictReader(csvfile):
    crimes_by_block[row['Block']].append(row['Primary Type'])
    
print(len(crimes_by_block.keys()))

9195


In [41]:
n_state_st_crimes = set(crimes_by_block['001XX N STATE ST'])
print(n_state_st_crimes)

w_terminal_st_crimes = set(crimes_by_block['0000X W TERMINAL ST'])
print(w_terminal_st_crimes)

# Find the differences
crime_differences = n_state_st_crimes.difference(w_terminal_st_crimes)
print(crime_differences)

{'ASSAULT', 'THEFT', 'CRIMINAL DAMAGE', 'DECEPTIVE PRACTICE', 'OTHER OFFENSE', 'CRIMINAL TRESPASS', 'BATTERY', 'ROBBERY'}
{'ASSAULT', 'THEFT', 'PUBLIC PEACE VIOLATION', 'CRIMINAL DAMAGE', 'DECEPTIVE PRACTICE', 'OTHER OFFENSE', 'CRIMINAL TRESPASS', 'NARCOTICS'}
{'ROBBERY', 'BATTERY'}
