In [2]:
# Import Dependencies
import pandas as pd
import os

In [3]:
# Build the relative path to the UFO sightings CSV inside the Resources folder
csv_path = os.path.join('Resources', 'usa_ufo_sightings.csv')

# Load the CSV contents into a Pandas DataFrame
usa_ufo_df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows
usa_ufo_df.head(n=5)

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700.0,45 minutes,This event took place in early fall around 194...,4/27/2004,29.883056,-97.941111
1,10/10/1956 21:00,edna,tx,us,circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.978333,-96.645833
2,10/10/1960 20:00,kaneohe,hi,us,light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.418056,-157.803611
3,10/10/1961 19:00,bristol,tn,us,sphere,300.0,5 minutes,My father is now 89 my brother 52 the girl wit...,4/27/2007,36.595,-82.188889
4,10/10/1965 23:45,norwalk,ct,us,disk,1200.0,20 minutes,A bright orange color changing to reddish colo...,10/2/1999,41.1175,-73.408333


In [4]:
# Tally how many sightings have occurred within each state
# (value_counts orders the result from most to fewest sightings)
state_counts = usa_ufo_df.loc[:, "state"].value_counts()
state_counts.head(n=5)

ca    8683
fl    3754
wa    3707
tx    3398
ny    2915
Name: state, dtype: int64

In [6]:
# Split the sightings into one group per distinct "state" value
grouped_usa = usa_ufo_df.groupby(by='state')

# The result is a "GroupBy" object; displaying it only shows the object's repr,
# not a table of rows
grouped_usa

<pandas.core.groupby.DataFrameGroupBy object at 0x11626d6a0>

In [7]:
# To see the grouped data, apply an aggregation method to the GroupBy object;
# here count() gives the number of non-null values per column for each state
grouped_usa.count().head(n=5)

Unnamed: 0_level_0,datetime,city,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ak,311,311,311,311,311,311,311,311,311,311
al,629,629,629,629,629,629,629,629,629,629
ar,578,578,578,578,578,578,578,578,578,578
az,2362,2362,2362,2362,2362,2362,2362,2362,2362,2362
ca,8683,8683,8683,8683,8683,8683,8683,8683,8683,8683


In [8]:
# Total of the "duration (seconds)" column for each state
state_duration = grouped_usa["duration (seconds)"].sum()
state_duration.head(n=5)

state
ak     1455863.00
al      900453.50
ar    66986144.50
az    15453494.60
ca    24865571.47
Name: duration (seconds), dtype: float64

In [10]:
# Combine the per-state sighting counts and summed durations into one table;
# the two Series are aligned on their shared state index
state_summary_table = pd.DataFrame(
    data={
        "Number of Sightings": state_counts,
        "Total Visit Time": state_duration,
    }
)
state_summary_table.head(n=5)

Unnamed: 0,Number of Sightings,Total Visit Time
ak,311,1455863.0
al,629,900453.5
ar,578,66986144.5
az,2362,15453494.6
ca,8683,24865571.47


In [11]:
# A DataFrame can also be grouped by several columns at once;
# aggregating such a grouping gives a result indexed by (state, city)
grouped_city = usa_ufo_df.groupby(by=['state', 'city'])

# Calling an aggregate function on one column of the GroupBy object yields a Series
city_duration = grouped_city["duration (seconds)"].sum()
city_duration.head(n=10)

state  city        
ak     adak               180.0
       anchor point       300.0
       anchorage       109187.0
       angoon             600.0
       auke bay         40200.0
       bethel            2760.0
       big lake           180.0
       butte              240.0
       chugiak            180.0
       clam gulch        1200.0
Name: duration (seconds), dtype: float64

In [15]:
# Turn the per-(state, city) duration totals into a single-column DataFrame.
# Series.to_frame() states the intent directly instead of passing the Series
# back through the DataFrame constructor; the result is the same frame.
city_duration_df = grouped_city["duration (seconds)"].sum().to_frame()

# Preview only the first rows rather than rendering every (state, city) pair
city_duration_df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,duration (seconds)
state,city,Unnamed: 2_level_1
ak,adak,180.0
ak,anchor point,300.0
ak,anchorage,109187.0
ak,angoon,600.0
ak,auke bay,40200.0
ak,bethel,2760.0
ak,big lake,180.0
ak,butte,240.0
ak,chugiak,180.0
ak,clam gulch,1200.0


In [16]:
# Selecting on the outer (state) index level returns that state's rows indexed by city;
# preview the first ten instead of rendering every city in the state
city_duration_df.loc['ak'].head(10)

Unnamed: 0_level_0,duration (seconds)
city,Unnamed: 1_level_1
adak,180.0
anchor point,300.0
anchorage,109187.0
angoon,600.0
auke bay,40200.0
bethel,2760.0
big lake,180.0
butte,240.0
chugiak,180.0
clam gulch,1200.0


In [17]:
# Select the single row for the (state, city) key ('ak', 'anchorage').
# The trailing ":" makes it explicit that the tuple addresses the row MultiIndex
# and is not a (row, column) pair. The result is a one-element Series, so the
# former .head() call was a no-op and has been dropped.
city_duration_df.loc[('ak', 'anchorage'), :]

duration (seconds)    109187.0
Name: (ak, anchorage), dtype: float64

In [13]:
# Give .loc both the (state, city) row key and the column label to get the scalar total
city_duration_df.loc[('ak', 'anchorage'), 'duration (seconds)']

109187.0

## See also: the [agg function](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.DataFrameGroupBy.agg.html)