# EDA

In [1]:
import datetime as dt

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

In [2]:
# Path to the input CSV; as a bare file name it is resolved relative to the
# working directory the notebook runs in.
csvfile = "meetings_short.csv"

In [3]:
# Load the CSV into a DataFrame using pandas' default parsing options.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which is
# presumably a row index written out by an earlier export; if so, passing
# index_col=0 would avoid carrying it as a data column -- confirm before changing.
df = pd.read_csv(csvfile)

In [4]:
# Preview the first five rows to check the column names and sample values.
df.head()

Unnamed: 0,Name,Day,Time,Video,Email,Desc,Categories,Phone
0,AA Lucan,Tuesday,3:00 pm,https://us02web.zoom.us/j/86771088384?pwd=UmJR...,lucanonlinegroup@gmail.com,"An open AA meeting based out of Dublin, Irelan...","Audio,Open,Tuesday,Video",
1,AA North Portugal,Tuesday,3:00 pm,https://us02web.zoom.us/j/3728994472,aaportugalnorth@outlook.com,"Public Email Contact, if any: aaportugalnorth@...","Open,Tuesday,Video",
2,HOPE Group,Tuesday,3:00 pm,https://zoom.us/j/96250625525,hopenonlysi@gmail.com,Closed Discussion. We read the 3 last paragrap...,"Audio,Discussion,Telephone,Tuesday,Video",16465588656
3,"No Fees, No Dues",Tuesday,3:00 pm,https://meet.jit.si/247recovery,nofeesnodues@gmail.com,It is encouraged that a structured meeting beg...,"Audio,Closed,Telephone,Tuesday,Video",
4,PG & Chill,Tuesday,3:00 pm,https://us02web.zoom.us/j/802496652,superdave1212@mac.com,Password 960328,"Audio,Open,Telephone,Tuesday,Video","16699006833,,802496652"


### Transforming

##### Convert the 'Time' column from strings to time data types

In [27]:
# Parse the "Time" strings (e.g. "3:00 pm") and keep only the clock time.
# The result holds Python datetime.time objects, which is why its dtype is object.
pd.to_datetime(df["Time"]).dt.time

0    15:00:00
1    15:00:00
2    15:00:00
3    15:00:00
4    15:00:00
5    15:00:00
6    15:00:00
7    15:00:00
8    15:00:00
9    15:30:00
Name: Time, dtype: object

In [36]:
# dt.normalize() resets the time part of each timestamp to 00:00:00, so subtracting
# it cancels the date and leaves the time of day as a timedelta since midnight.
pd.to_datetime(df["Time"]).pipe(lambda stamps: stamps - stamps.dt.normalize())

0   15:00:00
1   15:00:00
2   15:00:00
3   15:00:00
4   15:00:00
5   15:00:00
6   15:00:00
7   15:00:00
8   15:00:00
9   15:30:00
Name: Time, dtype: timedelta64[ns]

In this case, the time data might be better analyzed as categorical. But since it also has an ordinal component ("how many meetings are earlier in the day, and how many are later?"), let's save both versions: the time of day (`Time_dt`) and the time elapsed since midnight (`Time_del`).

In [37]:
# Parse the "Time" strings once; both derived columns are built from this Series.
time_stamps = pd.to_datetime(df["Time"])

# Time_dt: the time of day as datetime.time objects (object dtype).
df["Time_dt"] = time_stamps.dt.time
# Time_del: the time elapsed since midnight as a timedelta. dt.normalize() resets
# the time part to 00:00:00, so the subtraction cancels the date.
df["Time_del"] = time_stamps - time_stamps.dt.normalize()
df.head()

Unnamed: 0,Name,Day,Time,Video,Email,Desc,Categories,Phone,Time_dt,Time_del
0,AA Lucan,Tuesday,3:00 pm,https://us02web.zoom.us/j/86771088384?pwd=UmJR...,lucanonlinegroup@gmail.com,"An open AA meeting based out of Dublin, Irelan...","Audio,Open,Tuesday,Video",,15:00:00,15:00:00
1,AA North Portugal,Tuesday,3:00 pm,https://us02web.zoom.us/j/3728994472,aaportugalnorth@outlook.com,"Public Email Contact, if any: aaportugalnorth@...","Open,Tuesday,Video",,15:00:00,15:00:00
2,HOPE Group,Tuesday,3:00 pm,https://zoom.us/j/96250625525,hopenonlysi@gmail.com,Closed Discussion. We read the 3 last paragrap...,"Audio,Discussion,Telephone,Tuesday,Video",16465588656,15:00:00,15:00:00
3,"No Fees, No Dues",Tuesday,3:00 pm,https://meet.jit.si/247recovery,nofeesnodues@gmail.com,It is encouraged that a structured meeting beg...,"Audio,Closed,Telephone,Tuesday,Video",,15:00:00,15:00:00
4,PG & Chill,Tuesday,3:00 pm,https://us02web.zoom.us/j/802496652,superdave1212@mac.com,Password 960328,"Audio,Open,Telephone,Tuesday,Video","16699006833,,802496652",15:00:00,15:00:00


##### Weekdays are ordered categorical data

In [41]:
# Weekday names in calendar order; the list order defines the category order.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_type = CategoricalDtype(categories=days, ordered=True)
# Casting to the ordered categorical turns any "Day" value that is not listed in
# `days` into NaN, so a typo or different capitalization would silently drop out.
df['Day'] = df['Day'].astype(day_type)

### Describing

In [53]:
# A cell only echoes its final expression, so the minimum is printed explicitly;
# as a bare expression it would be computed and silently discarded.
print("min:", df["Time_del"].min())
df["Time_del"].max()

Timedelta('0 days 15:30:00')

In [54]:
# Count meetings per weekday. Because "Day" is categorical, weekdays with no rows
# still appear with a count of 0. The result is ordered by count, not by weekday.
# NOTE(review): append .sort_index() if calendar order is wanted -- confirm.
df["Day"].value_counts()

Tuesday      10
Sunday        0
Saturday      0
Friday        0
Thursday      0
Wednesday     0
Monday        0
Name: Day, dtype: int64

In [56]:
# Count how many meetings share each start time (expressed as time since midnight).
df['Time_del'].value_counts()

15:00:00    9
15:30:00    1
Name: Time_del, dtype: int64