In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

# Data Cleanup

## Na'vi River Journey

In [2]:
# Load the raw Na'vi River Journey wait-time records from CSV.
navi = pd.read_csv('data/navi_river.csv')

In [3]:
# Preview the first rows of the raw frame.
navi.head()

Unnamed: 0,date,datetime,SACTMIN,SPOSTMIN
0,05/26/2017,2017-05-26 09:09:46,,-999.0
1,05/26/2017,2017-05-26 09:10:12,,5.0
2,05/26/2017,2017-05-26 09:17:09,,45.0
3,05/26/2017,2017-05-26 09:24:07,,45.0
4,05/26/2017,2017-05-26 09:30:10,,45.0


In [4]:
# Inspect column dtypes: 'date' and 'datetime' load as generic object columns.
navi.dtypes

date         object
datetime     object
SACTMIN     float64
SPOSTMIN    float64
dtype: object

In [5]:
# Parse the 'date' and 'datetime' columns from strings (object dtype) into
# datetime64 values. assign() returns a new frame, which is bound back to `navi`.
navi = navi.assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    datetime=lambda frame: pd.to_datetime(frame['datetime']),
)

In [6]:
# Numeric day of the week, shifted by one so that Monday == 1
# (dt.dayofweek on its own counts from Monday == 0).
navi['weekday'] = navi['date'].dt.dayofweek + 1

# Matching day name, e.g. 'Friday'.
navi['day_name'] = navi['date'].dt.day_name()

In [7]:
#navi['day_name'] = navi.day_name.astype('string')

In [8]:
# Confirm the converted dates and the new 'weekday' / 'day_name' columns.
navi.head()

Unnamed: 0,date,datetime,SACTMIN,SPOSTMIN,weekday,day_name
0,2017-05-26,2017-05-26 09:09:46,,-999.0,5,Friday
1,2017-05-26,2017-05-26 09:10:12,,5.0,5,Friday
2,2017-05-26,2017-05-26 09:17:09,,45.0,5,Friday
3,2017-05-26,2017-05-26 09:24:07,,45.0,5,Friday
4,2017-05-26,2017-05-26 09:30:10,,45.0,5,Friday


In [9]:
# Re-check dtypes after conversion: 'date' and 'datetime' should be datetime64[ns].
navi.dtypes

date        datetime64[ns]
datetime    datetime64[ns]
SACTMIN            float64
SPOSTMIN           float64
weekday              int64
day_name            object
dtype: object

In [10]:
# Snapshot the full frame (rows with missing or placeholder SPOSTMIN included)
# before the filtering below, so the unfiltered data stays available.
navi_orig = navi.copy()

In [13]:
# SPOSTMIN is the posted wait time in minutes.
# Keep only rows that have a posted wait, then discard the -999 entries
# (presumably a placeholder code rather than a real wait -- confirm against
# the data dictionary).
navi_posted = (
    navi
    .dropna(subset=['SPOSTMIN'])
    .loc[lambda frame: frame['SPOSTMIN'] != -999]
)

In [14]:
# Row/column count remaining after the SPOSTMIN filter.
navi_posted.shape

(169980, 6)

In [15]:
# Index the frame by calendar date to support day-level (daily) analysis.
navi_posted = navi_posted.set_index('date')

In [16]:
# Spot-check the first two rows of the date-indexed frame.
navi_posted.head(2)

Unnamed: 0_level_0,datetime,SACTMIN,SPOSTMIN,weekday,day_name
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-05-26,2017-05-26 09:10:12,,5.0,5,Friday
2017-05-26,2017-05-26 09:17:09,,45.0,5,Friday


## Flight of Passage

In [19]:
# Load the raw Flight of Passage wait-time records from CSV.
flight = pd.read_csv('data/flight_of_passage.csv')

In [20]:
# Parse the 'date' and 'datetime' columns from strings (object dtype) into
# datetime64 values. assign() returns a new frame, which is bound back to `flight`.
flight = flight.assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    datetime=lambda frame: pd.to_datetime(frame['datetime']),
)

In [21]:
# Numeric day of the week, shifted by one so that Monday == 1
# (dt.dayofweek on its own counts from Monday == 0).
flight['weekday'] = flight['date'].dt.dayofweek + 1

# Matching day name, e.g. 'Friday'.
flight['day_name'] = flight['date'].dt.day_name()

In [22]:
#flight['day_name'] = flight.day_name.astype('string')

In [23]:
# Confirm the conversion: 'date' and 'datetime' should be datetime64[ns],
# with the derived 'weekday' and 'day_name' columns present.
flight.dtypes

date        datetime64[ns]
datetime    datetime64[ns]
SACTMIN            float64
SPOSTMIN           float64
weekday              int64
day_name            object
dtype: object

In [28]:
# Keep only rows that have a posted wait (SPOSTMIN, minutes), then discard
# the -999 entries (presumably a placeholder code rather than a real wait --
# confirm against the data dictionary).
flight_posted = (
    flight
    .dropna(subset=['SPOSTMIN'])
    .loc[lambda frame: frame['SPOSTMIN'] != -999]
)

In [29]:
# Row/column count remaining after the SPOSTMIN filter.
flight_posted.shape

(173527, 6)

In [30]:
# Preview the filtered rows (original row labels are kept, so they start above 0).
flight_posted.head()

Unnamed: 0,date,datetime,SACTMIN,SPOSTMIN,weekday,day_name
2,2017-05-26,2017-05-26 09:10:12,,5.0,5,Friday
3,2017-05-26,2017-05-26 09:17:09,,60.0,5,Friday
4,2017-05-26,2017-05-26 09:24:07,,60.0,5,Friday
5,2017-05-26,2017-05-26 09:30:10,,60.0,5,Friday
6,2017-05-26,2017-05-26 09:38:10,,45.0,5,Friday


In [31]:
# Index the frame by calendar date to support day-level (daily) analysis.
flight_posted = flight_posted.set_index('date')

## Kilimanjaro Safaris

In [32]:
# Load the raw Kilimanjaro Safaris wait-time records from CSV.
safari = pd.read_csv('data/kilimanjaro_safaris.csv')

In [37]:
# Keep only rows that have a posted wait (SPOSTMIN, minutes), then discard
# the -999 entries (presumably a placeholder code rather than a real wait --
# confirm against the data dictionary).
# The trailing .copy() makes safari_posted an independent frame, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
safari_posted = (
    safari
    .dropna(subset=['SPOSTMIN'])
    .loc[lambda frame: frame['SPOSTMIN'] != -999]
    .copy()
)

In [38]:
# Shape of the raw (unfiltered) safari frame.
# NOTE(review): the other ride sections report the filtered *_posted shape at
# this point -- confirm whether safari_posted.shape was intended here.
safari.shape

(257785, 4)

In [39]:
# Preview the first rows of the raw safari frame.
safari.head()

Unnamed: 0,date,datetime,SACTMIN,SPOSTMIN
0,01/01/2015,2015-01-01 07:47:26,,5.0
1,01/01/2015,2015-01-01 07:54:23,,5.0
2,01/01/2015,2015-01-01 08:05:33,,5.0
3,01/01/2015,2015-01-01 08:12:23,,10.0
4,01/01/2015,2015-01-01 08:19:26,,10.0


In [40]:
# dtypes of the raw frame: 'date' and 'datetime' are still object columns.
safari.dtypes

date         object
datetime     object
SACTMIN     float64
SPOSTMIN    float64
dtype: object

In [41]:
# Preview the filtered frame (dates have not been converted yet).
safari_posted.head()

Unnamed: 0,date,datetime,SACTMIN,SPOSTMIN
0,01/01/2015,2015-01-01 07:47:26,,5.0
1,01/01/2015,2015-01-01 07:54:23,,5.0
2,01/01/2015,2015-01-01 08:05:33,,5.0
3,01/01/2015,2015-01-01 08:12:23,,10.0
4,01/01/2015,2015-01-01 08:19:26,,10.0


In [42]:
# Convert 'date' and 'datetime' from strings (object dtype) to datetime64.
# The values are parsed from safari_posted itself rather than the unfiltered
# `safari` frame, so only the retained rows are converted and the result does
# not depend on index alignment between the two frames.
# assign() builds a new frame, which also avoids writing into a filtered
# slice (SettingWithCopyWarning).
safari_posted = safari_posted.assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    datetime=lambda frame: pd.to_datetime(frame['datetime']),
)

In [43]:
# Numeric day of the week, shifted by one so that Monday == 1
# (dt.dayofweek on its own counts from Monday == 0).
safari_posted['weekday'] = safari_posted['date'].dt.dayofweek + 1

# Matching day name for each date.
safari_posted['day_name'] = safari_posted['date'].dt.day_name()

In [44]:
# Index the frame by calendar date to support day-level (daily) analysis.
safari_posted = safari_posted.set_index('date')

## Expedition Everest

In [45]:
# Load the raw Expedition Everest wait-time records from CSV.
everest = pd.read_csv('data/expedition_everest.csv')

In [48]:
# Keep only rows that have a posted wait (SPOSTMIN, minutes), then discard
# the -999 entries (presumably a placeholder code rather than a real wait --
# confirm against the data dictionary).
# The trailing .copy() makes everest_posted an independent frame, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
everest_posted = (
    everest
    .dropna(subset=['SPOSTMIN'])
    .loc[lambda frame: frame['SPOSTMIN'] != -999]
    .copy()
)

In [49]:
# Row/column count remaining after the SPOSTMIN filter.
everest_posted.shape

(246429, 4)

In [53]:
# Preview the first rows of the raw everest frame.
everest.head()

Unnamed: 0,date,datetime,SACTMIN,SPOSTMIN
0,01/01/2015,2015-01-01 07:47:26,,5.0
1,01/01/2015,2015-01-01 07:54:23,,5.0
2,01/01/2015,2015-01-01 08:05:33,,5.0
3,01/01/2015,2015-01-01 08:12:23,,5.0
4,01/01/2015,2015-01-01 08:19:26,,5.0


In [54]:
# dtypes before conversion: 'date' and 'datetime' are still object columns.
everest_posted.dtypes

date         object
datetime     object
SACTMIN     float64
SPOSTMIN    float64
dtype: object

In [55]:
# Convert 'date' and 'datetime' from strings (object dtype) to datetime64.
# everest_posted was produced by filtering `everest`, so assigning columns into
# it directly can trigger SettingWithCopyWarning; assign() builds a new frame
# instead, which is then bound back to the same name.
everest_posted = everest_posted.assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    datetime=lambda frame: pd.to_datetime(frame['datetime']),
)

In [56]:
# Numeric day of the week, shifted by one so that Monday == 1
# (dt.dayofweek on its own counts from Monday == 0).
everest_posted['weekday'] = everest_posted['date'].dt.dayofweek + 1

# Matching day name, e.g. 'Thursday'.
everest_posted['day_name'] = everest_posted['date'].dt.day_name()

In [57]:
# Confirm the converted dates and the new 'weekday' / 'day_name' columns.
everest_posted.head()

Unnamed: 0,date,datetime,SACTMIN,SPOSTMIN,weekday,day_name
0,2015-01-01,2015-01-01 07:47:26,,5.0,4,Thursday
1,2015-01-01,2015-01-01 07:54:23,,5.0,4,Thursday
2,2015-01-01,2015-01-01 08:05:33,,5.0,4,Thursday
3,2015-01-01,2015-01-01 08:12:23,,5.0,4,Thursday
4,2015-01-01,2015-01-01 08:19:26,,5.0,4,Thursday


In [58]:
# Index the frame by calendar date to support day-level (daily) analysis.
everest_posted = everest_posted.set_index('date')

In [59]:
# Final column dtypes now that 'date' has moved into the index.
everest_posted.dtypes

datetime    datetime64[ns]
SACTMIN            float64
SPOSTMIN           float64
weekday              int64
day_name            object
dtype: object

In [60]:
# Persist each cleaned frame to CSV; index=True writes the date index as the
# first column of every file.
for cleaned, csv_path in (
    (navi_posted, 'data/navi_posted.csv'),
    (flight_posted, 'data/flight_posted.csv'),
    (safari_posted, 'data/safari_posted.csv'),
    (everest_posted, 'data/everest_posted.csv'),
):
    cleaned.to_csv(csv_path, index=True)

Next notebook — Notebook 2: Na'vi River Journey EDA & Modeling