In [1]:
import pandas as pd

In [2]:
w_df = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/web_events.csv')
l_df = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/life_expectancy.csv')

In [3]:
#Load the web_events.csv data set into a Pandas dataframe

w_df.dtypes

timestamp          int64
visitorid          int64
event             object
itemid             int64
transactionid    float64
dtype: object

In [4]:
#Convert the values in the timestamp field to datetimes

w_df['timestamp'] = pd.to_datetime(w_df['timestamp'], unit='ms')

In [6]:
#Extract different time units from the timestamp field.

w_df['year'] = w_df['timestamp'].dt.year
w_df['month'] = w_df['timestamp'].dt.month
w_df['day'] = w_df['timestamp'].dt.day
w_df['weekday'] = w_df['timestamp'].dt.weekday
w_df['timestamp'].dt.round('H')
w_df['hour'] = w_df['timestamp'].dt.hour

In [8]:
#Aggregate on each one, counting the number of records, and see what insights you can discover for each type of event

w_df.groupby(by=['event', 'year']).count()
w_df.groupby(by=['event', 'month']).count()
w_df.groupby(by=['event', 'weekday']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,visitorid,itemid,transactionid,year,month,day,hour
event,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
addtocart,0,11371,11371,11371,0,11371,11371,11371,11371
addtocart,1,11585,11585,11585,0,11585,11585,11585,11585
addtocart,2,11384,11384,11384,0,11384,11384,11384,11384
addtocart,3,10951,10951,10951,0,10951,10951,10951,10951
addtocart,4,9545,9545,9545,0,9545,9545,9545,9545
addtocart,5,6949,6949,6949,0,6949,6949,6949,6949
addtocart,6,7547,7547,7547,0,7547,7547,7547,7547
transaction,0,3848,3848,3848,3848,3848,3848,3848,3848
transaction,1,3973,3973,3973,3973,3973,3973,3973,3973
transaction,2,4151,4151,4151,4151,4151,4151,4151,4151


In [9]:
#Round datetimes by hour, aggregate, and see what insights you can discover

w_df.timestamp = w_df.timestamp.dt.round('H')
w_df["hour"] = w_df.timestamp.dt.hour
w_df.groupby(by=['event', 'hour']).head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,year,month,day,weekday,hour
0,2015-06-02 05:00:00,257597,view,355908,,2015,6,2,1,5
1,2015-06-02 06:00:00,992329,view,248676,,2015,6,2,1,6
2,2015-06-02 05:00:00,111016,view,318965,,2015,6,2,1,5
3,2015-06-02 05:00:00,483717,view,253185,,2015,6,2,1,5
4,2015-06-02 05:00:00,951259,view,367447,,2015,6,2,1,5
...,...,...,...,...,...,...,...,...,...,...
125779,2015-06-06 10:00:00,592787,transaction,339411,1657.0,2015,6,6,5,10
134001,2015-06-07 09:00:00,1169956,transaction,35245,14201.0,2015,6,7,6,9
139384,2015-06-07 09:00:00,1169956,transaction,308648,14201.0,2015,6,7,6,9
158937,2015-06-08 12:00:00,1381830,transaction,420896,9551.0,2015,6,8,0,12


In [10]:
#Load the life_expectancy.csv data set into a Pandas dataframe

l_df.dtypes

Country Name       object
Country Code       object
Indicator Name     object
Indicator Code     object
1960              float64
                   ...   
2012              float64
2013              float64
2014              float64
2015              float64
2016              float64
Length: 61, dtype: object

In [11]:
l_df.columns

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016'],
      dtype='object')

In [12]:
ids = ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code']
years = l_df.columns.drop(ids)

m_df = pd.melt(l_df, id_vars=ids, value_vars=years, var_name='Year' , value_name='Value')
m_df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Value
0,Aruba,ABW,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,65.662000
1,Afghanistan,AFG,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,32.292000
2,Angola,AGO,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,33.251000
3,Albania,ALB,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,62.279000
4,Andorra,AND,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,
...,...,...,...,...,...,...
15043,Kosovo,XKX,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,2016,71.646341
15044,"Yemen, Rep.",YEM,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,2016,64.953000
15045,South Africa,ZAF,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,2016,62.774000
15046,Zambia,ZMB,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,2016,61.874000


In [13]:
#Practice address missing values for countries using the different approaches (imputation, interpolation, and deletion)
#Imputation
m_df.Value.fillna(m_df.Value.mean())

0        65.662000
1        32.292000
2        33.251000
3        62.279000
4        63.544406
           ...    
15043    71.646341
15044    64.953000
15045    62.774000
15046    61.874000
15047    61.163000
Name: Value, Length: 15048, dtype: float64

In [14]:
m_df.Value.fillna(m_df.Value.median())

0        65.662000
1        32.292000
2        33.251000
3        62.279000
4        66.328000
           ...    
15043    71.646341
15044    64.953000
15045    62.774000
15046    61.874000
15047    61.163000
Name: Value, Length: 15048, dtype: float64

In [15]:
#interpolation

new_df = m_df.Value.fillna(method='ffill')
new_df

0        65.662000
1        32.292000
2        33.251000
3        62.279000
4        62.279000
           ...    
15043    71.646341
15044    64.953000
15045    62.774000
15046    61.874000
15047    61.163000
Name: Value, Length: 15048, dtype: float64

In [16]:
b_df = m_df.Value.fillna(method='bfill')
b_df

0        65.662000
1        32.292000
2        33.251000
3        62.279000
4        46.825065
           ...    
15043    71.646341
15044    64.953000
15045    62.774000
15046    61.874000
15047    61.163000
Name: Value, Length: 15048, dtype: float64

In [17]:
s_df = (new_df + b_df)/2
s_df

0        65.662000
1        32.292000
2        33.251000
3        62.279000
4        54.552032
           ...    
15043    71.646341
15044    64.953000
15045    62.774000
15046    61.874000
15047    61.163000
Name: Value, Length: 15048, dtype: float64

In [18]:
#Deletion

m_df.Value.dropna()

0        65.662000
1        32.292000
2        33.251000
3        62.279000
5        46.825065
           ...    
15043    71.646341
15044    64.953000
15045    62.774000
15046    61.874000
15047    61.163000
Name: Value, Length: 13747, dtype: float64