In [1]:
import pandas as pd

### Load the web_events.csv data set into a Pandas dataframe.

In [3]:
# Pull the raw web-events CSV straight from its hosted location into a frame.
web_df = pd.read_csv(
    filepath_or_buffer="https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/web_events.csv",
)

In [4]:
# Summarize the frame: index range, per-column dtypes, and memory footprint.
web_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   timestamp      int64  
 1   visitorid      int64  
 2   event          object 
 3   itemid         int64  
 4   transactionid  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 105.1+ MB


In [5]:
# Preview the first five rows to sanity-check the load.
web_df.head(n=5)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


### Convert the values in the timestamp field to datetimes.

In [6]:
# The timestamp column is loaded as int64; unit="ms" tells pandas to read the
# integers as milliseconds when converting them to datetimes.
web_df["timestamp"] = pd.to_datetime(web_df["timestamp"], unit="ms")

In [7]:
# Re-inspect the dtypes to confirm that timestamp is now a datetime column.
web_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
 #   Column         Dtype         
---  ------         -----         
 0   timestamp      datetime64[ns]
 1   visitorid      int64         
 2   event          object        
 3   itemid         int64         
 4   transactionid  float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 105.1+ MB


### Extract different time units from the timestamp field.

In [8]:
# Calendar year of each event, taken from the datetime accessor.
web_df["year"] = web_df["timestamp"].dt.year

In [9]:
# Month number of each event.
web_df["month"] = web_df["timestamp"].dt.month

In [10]:
# Day of the month of each event.
web_df["day"] = web_df["timestamp"].dt.day

In [11]:
# Hour of the day (0-23) of each event.
web_df["hour"] = web_df["timestamp"].dt.hour

### Aggregate on each one, counting the number of records, and see what insights you can discover for each type of event.

In [17]:
# Records per (event, year). count() tallies non-null cells column by column,
# so transactionid is only non-zero for the "transaction" event.
web_df.groupby(["event", "year"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,visitorid,itemid,transactionid,month,day,hour
event,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
addtocart,2015,69332,69332,69332,0,69332,69332,69332
transaction,2015,22457,22457,22457,22457,22457,22457,22457
view,2015,2664312,2664312,2664312,0,2664312,2664312,2664312


In [18]:
# Records per (event, month); each column holds that group's non-null count.
web_df.groupby(["event", "month"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,visitorid,itemid,transactionid,year,day,hour
event,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
addtocart,5,14369,14369,14369,0,14369,14369,14369
addtocart,6,15095,15095,15095,0,15095,15095,15095
addtocart,7,17362,17362,17362,0,17362,17362,17362
addtocart,8,14825,14825,14825,0,14825,14825,14825
addtocart,9,7681,7681,7681,0,7681,7681,7681
transaction,5,4611,4611,4611,4611,4611,4611,4611
transaction,6,5043,5043,5043,5043,5043,5043,5043
transaction,7,5802,5802,5802,5802,5802,5802,5802
transaction,8,4632,4632,4632,4632,4632,4632,4632
transaction,9,2369,2369,2369,2369,2369,2369,2369


In [19]:
# Records per (event, day of month); each column holds that group's non-null count.
web_df.groupby(["event", "day"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,visitorid,itemid,transactionid,year,month,hour
event,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
addtocart,1,1830,1830,1830,0,1830,1830,1830
addtocart,2,1775,1775,1775,0,1775,1775,1775
addtocart,3,2390,2390,2390,0,2390,2390,2390
addtocart,4,2590,2590,2590,0,2590,2590,2590
addtocart,5,2431,2431,2431,0,2431,2431,2431
...,...,...,...,...,...,...,...,...
view,27,82689,82689,82689,0,82689,82689,82689
view,28,77851,77851,77851,0,77851,77851,77851
view,29,78108,78108,78108,0,78108,78108,78108
view,30,71970,71970,71970,0,71970,71970,71970


### Round datetimes by hour, aggregate, and see what insights you can discover.

In [22]:
# Snap every timestamp to the nearest whole hour.
# The lowercase "h" alias replaces "H", which is deprecated since pandas 2.2 and
# emits a FutureWarning; the rounded values are the same.
# Note: this is round-to-nearest, so events past the half hour land in the
# *next* hour bucket (use .dt.floor("h") if truncation is wanted instead).
web_df["timestamp"] = web_df["timestamp"].dt.round("h")

In [24]:
# Refresh the hour column so it reflects the hour-rounded timestamps.
web_df["hour"] = web_df["timestamp"].dt.hour

In [25]:
# Records per (event, hour of day); each column holds that group's non-null count.
web_df.groupby(["event", "hour"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,visitorid,itemid,transactionid,year,month,day
event,hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
addtocart,0,3837,3837,3837,0,3837,3837,3837
addtocart,1,3714,3714,3714,0,3714,3714,3714
addtocart,2,3269,3269,3269,0,3269,3269,3269
addtocart,3,3393,3393,3393,0,3393,3393,3393
addtocart,4,3219,3219,3219,0,3219,3219,3219
...,...,...,...,...,...,...,...,...
view,19,174616,174616,174616,0,174616,174616,174616
view,20,179152,179152,179152,0,179152,179152,179152
view,21,179811,179811,179811,0,179811,179811,179811
view,22,174466,174466,174466,0,174466,174466,174466


### Load the life_expectancy.csv data set into a Pandas dataframe.

In [26]:
# Pull the life-expectancy CSV from its hosted location into a frame.
life_df = pd.read_csv(
    filepath_or_buffer='https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/life_expectancy.csv',
)

In [27]:
# List every column with its non-null count and dtype to see where data is missing.
life_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 61 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    264 non-null    object 
 1   Country Code    264 non-null    object 
 2   Indicator Name  264 non-null    object 
 3   Indicator Code  264 non-null    object 
 4   1960            235 non-null    float64
 5   1961            236 non-null    float64
 6   1962            236 non-null    float64
 7   1963            235 non-null    float64
 8   1964            235 non-null    float64
 9   1965            236 non-null    float64
 10  1966            236 non-null    float64
 11  1967            236 non-null    float64
 12  1968            236 non-null    float64
 13  1969            236 non-null    float64
 14  1970            237 non-null    float64
 15  1971            236 non-null    float64
 16  1972            236 non-null    float64
 17  1973            237 non-null    flo

In [28]:
# Preview the first five rows of the wide (one column per year) layout.
life_df.head(n=5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Aruba,ABW,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,65.662,66.074,66.444,66.787,67.113,67.435,67.762,68.095,68.436,68.784,69.14,69.498,69.851,70.191,70.519,70.833,71.14,71.441,71.736,72.023,72.293,72.538,72.751,72.929,73.071,73.181,73.262,73.325,73.378,73.425,73.468,73.509,73.544,73.573,73.598,73.622,73.646,73.671,73.7,73.738,73.787,73.853,73.937,74.038,74.156,74.287,74.429,74.576,74.725,74.872,75.016,75.158,75.299,75.44,75.582,75.725,75.867
1,Afghanistan,AFG,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,32.292,32.742,33.185,33.624,34.06,34.495,34.928,35.361,35.796,36.234,36.678,37.128,37.587,38.056,38.54,39.039,39.556,40.092,40.65,41.234,41.853,42.513,43.217,43.963,44.747,45.566,46.417,47.288,48.164,49.028,49.856,50.627,51.331,51.968,52.539,53.055,53.533,53.997,54.468,54.959,55.482,56.044,56.637,57.25,57.875,58.5,59.11,59.694,60.243,60.754,61.226,61.666,62.086,62.494,62.895,63.288,63.673
2,Angola,AGO,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,33.251,33.573,33.914,34.272,34.645,35.031,35.426,35.828,36.234,36.64,37.047,37.46,37.878,38.297,38.712,39.11,39.478,39.81,40.099,40.344,40.546,40.71,40.848,40.97,41.085,41.193,41.292,41.382,41.471,41.572,41.696,41.855,42.06,42.329,42.677,43.125,43.695,44.385,45.192,46.105,47.113,48.2,49.341,50.508,51.676,52.833,53.974,55.096,56.189,57.231,58.192,59.042,59.77,60.373,60.858,61.241,61.547
3,Albania,ALB,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,62.279,63.298,64.187,64.911,65.461,65.848,66.108,66.302,66.485,66.687,66.933,67.235,67.58,67.951,68.341,68.734,69.108,69.447,69.741,69.99,70.207,70.416,70.635,70.876,71.134,71.388,71.605,71.76,71.843,71.86,71.836,71.803,71.802,71.86,71.992,72.205,72.495,72.838,73.208,73.588,73.955,74.286,74.575,74.82,75.028,75.217,75.418,75.656,75.943,76.281,76.652,77.031,77.389,77.702,77.963,78.174,78.345
4,Andorra,AND,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Transform/melt the data so that the years are listed in a single column instead of separate columns.

In [29]:
# Inspect the column labels: four identifier columns followed by one column per year.
life_df.columns

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016'],
      dtype='object')

In [30]:
# Identifier columns that stay fixed while the year columns are unpivoted.
ids = [
    "Country Name",
    "Country Code",
    "Indicator Name",
    "Indicator Code",
]

In [39]:
# Everything that is not an identifier column is a year column to be melted.
melt_cols = life_df.drop(columns=ids).columns.tolist()

In [41]:
# Unpivot wide -> long: the former year column labels go into "Year" and the
# cell contents into "Value", with the identifier columns repeated on each row.
life_df2 = life_df.melt(
    id_vars=ids,
    value_vars=melt_cols,
    var_name="Year",
    value_name="Value",
)

In [42]:
# Display the melted frame (pandas elides the middle rows): one row per country/year pair.
life_df2

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Value
0,Aruba,ABW,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,65.662000
1,Afghanistan,AFG,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,32.292000
2,Angola,AGO,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,33.251000
3,Albania,ALB,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,62.279000
4,Andorra,AND,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,
...,...,...,...,...,...,...
15043,Kosovo,XKX,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,2016,71.646341
15044,"Yemen, Rep.",YEM,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,2016,64.953000
15045,South Africa,ZAF,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,2016,62.774000
15046,Zambia,ZMB,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,2016,61.874000


### Practice addressing missing values for countries using the different approaches (imputation, interpolation, and deletion).

In [43]:
# Count the missing entries in each column of the melted frame.
life_df2.isna().sum()

Country Name         0
Country Code         0
Indicator Name       0
Indicator Code       0
Year                 0
Value             1301
dtype: int64

#### Imputation

In [44]:
# Mean imputation: every missing Value is replaced by the mean of the whole
# column (all countries and years pooled). Returns a new Series; life_df2 is
# not modified.
life_df2["Value"].fillna(life_df2["Value"].mean())

0        65.662000
1        32.292000
2        33.251000
3        62.279000
4        63.544406
           ...    
15043    71.646341
15044    64.953000
15045    62.774000
15046    61.874000
15047    61.163000
Name: Value, Length: 15048, dtype: float64

In [45]:
# Median imputation: every missing Value is replaced by the median of the whole
# column (all countries and years pooled). Returns a new Series; life_df2 is
# not modified.
life_df2["Value"].fillna(life_df2["Value"].median())

0        65.662000
1        32.292000
2        33.251000
3        62.279000
4        66.328000
           ...    
15043    71.646341
15044    64.953000
15045    62.774000
15046    61.874000
15047    61.163000
Name: Value, Length: 15048, dtype: float64

#### Interpolation

In [50]:
# Forward fill: each gap takes the last non-missing value above it in row order.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated since
# pandas 2.1; the filled values are the same.
# NOTE(review): the melted rows are ordered by year and then country, so a gap
# is filled from the preceding *row* (a different country in the same year),
# not from the same country's previous year. Consider
# life_df2.groupby("Country Code")["Value"].ffill() for a per-country fill.
forward_value = life_df2["Value"].ffill()
forward_value

0        65.662000
1        32.292000
2        33.251000
3        62.279000
4        62.279000
           ...    
15043    71.646341
15044    64.953000
15045    62.774000
15046    61.874000
15047    61.163000
Name: Value, Length: 15048, dtype: float64

In [52]:
# Backward fill: each gap takes the next non-missing value below it in row order.
# Series.bfill() replaces fillna(method='bfill'), which is deprecated since
# pandas 2.1; the filled values are the same.
# NOTE(review): the melted rows are ordered by year and then country, so a gap
# is filled from the following *row* (a different country in the same year),
# not from the same country's next year. Consider
# life_df2.groupby("Country Code")["Value"].bfill() for a per-country fill.
back_value = life_df2["Value"].bfill()
back_value

0        65.662000
1        32.292000
2        33.251000
3        62.279000
4        46.825065
           ...    
15043    71.646341
15044    64.953000
15045    62.774000
15046    61.874000
15047    61.163000
Name: Value, Length: 15048, dtype: float64

In [53]:
# Midpoint of the forward- and backward-filled series. Where the original value
# was present both fills agree, so only the former gaps are actually averaged.
smooth_value = forward_value.add(back_value).div(2)
smooth_value

0        65.662000
1        32.292000
2        33.251000
3        62.279000
4        54.552032
           ...    
15043    71.646341
15044    64.953000
15045    62.774000
15046    61.874000
15047    61.163000
Name: Value, Length: 15048, dtype: float64

#### Deletion

In [55]:
# Deletion: discard the missing entries outright. Returns a new, shorter Series
# that keeps the original index labels; life_df2 is not modified.
life_df2["Value"].dropna()

0        65.662000
1        32.292000
2        33.251000
3        62.279000
5        46.825065
           ...    
15043    71.646341
15044    64.953000
15045    62.774000
15046    61.874000
15047    61.163000
Name: Value, Length: 13747, dtype: float64