### Phone Data Pandas Groupby and Aggregation Practice

In [2]:
import pandas as pd
import dateutil
# DataFrame.from_csv has been removed from pandas; read_csv with index_col=0 is
# its replacement and keeps the first CSV column as the row index.
data = pd.read_csv('phone_data.csv', index_col=0)

In [3]:
# Preview the first three rows of the freshly loaded frame.
data.head(3)

Unnamed: 0_level_0,date,duration,item,month,network,network_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,15/10/14 06:58,34.429,data,2014-11,data,data
1,15/10/14 06:58,13.0,call,2014-11,Vodafone,mobile
2,15/10/14 14:46,23.0,call,2014-11,Meteor,mobile


In [4]:
# Parse the day-first date strings (e.g. "15/10/14 06:58") into datetime64 values.
# pd.to_datetime is vectorised and, unlike a row-wise dateutil parse, can be
# re-run safely on a column that has already been converted.
data['date'] = pd.to_datetime(data['date'], dayfirst=True)

In [5]:
# Re-inspect the first rows now that the `date` column has been parsed.
data.head()

Unnamed: 0_level_0,date,duration,item,month,network,network_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2014-10-15 06:58:00,34.429,data,2014-11,data,data
1,2014-10-15 06:58:00,13.0,call,2014-11,Vodafone,mobile
2,2014-10-15 14:46:00,23.0,call,2014-11,Meteor,mobile
3,2014-10-15 14:48:00,4.0,call,2014-11,Tesco,mobile
4,2014-10-15 17:27:00,4.0,call,2014-11,Tesco,mobile


### What is the average call duration per network?

In [6]:
# Keep only the call records, then average their duration within each network.
data.loc[data["item"] == "call"].groupby("network")["duration"].mean()

network
Meteor       133.333333
Tesco        194.760563
Three        284.875000
Vodafone     221.530303
landline     438.880952
voicemail     65.740741
Name: duration, dtype: float64

In [7]:
# Average duration for every (network, item) pair -- this covers sms and data
# rows as well as calls, not calls only.
import numpy as np  # left in place: np is no longer needed here, but other cells may rely on it
# The 'mean' string alias dispatches to pandas' own groupby mean; recent pandas
# versions emit a deprecation warning when handed the np.mean callable instead.
# The list form keeps the two-level (duration, mean) column header.
avg = data.groupby(by=['network', 'item']).agg({'duration': ['mean']})
avg

Unnamed: 0_level_0,Unnamed: 1_level_0,duration
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
network,item,Unnamed: 2_level_2
Meteor,call,133.333333
Meteor,sms,1.0
Tesco,call,194.760563
Tesco,sms,1.0
Three,call,284.875
Three,sms,1.0
Vodafone,call,221.530303
Vodafone,sms,1.0
data,data,34.429
landline,call,438.880952


In [9]:
# Largest single `duration` value across all entries, divided by 60 twice
# (seconds -> hours, assuming the column holds seconds -- TODO confirm for data rows).
data["duration"].max() / 60.0 / 60

2.9244444444444446

### How many seconds of phone calls are recorded in total?

In [10]:
# Total duration of all call records (seconds of phone calls recorded in total).
# A single .loc lookup replaces two equivalent chained-indexing expressions,
# the first of which was computed and then discarded.
data.loc[data['item'] == 'call', 'duration'].sum()

92321.0

In [11]:
# Summarise the index, column dtypes and non-null counts of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 830 entries, 0 to 829
Data columns (total 6 columns):
date            830 non-null datetime64[ns]
duration        830 non-null float64
item            830 non-null object
month           830 non-null object
network         830 non-null object
network_type    830 non-null object
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 45.4+ KB


In [12]:
# How many entries are there for each month?
# value_counts() tallies the rows per `month` label, most frequent label first.
data['month'].value_counts()

2014-11    230
2015-01    205
2014-12    157
2015-02    137
2015-03    101
Name: month, dtype: int64

In [13]:
# List the distinct `month` labels present in the data.
data['month'].unique()

array(['2014-11', '2014-12', '2015-01', '2015-02', '2015-03'], dtype=object)

In [14]:
# The same month labels, read off as the keys of the groupby's group mapping.
data.groupby(['month']).groups.keys()

['2014-12', '2014-11', '2015-02', '2015-03', '2015-01']

In [15]:
# Get the first entry for each month.
# With aggfunc='first', each remaining column contributes its first value within
# the month, following the frame's current row order (not sorted by date here).
data.pivot_table(index = ['month'], aggfunc = 'first')

Unnamed: 0_level_0,date,duration,item,network,network_type
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11,2014-10-15 06:58:00,34.429,data,data,data
2014-12,2014-11-13 06:58:00,34.429,data,data,data
2015-01,2014-12-13 06:58:00,34.429,data,data,data
2015-02,2015-01-13 06:58:00,34.429,data,data,data
2015-03,2015-02-12 20:15:00,69.0,call,landline,landline


In [16]:
# The same per-month "first entry" table, expressed as a plain groupby.
data.groupby('month').first()

Unnamed: 0_level_0,date,duration,item,network,network_type
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11,2014-10-15 06:58:00,34.429,data,data,data
2014-12,2014-11-13 06:58:00,34.429,data,data,data
2015-01,2014-12-13 06:58:00,34.429,data,data,data
2015-02,2015-01-13 06:58:00,34.429,data,data,data
2015-03,2015-02-12 20:15:00,69.0,call,landline,landline


In [17]:
# Show the first three rows of each month's group; rows keep their original index labels.
data.groupby(["month"]).head(3)

Unnamed: 0_level_0,date,duration,item,month,network,network_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2014-10-15 06:58:00,34.429,data,2014-11,data,data
1,2014-10-15 06:58:00,13.0,call,2014-11,Vodafone,mobile
2,2014-10-15 14:46:00,23.0,call,2014-11,Meteor,mobile
228,2014-11-13 06:58:00,34.429,data,2014-12,data,data
231,2014-11-14 06:58:00,34.429,data,2014-12,data,data
232,2014-11-14 17:24:00,124.0,call,2014-12,voicemail,voicemail
381,2014-12-13 06:58:00,34.429,data,2015-01,data,data
386,2014-12-14 06:58:00,34.429,data,2015-01,data,data
389,2014-12-15 06:58:00,34.429,data,2015-01,data,data
577,2015-01-13 06:58:00,34.429,data,2015-02,data,data


In [18]:
# First row of each month after ordering the frame chronologically.
# A stable sort (mergesort) keeps rows that share a timestamp in their original
# relative order, so the "first" row per month is deterministic; the default
# sort algorithm makes no such guarantee for ties.
data.sort_values(by='date', ascending=True, kind='mergesort').groupby(by=['month']).first()


Unnamed: 0_level_0,date,duration,item,network,network_type
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11,2014-10-15 06:58:00,34.429,data,data,data
2014-12,2014-11-13 06:58:00,34.429,data,data,data
2015-01,2014-12-13 06:58:00,34.429,data,data,data
2015-02,2015-01-13 06:58:00,34.429,data,data,data
2015-03,2015-02-12 20:15:00,69.0,call,landline,landline


In [19]:
# Earliest `date` value recorded under each month label.
data.groupby('month')["date"].min()

month
2014-11   2014-10-15 06:58:00
2014-12   2014-11-13 06:58:00
2015-01   2014-12-13 06:58:00
2015-02   2015-01-13 06:58:00
2015-03   2015-02-12 20:15:00
Name: date, dtype: datetime64[ns]

In [20]:
# Total call duration per network: keep only the call rows, group them by
# network and add up their durations.
data.loc[data['item'].eq('call')].groupby('network')['duration'].sum()


network
Meteor        7200.0
Tesco        13828.0
Three        36464.0
Vodafone     14621.0
landline     18433.0
voicemail     1775.0
Name: duration, dtype: float64

In [21]:
# How many calls, texts, and data are sent per month, split by network_type?
# Counts the non-null `date` values in every (month, network_type) group,
# i.e. the number of entries per pair; item types are not separated here.
data.groupby(['month', 'network_type'])['date'].count()

month    network_type
2014-11  data             29
         landline          5
         mobile          189
         special           1
         voicemail         6
2014-12  data             30
         landline          7
         mobile          108
         voicemail         8
         world             4
2015-01  data             31
         landline         11
         mobile          160
         voicemail         3
2015-02  data             31
         landline          8
         mobile           90
         special           2
         voicemail         6
2015-03  data             29
         landline         11
         mobile           54
         voicemail         4
         world             3
Name: date, dtype: int64

In [22]:
import numpy as np
# Per (month, item) group: total duration, number of entries and first date.
# The 'sum' string alias is used instead of the builtin sum; recent pandas
# versions emit a deprecation warning when handed the builtin callable.
data.groupby(['month', 'item']).agg({'duration': 'sum',        # total of the durations in each group
                                     'network_type': 'count',  # number of network_type entries
                                     'date': 'first'})         # first date value in each group

Unnamed: 0_level_0,Unnamed: 1_level_0,duration,date,network_type
month,item,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-11,call,25547.0,2014-10-15 06:58:00,107
2014-11,data,998.441,2014-10-15 06:58:00,29
2014-11,sms,94.0,2014-10-16 22:18:00,94
2014-12,call,13561.0,2014-11-14 17:24:00,79
2014-12,data,1032.87,2014-11-13 06:58:00,30
2014-12,sms,48.0,2014-11-14 17:28:00,48
2015-01,call,17070.0,2014-12-15 20:03:00,88
2015-01,data,1067.299,2014-12-13 06:58:00,31
2015-01,sms,86.0,2014-12-15 19:56:00,86
2015-02,call,14416.0,2015-01-15 10:36:00,67
