In [36]:
import pandas as pd
import numpy as np

In [37]:
# Load the gapminder data set from a CSV file in the working directory.
# NOTE(review): the preview of this frame shows "Unnamed: 0.1" / "Unnamed: 0" columns,
# which look like index columns written out by earlier saves; confirm whether to drop them.
df = pd.read_csv(filepath_or_buffer="gapminder.csv")

In [38]:
# Preview the first 10 rows of the frame
df.iloc[:10]

Unnamed: 0.1,Unnamed: 0,country,year,population,cont,life_exp,gdp_cap
0,11,Afghanistan,2007,31889923.0,Asia,43.828,974.580338
1,23,Albania,2007,3600523.0,Europe,76.423,5937.029526
2,35,Algeria,2007,33333216.0,Africa,72.301,6223.367465
3,47,Angola,2007,12420476.0,Africa,42.731,4797.231267
4,59,Argentina,2007,40301927.0,Americas,75.32,12779.37964
5,71,Australia,2007,20434176.0,Oceania,81.235,34435.36744
6,83,Austria,2007,8199783.0,Europe,79.829,36126.4927
7,95,Bahrain,2007,708573.0,Asia,75.635,29796.04834
8,107,Bangladesh,2007,150448339.0,Asia,64.062,1391.253792
9,119,Belgium,2007,10392226.0,Europe,79.441,33692.60508


In [39]:
# Bucket the rows by continent; displaying the GroupBy object only shows its repr,
# an aggregation is needed to get actual values out of it.
cont_wise = df.groupby(by="cont")
cont_wise

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000026D2E681B90>

In [40]:
# Total population per continent
cont_wise["population"].sum()

cont
Africa      9.295397e+08
Americas    8.988712e+08
Asia        3.811954e+09
Europe      5.860985e+08
Oceania     2.454995e+07
Name: population, dtype: float64

In [41]:
# Take the first two rows of every continent, then order them by
# continent (descending) and, within a continent, by country (ascending).
(
    cont_wise
    .head(2)
    .sort_values(by=["cont", "country"], ascending=[False, True])
)

Unnamed: 0.1,Unnamed: 0,country,year,population,cont,life_exp,gdp_cap
5,71,Australia,2007,20434176.0,Oceania,81.235,34435.36744
91,1103,New Zealand,2007,4115771.0,Oceania,80.204,25185.00911
1,23,Albania,2007,3600523.0,Europe,76.423,5937.029526
6,83,Austria,2007,8199783.0,Europe,79.829,36126.4927
0,11,Afghanistan,2007,31889923.0,Asia,43.828,974.580338
7,95,Bahrain,2007,708573.0,Asia,75.635,29796.04834
4,59,Argentina,2007,40301927.0,Americas,75.32,12779.37964
11,143,Bolivia,2007,9119152.0,Americas,65.554,3822.137084
2,35,Algeria,2007,33333216.0,Africa,72.301,6223.367465
3,47,Angola,2007,12420476.0,Africa,42.731,4797.231267


#### Reshaping
We can reshape our data by pivoting, stacking, and unstacking DataFrame objects.

In [42]:
# For this we will be using a different dataframe
import seaborn as sns

In [43]:
# Load seaborn's "flights" example data set (year / month / passengers)
# and preview its first rows.
flights = sns.load_dataset(name='flights')
flights.head(n=14)

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121
5,1949,Jun,135
6,1949,Jul,148
7,1949,Aug,148
8,1949,Sep,136
9,1949,Oct,119


In [47]:
# Group by year to get the total number of passengers per year.
# Selecting with [["passengers"]] keeps the result a one-column DataFrame.
fly = (
    flights
    .groupby(["year"])[["passengers"]]
    .sum()
)
fly


Unnamed: 0_level_0,passengers
year,Unnamed: 1_level_1
1949,1520
1950,1676
1951,2042
1952,2364
1953,2700
1954,2867
1955,3408
1956,3939
1957,4421
1958,4572


In [46]:
# Group by year and month: the result stays in long format, with one row per
# (year, month) pair and the summed passenger count as the only column.
# `month` appears to be a categorical column (the output is in calendar order), and
# recent pandas warns when grouping on categoricals without an explicit `observed`.
# observed=False spells out the current default, so the result is unchanged.
fly = (
    flights
    .groupby(["year", "month"], observed=False)[["passengers"]]
    .sum()
)
fly

Unnamed: 0_level_0,Unnamed: 1_level_0,passengers
year,month,Unnamed: 2_level_1
1949,Jan,112
1949,Feb,118
1949,Mar,132
1949,Apr,129
1949,May,121
...,...,...
1960,Aug,606
1960,Sep,508
1960,Oct,461
1960,Nov,390


In [56]:
# Reshape long -> wide: one row per year, one column per month, each cell holding
# the summed passenger count for that year/month.
flights_pivoted = pd.pivot_table(
    flights,
    values="passengers",  # column whose values fill the table
    index=["year"],       # becomes the row labels
    columns="month",      # becomes the column labels
    aggfunc="sum",        # string alias; passing np.sum is deprecated in recent pandas
    observed=False,       # explicit current default for the (apparently categorical) month
)
flights_pivoted

month,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1949,112,118,132,129,121,135,148,148,136,119,104,118
1950,115,126,141,135,125,149,170,170,158,133,114,140
1951,145,150,178,163,172,178,199,199,184,162,146,166
1952,171,180,193,181,183,218,230,242,209,191,172,194
1953,196,196,236,235,229,243,264,272,237,211,180,201
1954,204,188,235,227,234,264,302,293,259,229,203,229
1955,242,233,267,269,270,315,364,347,312,274,237,278
1956,284,277,317,313,318,374,413,405,355,306,271,306
1957,315,301,356,348,355,422,465,467,404,347,305,336
1958,340,318,362,348,363,435,491,505,404,359,310,337


In [61]:
# Reshape long -> wide without any aggregation: one row per year, one column per
# month, each cell taken straight from the `passengers` column.
flights_piv = flights.pivot(
    values="passengers",
    index="year",
    columns="month",
)
flights_piv

month,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1949,112,118,132,129,121,135,148,148,136,119,104,118
1950,115,126,141,135,125,149,170,170,158,133,114,140
1951,145,150,178,163,172,178,199,199,184,162,146,166
1952,171,180,193,181,183,218,230,242,209,191,172,194
1953,196,196,236,235,229,243,264,272,237,211,180,201
1954,204,188,235,227,234,264,302,293,259,229,203,229
1955,242,233,267,269,270,315,364,347,312,274,237,278
1956,284,277,317,313,318,374,413,405,355,306,271,306
1957,315,301,356,348,355,422,465,467,404,347,305,336
1958,340,318,362,348,363,435,491,505,404,359,310,337


In [63]:
# Same year x month passenger pivot as the earlier pivot_table cell, rebuilt here.
# aggfunc uses the "sum" string alias because passing np.sum is deprecated in recent
# pandas; observed=False spells out the current default for the (apparently
# categorical) month column, so the table is unchanged.
flights_pivoted = pd.pivot_table(
    flights,
    values="passengers",
    index=["year"],
    columns="month",
    aggfunc="sum",
    observed=False,
)
flights_pivoted


month,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1949,112,118,132,129,121,135,148,148,136,119,104,118
1950,115,126,141,135,125,149,170,170,158,133,114,140
1951,145,150,178,163,172,178,199,199,184,162,146,166
1952,171,180,193,181,183,218,230,242,209,191,172,194
1953,196,196,236,235,229,243,264,272,237,211,180,201
1954,204,188,235,227,234,264,302,293,259,229,203,229
1955,242,233,267,269,270,315,364,347,312,274,237,278
1956,284,277,317,313,318,374,413,405,355,306,271,306
1957,315,301,356,348,355,422,465,467,404,347,305,336
1958,340,318,362,348,363,435,491,505,404,359,310,337


In [None]:
# NOTE(review): this cell has no execution count and no output, so it was never run.
# As written it pivots `df` to month rows x product columns with summed values and
# draws a bar chart of the 'quantity' values. The previews of `df` in this notebook
# show no 'month', 'product' or 'quantity' columns, so it presumably targets a
# different data set; confirm the intended frame or remove the cell.
df.pivot_table(index='month', columns='product', aggfunc=np.sum ).plot(kind='bar', y='quantity',width = 0.8, edgecolor='black')  

More practice

In [64]:
# Small hand-made frame for pivot_table practice: three label columns (A, B, C)
# and two numeric columns (D, E), nine rows in total.
# NOTE(review): this rebinds `df`, replacing the frame previously held under that name.
df = pd.DataFrame(
    {
        "A": ["foo"] * 5 + ["bar"] * 4,
        "B": ["one"] * 3 + ["two"] * 2 + ["one"] * 2 + ["two"] * 2,
        "C": [
            "small", "large", "large",
            "small", "small", "large",
            "small", "small", "large",
        ],
        "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
        "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
    }
)
df

Unnamed: 0,A,B,C,D,E
0,foo,one,small,1,2
1,foo,one,large,2,4
2,foo,one,large,2,5
3,foo,two,small,3,5
4,foo,two,small,3,6
5,bar,one,large,4,6
6,bar,one,small,5,8
7,bar,two,small,6,9
8,bar,two,large,7,9


In [65]:
# Example: aggregate D by taking the sum for every (A, B) row label and C column label.
# Combinations that never occur in the data show up as NaN in the result.
# The "sum" string alias replaces np.sum, which recent pandas deprecates as an aggfunc.
table = pd.pivot_table(
    df,
    values="D",
    index=["A", "B"],
    columns=["C"],
    aggfunc="sum",
)
table

Unnamed: 0_level_0,C,large,small
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,5.0
bar,two,7.0,6.0
foo,one,4.0,1.0
foo,two,,6.0


In [66]:
# Example: aggregate several value columns at once, here the mean of D and of E
# for every (A, C) combination. The dict maps each value column to its aggregation.
# The "mean" string alias replaces np.mean, which recent pandas deprecates as an aggfunc.
table = pd.pivot_table(
    df,
    values=["D", "E"],
    index=["A", "C"],
    aggfunc={"D": "mean", "E": "mean"},
)
table

Unnamed: 0_level_0,Unnamed: 1_level_0,D,E
A,C,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,large,5.5,7.5
bar,small,5.5,8.5
foo,large,2.0,4.5
foo,small,2.333333,4.333333
