In [1]:
import pandas

In [2]:
# The gapminder file is tab-separated (.tsv), so tell read_csv to split on tabs.
df = pandas.read_csv('./gapminder.tsv', sep='\t')

In [4]:
df.head(50)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
5,Afghanistan,Asia,1977,38.438,14880372,786.11336
6,Afghanistan,Asia,1982,39.854,12881816,978.011439
7,Afghanistan,Asia,1987,40.822,13867957,852.395945
8,Afghanistan,Asia,1992,41.674,16317921,649.341395
9,Afghanistan,Asia,1997,41.763,22227415,635.341351


In [5]:
df.shape

(1704, 6)

In [6]:
df.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [7]:
df.index

RangeIndex(start=0, stop=1704, step=1)

In [8]:
df.values # the data as a NumPy array; dtype is object here because the columns hold mixed types (strings, ints, floats)

array([['Afghanistan', 'Asia', 1952, 28.801, 8425333, 779.4453145],
       ['Afghanistan', 'Asia', 1957, 30.332, 9240934, 820.8530296],
       ['Afghanistan', 'Asia', 1962, 31.997, 10267083, 853.10071],
       ...,
       ['Zimbabwe', 'Africa', 1997, 46.809, 11404948, 792.4499603],
       ['Zimbabwe', 'Africa', 2002, 39.989, 11926563, 672.0386227],
       ['Zimbabwe', 'Africa', 2007, 43.487, 12311143, 469.7092981]],
      dtype=object)

In [10]:
type(df) # Useful when debugging an error: it tells you what kind of object you are actually working with

pandas.core.frame.DataFrame

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [14]:
df['country']

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object

In [15]:
country_df = df['country']

In [16]:
country_df.shape

(1704,)

In [18]:
country_df.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1704 entries, 0 to 1703
Series name: country
Non-Null Count  Dtype 
--------------  ----- 
1704 non-null   object
dtypes: object(1)
memory usage: 13.4+ KB


In [20]:
type(country_df) # A Series is a single column of a DataFrame, similar to a labelled one-dimensional NumPy array (note: despite its `_df` suffix, country_df is a Series, not a DataFrame)

pandas.core.series.Series

In [21]:
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


In [28]:
#How to subset columns

In [22]:
subset = df[['country', 'year', 'pop']]

In [24]:
subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   country  1704 non-null   object
 1   year     1704 non-null   int64 
 2   pop      1704 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 40.1+ KB


In [25]:
subset.head()

Unnamed: 0,country,year,pop
0,Afghanistan,1952,8425333
1,Afghanistan,1957,9240934
2,Afghanistan,1962,10267083
3,Afghanistan,1967,11537966
4,Afghanistan,1972,13079460


In [27]:
pandas.__version__ # check the installed pandas version (this is the library version, not the Python version)

'1.4.4'

In [29]:
#How to subset rows 

In [37]:
#Using .loc and .iloc

In [36]:
#Using LOC

In [33]:
df.loc[3] # .loc selects by index label; if the label 3 occurred more than once (e.g. after concatenating frames without resetting the index), every row carrying that label would be returned

country      Afghanistan
continent           Asia
year                1967
lifeExp            34.02
pop             11537966
gdpPercap     836.197138
Name: 3, dtype: object

In [35]:
df.loc[[2, 0]] # .loc matches index labels, so this returns the rows labelled 2 and 0, in the order requested

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
0,Afghanistan,Asia,1952,28.801,8425333,779.445314


In [None]:
# Using ILOC. The "i" stands for integer: .iloc selects rows by integer position (counting from 0), whereas .loc selects rows by index label. Imagine a dataframe whose rows are not in index order (e.g. after sorting or shuffling): df.iloc[2] returns whichever row is currently third from the top, while df.loc[2] returns the row whose index label is 2, wherever it sits.

In [38]:
df.iloc[2] # same row as df.loc[2] here, because the index is the default 0, 1, 2, ... in order; if the rows were sorted or shuffled, .iloc[2] would still return the third row by position

country      Afghanistan
continent           Asia
year                1962
lifeExp           31.997
pop             10267083
gdpPercap      853.10071
Name: 2, dtype: object

In [None]:
# How to subset rows and column together

In [40]:
# Let's say we want to select a subset of columns and also filter the rows.

In [41]:
# In df.loc[rows, columns], a bare ":" keeps every row; the list picks the columns (and their order).
subset = df.loc[:, ['country', 'pop', 'year']]

In [42]:
subset.head()

Unnamed: 0,country,pop,year
0,Afghanistan,8425333,1952
1,Afghanistan,9240934,1957
2,Afghanistan,10267083,1962
3,Afghanistan,11537966,1967
4,Afghanistan,13079460,1972


In [55]:
# Let's find the rows for the country Nigeria and show the country, year and population. In df.loc[rows, columns], the row selector goes to the left of the comma and the column selector to the right.

In [52]:
df.loc[df['country'] == 'Nigeria', ['country', 'year', 'pop']]

Unnamed: 0,country,year,pop
1128,Nigeria,1952,33119096
1129,Nigeria,1957,37173340
1130,Nigeria,1962,41871351
1131,Nigeria,1967,47287752
1132,Nigeria,1972,53740085
1133,Nigeria,1977,62209173
1134,Nigeria,1982,73039376
1135,Nigeria,1987,81551520
1136,Nigeria,1992,93364244
1137,Nigeria,1997,106207839


In [53]:
 Nigeria = df.loc[df['country'] == 'Nigeria', ['country', 'year', 'pop']]

In [54]:
Nigeria.head()

Unnamed: 0,country,year,pop
1128,Nigeria,1952,33119096
1129,Nigeria,1957,37173340
1130,Nigeria,1962,41871351
1131,Nigeria,1967,47287752
1132,Nigeria,1972,53740085


In [60]:
df.loc[df['country'] == 'Ghana', ['country', 'year', 'pop']]

Unnamed: 0,country,year,pop
576,Ghana,1952,5581001
577,Ghana,1957,6391288
578,Ghana,1962,7355248
579,Ghana,1967,8490213
580,Ghana,1972,9354120
581,Ghana,1977,10538093
582,Ghana,1982,11400338
583,Ghana,1987,14168101
584,Ghana,1992,16278738
585,Ghana,1997,18418288


In [None]:
# For multiple conditions: wrap each condition in parentheses and combine them with & (and) or | (or)

In [68]:
df.loc[ (df['year'] == 1967) & (df['pop'] > 1_000_000), ['year'] ] 

Unnamed: 0,year
3,1967
15,1967
27,1967
39,1967
51,1967
...,...
1647,1967
1659,1967
1671,1967
1683,1967


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [74]:
df.loc[ (df['year'] == 1967) & (df['pop'] > 1_000_000), 
        ['country', 'pop', 'lifeExp'] ]  # Same two-condition row filter, now returning several columns

Unnamed: 0,country,pop,lifeExp
3,Afghanistan,11537966,34.020
15,Albania,1984060,66.220
27,Algeria,12760499,51.407
39,Angola,5247469,35.985
51,Argentina,22934225,65.634
...,...,...,...
1647,Vietnam,39463910,47.838
1659,West Bank and Gaza,1142636,51.631
1671,"Yemen, Rep.",6740785,36.984
1683,Zambia,3900000,47.768
