### PANDAS: 
High-level data manipulation package which was built on top of Numpy and Matplotlib. The key structures within pandas include Pandas Series and Pandas DataFrames

In [733]:
# Import packages
import pandas as pd
import numpy as np

In [734]:
# creating a series out of a list
x = pd.Series([10,20,30,40,50])
x.index=["A", "B", "C", "D", "E"]
x

A    10
B    20
C    30
D    40
E    50
dtype: int64

In [735]:
# Creating a series with label-based index
data = [450,650,870]
Sales = pd.Series(data,index=['Don','Mike','Edwin'])
Sales

Don      450
Mike     650
Edwin    870
dtype: int64

In [736]:
# Accesing the Index
print(Sales.index)  

Index(['Don', 'Mike', 'Edwin'], dtype='object')


In [737]:
# Accessing the values
print("values are {}".format(Sales.values))
print("data type stored in series is {}".format(Sales.dtype))

values are [450 650 870]
data type stored in series is int64


In [738]:
# Accessing values using index name
print(Sales)

"Mike's sales {}".format(Sales['Mike'])

Don      450
Mike     650
Edwin    870
dtype: int64


"Mike's sales 650"

In [739]:
# Accessing values using a positional index for 'mike'
Sales[1]

650

#### Checking for conditions and filtering the data


In [740]:
Sales

Don      450
Mike     650
Edwin    870
dtype: int64

In [741]:
# We can filter our data based on condtions we specify, we can use booleans to do this
# if we want sales > 500:
Sales> 500   # Returns boolean

Don      False
Mike      True
Edwin     True
dtype: bool

In [742]:
Sales[[False,True,True]]

Mike     650
Edwin    870
dtype: int64

Working with Dictionaries

In [743]:
# Converting Series to Dictionaries
sales_dict = Sales.to_dict()
sales_dict

{'Don': 450, 'Mike': 650, 'Edwin': 870}

In [744]:
# Convertinf Dictionaries to Pandas Series
sales_ser = pd.Series(sales_dict)
sales_ser

Don      450
Mike     650
Edwin    870
dtype: int64

In [745]:
# Adding entries and working with NaN/null values

new_sales = pd.Series(Sales, index = ['Don', 'Mike', 'Sally', 'Edwin', 'Lucy'])
new_sales

Don      450.0
Mike     650.0
Sally      NaN
Edwin    870.0
Lucy       NaN
dtype: float64

In [746]:
# Checking if entries are NaN - using pandas
pd.isna(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

In [747]:
# checking for null values using pandas
pd.isna(new_sales['Sally'])

True

In [748]:
new_sales[pd.isna(new_sales)]

Sally   NaN
Lucy    NaN
dtype: float64

In [749]:
# Which values/ data is not null 
new_sales[pd.notna(new_sales)]

Don      450.0
Mike     650.0
Edwin    870.0
dtype: float64

#### Pandas DataFrames
Two-dimentional, size-mutable, potentially heterogeneous(diverse) tabular data structure

In [750]:
# Creating a DataFrame from a dictionary
new_dict = {
    'Name':['Tom','Jane','Steve','Lucy'],
    'Sales':[250,500,350,400],
    'Date': [2022,2020,2021,2022]}
df = pd.DataFrame(new_dict)
df

Unnamed: 0,Name,Sales,Date
0,Tom,250,2022
1,Jane,500,2020
2,Steve,350,2021
3,Lucy,400,2022


In [751]:
# Adding custom index 
df_index = pd.DataFrame(new_dict, index=['rank1','rank2','rank3','rank4'])
df_index

Unnamed: 0,Name,Sales,Date
rank1,Tom,250,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


In [752]:
# Naming the index column
df.index.name='Rank'
df

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Tom,250,2022
1,Jane,500,2020
2,Steve,350,2021
3,Lucy,400,2022


In [753]:
# Reset the index column and made it part of the data as a new column
df.reset_index()

Unnamed: 0,Rank,Name,Sales,Date
0,0,Tom,250,2022
1,1,Jane,500,2020
2,2,Steve,350,2021
3,3,Lucy,400,2022


In [754]:
df.columns

Index(['Name', 'Sales', 'Date'], dtype='object')

In [755]:
df.index

RangeIndex(start=0, stop=4, step=1, name='Rank')

In [756]:
df.values

array([['Tom', 250, 2022],
       ['Jane', 500, 2020],
       ['Steve', 350, 2021],
       ['Lucy', 400, 2022]], dtype=object)

In [757]:
df = pd.DataFrame(new_dict)
# Index values
df.index=['rank1','rank2','rank3','rank4']
df.index.name='Rank'
df

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


In [758]:
new_dict_v2 = {
    'Name':['Tom','Jane','Steve','Lucy'],
    'Sales':[250,500,350,400],
    'Date': [2022,2020,2021,2022],
    'Rank': ['rank1','rank2','rank3','rank4']}
df2=pd.DataFrame(new_dict_v2)
df2

Unnamed: 0,Name,Sales,Date,Rank
0,Tom,250,2022,rank1
1,Jane,500,2020,rank2
2,Steve,350,2021,rank3
3,Lucy,400,2022,rank4


In [759]:
# Assigning the rank coloumn as an index
df2.set_index("Rank")

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


In [760]:
# Multi level indexes (hierachical indexes)
# Setting 'RANK' and 'Name' as my index
df2.set_index(["Rank", "Name"], inplace=True)


In [761]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,Date
Rank,Name,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


In [762]:
# Sort the table according to name first and then Rank index
df2.sort_index(level=["Name", "Rank"], ascending=[False,True])

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,Date
Rank,Name,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank3,Steve,350,2021
rank4,Lucy,400,2022
rank2,Jane,500,2020


#### Subsetting DataFrame
it is important to access column, rows and single elements in your DataFrame easily

- square brackets
- Avance Method: loc, iloc

In [763]:
df

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


In [764]:
df['Name']  # data series

Rank
rank1      Tom
rank2     Jane
rank3    Steve
rank4     Lucy
Name: Name, dtype: object

In [765]:
df[['Name']]  # Now in a dataa frame

Unnamed: 0_level_0,Name
Rank,Unnamed: 1_level_1
rank1,Tom
rank2,Jane
rank3,Steve
rank4,Lucy


In [766]:
df[['Name', 'Sales']]

Unnamed: 0_level_0,Name,Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
rank1,Tom,250
rank2,Jane,500
rank3,Steve,350
rank4,Lucy,400


In [767]:
# quick access of different columns
df.Name

Rank
rank1      Tom
rank2     Jane
rank3    Steve
rank4     Lucy
Name: Name, dtype: object

In [768]:
# Row Access with []
df[1:3]

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank2,Jane,500,2020
rank3,Steve,350,2021


In [769]:
df[df["Sales"]>300]

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


In [770]:
# Looking for more conditions
df[(df["Sales"]>300) & (df["Date"]>2020)]

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank3,Steve,350,2021
rank4,Lucy,400,2022


isin()

In [771]:
df.Date.isin([2020,2022])

Rank
rank1     True
rank2     True
rank3    False
rank4     True
Name: Date, dtype: bool

In [772]:
# Only looking for data within 2020 and 2022
df[df["Date"].isin([2020,2022])]

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank2,Jane,500,2020
rank4,Lucy,400,2022


loc - Label based access

- loc[row_label,column_lable]
- iloc[row_postion, column_position]

In [773]:
df

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


In [774]:
df.loc["rank1":"rank2",["Name","Sales"]]

Unnamed: 0_level_0,Name,Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
rank1,Tom,250
rank2,Jane,500


In [775]:
#df.loc[df.sales>300,["Name","Sales"]]

#### Summary Statistic and Groupby

In [776]:
df = pd.read_csv("gapminder.csv")

In [777]:
df

Unnamed: 0.1,Unnamed: 0,country,year,population,cont,life_exp,gdp_cap
0,11,Afghanistan,2007,31889923.0,Asia,43.828,974.580338
1,23,Albania,2007,3600523.0,Europe,76.423,5937.029526
2,35,Algeria,2007,33333216.0,Africa,72.301,6223.367465
3,47,Angola,2007,12420476.0,Africa,42.731,4797.231267
4,59,Argentina,2007,40301927.0,Americas,75.320,12779.379640
...,...,...,...,...,...,...,...
137,1655,Vietnam,2007,85262356.0,Asia,74.249,2441.576404
138,1667,West Bank and Gaza,2007,4018332.0,Asia,73.422,3025.349798
139,1679,"Yemen, Rep.",2007,22211743.0,Asia,62.698,2280.769906
140,1691,Zambia,2007,11746035.0,Africa,42.384,1271.211593


In [778]:
# Max population
df["population"].max()


1318683096.0

In [779]:
# Min population
df["population"].min()


199579.0

In [780]:
# Mean
df["population"].mean()

44021219.57042254

In [781]:
df[df["population"]==df["population"].min()]

Unnamed: 0.1,Unnamed: 0,country,year,population,cont,life_exp,gdp_cap
108,1307,Sao Tome and Principe,2007,199579.0,Africa,65.528,1598.435089


In [787]:
# Group by cont 
df.groupby(["cont"])[["gdp_cap","population","life_exp","year"]].sum()


Unnamed: 0_level_0,gdp_cap,population,life_exp,year
cont,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,160629.695446,929539700.0,2849.914,104364
Americas,275075.790634,898871200.0,1840.203,50175
Asia,411609.886714,3811954000.0,2334.04,66231
Europe,751634.449078,586098500.0,2329.458,60210
Oceania,59620.37655,24549950.0,161.439,4014


In [793]:
df.groupby(["year"])[["gdp_cap","population","life_exp"]].mean()


Unnamed: 0_level_0,gdp_cap,population,life_exp
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2007,11680.07182,44021220.0,67.007423


In [790]:
df["cont"].value_counts(sort=True,normalize=True)

Africa      0.366197
Asia        0.232394
Europe      0.211268
Americas    0.176056
Oceania     0.014085
Name: cont, dtype: float64

In [794]:
# tells us the size of the data --> 142 rows and 7 coloumns
df.shape

(142, 7)

In [796]:
# Gives us an overview of the data - here we can see no null value
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  142 non-null    int64  
 1   country     142 non-null    object 
 2   year        142 non-null    int64  
 3   population  142 non-null    float64
 4   cont        142 non-null    object 
 5   life_exp    142 non-null    float64
 6   gdp_cap     142 non-null    float64
dtypes: float64(3), int64(2), object(2)
memory usage: 7.9+ KB


In [798]:
# Shows statistic of the data
df.describe()

Unnamed: 0.1,Unnamed: 0,year,population,life_exp,gdp_cap
count,142.0,142.0,142.0,142.0,142.0
mean,857.0,2007.0,44021220.0,67.007423,11680.07182
std,493.631441,0.0,147621400.0,12.073021,12859.937337
min,11.0,2007.0,199579.0,39.613,277.551859
25%,434.0,2007.0,4508034.0,57.16025,1624.842248
50%,857.0,2007.0,10517530.0,71.9355,6124.371108
75%,1280.0,2007.0,31210040.0,76.41325,18008.83564
max,1703.0,2007.0,1318683000.0,82.603,49357.19017


#### Creating a DataFrame from Dictionary of series

In [799]:
# If we have series we want to put into a DataFrame, we can easily combine them together
# If we wanted a DataFrame from a single series, we can do that by passing in the single series 
east = pd.Series([1000,1200,3400],index=['Q1','Q2','Q3'])
west = pd.Series([1100,1300,2400,3500],index=['Q1','Q2','Q3','Q4'])
df_region = pd.DataFrame({'East':east,'West':west})
df_region

Unnamed: 0,East,West
Q1,1000.0,1100
Q2,1200.0,1300
Q3,3400.0,2400
Q4,,3500


#### Size mutability

In [800]:
# Once we have a DataFrame, we can easily add Series on
df_region['North'] = [2000,3000,2500,4000]
df_region['South'] = [1500,2000,1500,4000]
df_region

Unnamed: 0,East,West,North,South
Q1,1000.0,1100,2000,1500
Q2,1200.0,1300,3000,2000
Q3,3400.0,2400,2500,1500
Q4,,3500,4000,4000


In [801]:
# creating a new coloumn --> 'years
years = ['2016','2017','2018','2019']
df_region['years'] = years
df_region

Unnamed: 0,East,West,North,South,years
Q1,1000.0,1100,2000,1500,2016
Q2,1200.0,1300,3000,2000,2017
Q3,3400.0,2400,2500,1500,2018
Q4,,3500,4000,4000,2019


In [802]:
# Setting a year coloumn as our index
#  We can use set_index to set the index to a different column in the DataFrame
df_region = df_region.set_index('years')
df_region


Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,1000.0,1100,2000,1500
2017,1200.0,1300,3000,2000
2018,3400.0,2400,2500,1500
2019,,3500,4000,4000


### Reindexing

In [803]:
# Let's say we want to see different index values, we can use reindex
# reindex will shift our index
new_df = df_region.reindex(['2017','2018','2019','2020','2021'])
new_df


Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,1200.0,1300.0,3000.0,2000.0
2018,3400.0,2400.0,2500.0,1500.0
2019,,3500.0,4000.0,4000.0
2020,,,,
2021,,,,


In [804]:
# reindex can also be used on columns
# We can shift our columns, or add new ones if we add a name that was not present before
re_indexed = new_df.reindex(columns=['North','East','South','New'])
re_indexed

Unnamed: 0_level_0,North,East,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,3000.0,1200.0,2000.0,
2018,2500.0,3400.0,1500.0,
2019,4000.0,,4000.0,
2020,,,,
2021,,,,
