# Pandas and Matplotlib tutorial

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Let's start with Pandas Series

In [3]:
# Build a Series from ten random integers drawn from [1, 100).
# randint already returns an ndarray, so it is passed to pd.Series directly.
a = np.random.randint(low=1, high=100, size=10)
s = pd.Series(data=a)
# Show the Series (index + values), then only the underlying NumPy values
print(s)
print(s.values)

0     1
1    46
2    95
3    19
4    95
5    85
6    67
7    99
8    30
9     9
dtype: int64
[ 1 46 95 19 95 85 67 99 30  9]


In [4]:
# Attach custom index labels: ten random integers drawn from [400, 500).
# The labels are random, so duplicates are possible.
custom_index = np.random.randint(400, 500, 10)
s = pd.Series(a, index=custom_index)
print(s)

482     1
488    46
489    95
498    19
426    95
495    85
432    67
499    99
403    30
413     9
dtype: int64


In [5]:
# A dict maps directly onto a Series: keys become the index, values the data
dictionary = dict(h=456, g=392, c=980)
dict_series = pd.Series(dictionary)
print(dict_series)

h    456
g    392
c    980
dtype: int64


In [6]:
# Accessing elements in a Series
# By index label and by integer position

dict1 = {'Uttar Pradesh': 38332521,'Karnataka': 26448193,'Haryana': 19651127}
a = pd.Series(dict1)

# Label-based lookup
print(a['Karnataka'])

# Position-based lookup: use .iloc explicitly. A bare a[1] on a string-labelled
# Series relies on the deprecated "integer key falls back to position" behaviour.
print(a.iloc[1])

26448193
26448193


In [7]:
# Slicing a Series

s = pd.Series(range(10), index=list('abcdefghij'))

# First three elements, selected by position (the end position is excluded) ...
print(s.iloc[:3])
# ... and by label (the end label 'c' is included)
print(s.loc[:'c'])

# Last element, returned as a one-element Series
print(s.iloc[-1:])

# Everything except the last element
print(s.iloc[:-1])

a    0
b    1
c    2
dtype: int64
a    0
b    1
c    2
dtype: int64
j    9
dtype: int64
a    0
b    1
c    2
d    3
e    4
f    5
g    6
h    7
i    8
dtype: int64


Let's work with the Iris dataset

In [8]:
# Load the Iris dataset from a local CSV file (path is relative to the
# working directory).
# Alternative source, kept for reference and not executed: the raw UCI file,
# which was read with sep=',' and header=None:
#   http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
iris_csv_path = 'Iris.csv'
data = pd.read_csv(iris_csv_path)

In [9]:
# Summary statistics of the numeric columns: count, mean, std, min,
# quartiles (25%, 50%, 75%) and max. The non-numeric Species column is left out.
data.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [10]:
# Column names, non-null counts and dtypes, plus the frame's memory usage.
# A non-null count equal to the number of entries means the column has no
# missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [26]:
# Distinct class labels found in the Species column
data.Species.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [27]:
# Number of rows for each species
data.Species.value_counts()

Iris-versicolor    50
Iris-virginica     50
Iris-setosa        50
Name: Species, dtype: int64

In [13]:
# dropna() returns a copy in which every row containing a missing value has
# been removed; `data` itself is unchanged because the result is not assigned
# back. The info() output shows 150 non-null entries in every column, so
# nothing is actually dropped for this dataset.
# Only the first five rows of that copy are displayed.
data.dropna().head()
# Try data.tail() to look at the last rows instead.

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [14]:
# Collect the column labels of the frame into a plain Python list
cols = data.columns.tolist()
cols

['Id',
 'SepalLengthCm',
 'SepalWidthCm',
 'PetalLengthCm',
 'PetalWidthCm',
 'Species']

In [13]:
# Attribute-style column access: returns the Species column as a Series
data.Species

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: Species, Length: 150, dtype: object

In [14]:
# Bracket-style column access; returns the same Series as data.Species
data['Species']

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: Species, Length: 150, dtype: object

In [15]:
# Slice the first n rows of the frame by position (end position excluded)
n = 10
data.iloc[:n]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
5,6,5.4,3.9,1.7,0.4,Iris-setosa
6,7,4.6,3.4,1.4,0.3,Iris-setosa
7,8,5.0,3.4,1.5,0.2,Iris-setosa
8,9,4.4,2.9,1.4,0.2,Iris-setosa
9,10,4.9,3.1,1.5,0.1,Iris-setosa


In [12]:
# Rows at positions 2-4, restricted to two columns, in a single .loc call
# (avoids chaining a row slice with a separate column selection).
data.loc[data.index[2:5], ['SepalWidthCm', 'Species']]

Unnamed: 0,SepalWidthCm,Species
2,3.2,Iris-setosa
3,3.1,Iris-setosa
4,3.6,Iris-setosa


In [16]:
# Return the n-th row (counting from 1) as a Series; .iloc is 0-based, hence n-1.
# n is set in the row-slicing cell above.
data.iloc[n-1]

Id                        10
SepalLengthCm            4.9
SepalWidthCm             3.1
PetalLengthCm            1.5
PetalWidthCm             0.1
Species          Iris-setosa
Name: 9, dtype: object

In [17]:
# Select by position: rows 2-9 and columns 1-2 (the end of each slice is excluded)
data.iloc[2:10, 1:3]

Unnamed: 0,SepalLengthCm,SepalWidthCm
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
5,5.4,3.9
6,4.6,3.4
7,5.0,3.4
8,4.4,2.9
9,4.9,3.1


In [16]:
# Column-wise maximum of every column (use data.min() for the minimum).
# For the string column Species the result is the label that sorts last.
data.max()

Id                          150
SepalLengthCm               7.9
SepalWidthCm                4.4
PetalLengthCm               6.9
PetalWidthCm                2.5
Species          Iris-virginica
dtype: object

In [19]:
# Maximum of a single column (use .min() for the minimum)
data['SepalLengthCm'].max()

7.9

In [20]:
# Row(s) whose sepal length equals the column maximum, picked with a boolean mask
is_longest_sepal = data['SepalLengthCm'] == data['SepalLengthCm'].max()
data[is_longest_sepal]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
131,132,7.9,3.8,6.4,2.0,Iris-virginica


In [21]:
# Petal length of the flower(s) with the largest sepal length.
# A single .loc call takes the row mask and the column label together.
data.loc[data.SepalLengthCm == data.SepalLengthCm.max(), 'PetalLengthCm']

131    6.4
Name: PetalLengthCm, dtype: float64

In [18]:
# Work on a copy so the shared `data` frame is never mutated by this demo and
# the cell gives the same result however often it is re-run.
demo = data.copy()

# Add new columns: assigning a scalar broadcasts it to every row
demo['newCol1'] = 0
demo['newCol2'] = 1
print(demo.head())

# Drop the new columns. drop() returns a NEW frame and leaves its input
# untouched, so the result has to be kept; a bare `demo.drop(...)` whose
# result is discarded has no effect.
demo = demo.drop(columns=['newCol1', 'newCol2'])
demo.head()

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species  \
0   1            5.1           3.5            1.4           0.2  Iris-setosa   
1   2            4.9           3.0            1.4           0.2  Iris-setosa   
2   3            4.7           3.2            1.3           0.2  Iris-setosa   
3   4            4.6           3.1            1.5           0.2  Iris-setosa   
4   5            5.0           3.6            1.4           0.2  Iris-setosa   

   newCol1  newCol2  
0        0        1  
1        0        1  
2        0        1  
3        0        1  
4        0        1  


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [19]:
# Apply a function to every element of a column

sepal_lengths = data['SepalLengthCm']
mean = sepal_lengths.mean()


def do(x):
    """Return how far x lies from the mean sepal length (x - mean)."""
    return x - mean


# apply() calls do() once per value and collects the results in a new Series
mean_sepal = sepal_lengths.apply(do)
print(mean_sepal.head())

0   -0.743333
1   -0.943333
2   -1.143333
3   -1.243333
4   -0.843333
Name: SepalLengthCm, dtype: float64


In [20]:
# Sort rows by petal length, smallest first. sort_values returns a new,
# sorted frame (data keeps its original order); show its first five rows.
sorted_by_petal_length = data.sort_values('PetalLengthCm')
sorted_by_petal_length.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
22,23,4.6,3.6,1.0,0.2,Iris-setosa
13,14,4.3,3.0,1.1,0.1,Iris-setosa
14,15,5.8,4.0,1.2,0.2,Iris-setosa
35,36,5.0,3.2,1.2,0.2,Iris-setosa
36,37,5.5,3.5,1.3,0.2,Iris-setosa


In [21]:
# Number of data points (rows) in each class
data['Species'].value_counts()

Iris-setosa        50
Iris-virginica     50
Iris-versicolor    50
Name: Species, dtype: int64

In [22]:
# Running total of the per-class counts; the last entry equals the total
# number of rows
data['Species'].value_counts().cumsum()

Iris-setosa         50
Iris-virginica     100
Iris-versicolor    150
Name: Species, dtype: int64

In [23]:
# drop_duplicates() returns a new frame without fully duplicated rows;
# `data` itself is not modified because the result is not assigned back.
data.drop_duplicates()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [24]:
# Remove a column: drop() returns a new frame without PetalLengthCm and
# leaves `data` unchanged. (Rows are removed the same way, by passing index
# labels with axis='index'.)
data.drop(columns='PetalLengthCm').head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalWidthCm,Species
0,1,5.1,3.5,0.2,Iris-setosa
1,2,4.9,3.0,0.2,Iris-setosa
2,3,4.7,3.2,0.2,Iris-setosa
3,4,4.6,3.1,0.2,Iris-setosa
4,5,5.0,3.6,0.2,Iris-setosa


Pivot tables

In [25]:
# Pivot table: total sepal length per species.
# The aggregation is named by string ('sum'); passing the NumPy callable
# np.sum triggers a FutureWarning in recent pandas releases.
data.pivot_table(values='SepalLengthCm', index='Species', aggfunc='sum')

Unnamed: 0_level_0,SepalLengthCm
Species,Unnamed: 1_level_1
Iris-setosa,250.3
Iris-versicolor,296.8
Iris-virginica,329.4


In [26]:
# Pivot table: mean sepal length per species.
# The aggregation is named by string ('mean'); passing the NumPy callable
# np.mean triggers a FutureWarning in recent pandas releases.
data.pivot_table(values='SepalLengthCm', index='Species', aggfunc='mean')

Unnamed: 0_level_0,SepalLengthCm
Species,Unnamed: 1_level_1
Iris-setosa,5.006
Iris-versicolor,5.936
Iris-virginica,6.588


# Exercise
1. Use a lambda with `apply` to calculate the deviation of each SepalLengthCm value from the column mean, and store the result in a new column.
2. Do 1. again without using any function call.