# Pandas Tutorial

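Make sure pandas is installed before running this notebook (NumPy, which is also imported below, is installed automatically as a pandas dependency): `pip install pandas`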

In [5]:
import pandas as pd
import numpy as np

## Creating and Manipulating Series objects

In [8]:
# Ten random integers drawn from [1, 100). `randint` already returns an
# ndarray, so no extra `np.array(...)` wrapper is needed.
a = np.random.randint(1, 100, size=10)

# A one-dimensional array of length 10.
print(a.shape)

# Wrap the array in a Series; with no index given, pandas labels the
# values 0..9.
s = pd.Series(a)
print(s)

(10,)
0    38
1    21
2    29
3    29
4    85
5    63
6     6
7    30
8    91
9    66
dtype: int64


In [9]:
# Rebuild the Series from the same values, this time labelled with ten random
# integers drawn from [300, 400). The labels are sampled independently, so the
# same label can appear more than once.
s = pd.Series(
    data=a,
    index=np.random.randint(300, 400, size=10),
)
print(s)

384    38
366    21
317    29
342    29
367    85
399    63
384     6
398    30
369    91
372    66
dtype: int64


In [11]:
# A plain dict: its keys become the Series index, its values the Series data.
d = {
    "key1": "val1",
    "key2": "val2",
    "key3": "val3"
}

t = pd.Series(d)

# Label-based lookup of a single element.
print(t["key1"])

# Positional slice, spelled with `.iloc` so it is unambiguous on a
# string-labelled Series. The stop (21) is past the end of the three-element
# Series, so the slice is clipped and every entry is printed.
print(t.iloc[:21])

val1
key1    val1
key2    val2
key3    val3
dtype: object


## Loading in a dataset

In [6]:
# Load the Iris dataset; the path is relative to this notebook's directory.
df = pd.read_csv("../Migrated-Numpy-Pandas/Iris.csv")

# Preview the first five rows so the cell renders the table shown as its output.
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


## Describing the Dataset

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric
# column; the non-numeric Species column is left out of the result.
df.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [13]:
# Count missing values per column: `isna()` marks each missing cell as True,
# and `sum()` adds those flags up column by column.
df.isna().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [22]:
# Frequency of each distinct sepal length among the first 30 rows. Select the
# column first, then slice it positionally with `.iloc`, instead of chaining
# two `[]` lookups on the frame.
df["SepalLengthCm"].iloc[:30].value_counts()

5.1    5
5.0    4
5.4    4
4.6    3
4.8    3
4.9    2
4.7    2
5.7    2
5.2    2
4.4    1
4.3    1
5.8    1
Name: SepalLengthCm, dtype: int64

## Indexing

In [23]:
# Fetch the row at integer position 19 (the 20th row); a single row comes back
# as a Series whose index is the column names.
df.iloc[19]

Id                        20
SepalLengthCm            5.1
SepalWidthCm             3.8
PetalLengthCm            1.5
PetalWidthCm             0.3
Species          Iris-setosa
Name: 19, dtype: object

In [24]:
# Positional slicing on both axes: rows at positions 2-9 and columns at
# positions 1-2 (the stop position of each slice is excluded).
df.iloc[2:10, 1:3]

Unnamed: 0,SepalLengthCm,SepalWidthCm
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
5,5.4,3.9
6,4.6,3.4
7,5.0,3.4
8,4.4,2.9
9,4.9,3.1


## Cuts of the dataset (Filtering)

In [27]:
# Keep only the rows whose sepal length is less than or equal to the column
# mean. `.le(...)` builds the boolean mask and `.loc` applies it to the rows.
df.loc[df["SepalLengthCm"].le(df["SepalLengthCm"].mean())]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
106,107,4.9,2.5,4.5,1.7,Iris-virginica
113,114,5.7,2.5,5.0,2.0,Iris-virginica
114,115,5.8,2.8,5.1,2.4,Iris-virginica
121,122,5.6,2.8,4.9,2.0,Iris-virginica


In [28]:
# Select every row but only the two sepal measurement columns.
df.loc[:, ["SepalLengthCm", "SepalWidthCm"]]

Unnamed: 0,SepalLengthCm,SepalWidthCm
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4


## Transformations, Lambda Functions, and Sorting

In [29]:
# Add a derived column by applying a lambda to every value of SepalLengthCm.
# `apply` is used here to demonstrate lambdas; for simple arithmetic like this
# the vectorised form `df["SepalLengthCm"] * 2.0` is the usual choice.
df["SepalDoubled"] = df["SepalLengthCm"].apply(lambda x : 2.0 * x)

In [30]:
# Preview the first five rows to confirm the new SepalDoubled column is present.
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,SepalDoubled
0,1,5.1,3.5,1.4,0.2,Iris-setosa,10.2
1,2,4.9,3.0,1.4,0.2,Iris-setosa,9.8
2,3,4.7,3.2,1.3,0.2,Iris-setosa,9.4
3,4,4.6,3.1,1.5,0.2,Iris-setosa,9.2
4,5,5.0,3.6,1.4,0.2,Iris-setosa,10.0


In [32]:
# Sort a copy of the frame by SepalDoubled from largest to smallest (`df`
# itself is not modified), then show the last three rows, i.e. the three
# smallest values.
df.sort_values("SepalDoubled", ascending=False).tail(3)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,SepalDoubled
38,39,4.4,3.0,1.3,0.2,Iris-setosa,8.8
8,9,4.4,2.9,1.4,0.2,Iris-setosa,8.8
13,14,4.3,3.0,1.1,0.1,Iris-setosa,8.6


In [33]:
# Remove the helper column again. Rebinding `df` to the frame returned by
# `drop` avoids `inplace=True`, which mutates an object that earlier cells
# have already displayed.
df = df.drop(columns=["SepalDoubled"])

In [34]:
# Preview the first five rows to confirm the SepalDoubled column is gone.
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


## [Optional] Exercises

1. Find a dataset on Kaggle
2. Spin up a notebook and load the data in
3. Summarise the dataset by performing exploratory data analysis (EDA)
4. Drop all NULL values with the `dropna` method (or fill them in with the `fillna` method)
5. Create a new column
6. Create a pivot table on any column
7. Save the dataset (by using the `to_csv()` method)