# Pandas Tutorial

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Monthly climate table: one row per calendar month, built column by column.
# The explicit integer index keeps the row labels 0 through 11.
df = pd.DataFrame(
    {
        'month': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        'avg_high': [58, 61, 65, 67, 71, 75, 77, 77, 77, 73, 64, 58],
        'avg_low': [42, 45, 48, 50, 53, 56, 58, 59, 57, 54, 48, 42],
        'record_high': [74, 78, 84, 92, 98, 107, 105, 102, 103, 96, 84, 73],
        'record_low': [22, 26, 25, 28, 35, 41, 44, 43, 40, 34, 30, 21],
        'avg_precipitation': [2.95, 3.02, 2.34, 1.02, 0.48, 0.48,
                              0.0, 0.03, 0.17, 0.81, 1.7, 2.56],
    },
    index=list(range(12)),
)

In [14]:
# Rows at positions 2 and 3 (the stop position 4 is excluded)
df.iloc[2:4]

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
2,Mar,65,48,84,25,2.34
3,Apr,67,50,92,28,1.02


### Features of the DataFrame

In [7]:
df.dtypes  # shows the data type of each column

month                 object
avg_high               int64
avg_low                int64
record_high            int64
record_low             int64
avg_precipitation    float64
dtype: object

In [8]:
df.index  # shows the row index; here the integer labels 0 through 11

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')

In [10]:
df.columns  # shows the column labels

Index(['month', 'avg_high', 'avg_low', 'record_high', 'record_low',
       'avg_precipitation'],
      dtype='object')

In [11]:
df.values  # shows the underlying data as a NumPy array, one inner array per row

array([['Jan', 58, 42, 74, 22, 2.95],
       ['Feb', 61, 45, 78, 26, 3.02],
       ['Mar', 65, 48, 84, 25, 2.34],
       ['Apr', 67, 50, 92, 28, 1.02],
       ['May', 71, 53, 98, 35, 0.48],
       ['Jun', 75, 56, 107, 41, 0.48],
       ['Jul', 77, 58, 105, 44, 0.0],
       ['Aug', 77, 59, 102, 43, 0.03],
       ['Sep', 77, 57, 103, 40, 0.17],
       ['Oct', 73, 54, 96, 34, 0.81],
       ['Nov', 64, 48, 84, 30, 1.7],
       ['Dec', 58, 42, 73, 21, 2.56]], dtype=object)

In [13]:
df.describe()  # summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

Unnamed: 0,avg_high,avg_low,record_high,record_low,avg_precipitation
count,12.0,12.0,12.0,12.0,12.0
mean,68.583333,51.0,91.333333,32.416667,1.296667
std,7.366488,6.060303,12.323911,8.240238,1.158097
min,58.0,42.0,73.0,21.0,0.0
25%,63.25,47.25,82.5,25.75,0.4025
50%,69.0,51.5,94.0,32.0,0.915
75%,75.5,56.25,102.25,40.25,2.395
max,77.0,59.0,107.0,44.0,3.02


### Sort the values by a column

In [14]:
# Order the rows from the highest record_high to the lowest
df.sort_values(by='record_high', ascending=False)

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
5,Jun,75,56,107,41,0.48
6,Jul,77,58,105,44,0.0
8,Sep,77,57,103,40,0.17
7,Aug,77,59,102,43,0.03
4,May,71,53,98,35,0.48
9,Oct,73,54,96,34,0.81
3,Apr,67,50,92,28,1.02
2,Mar,65,48,84,25,2.34
10,Nov,64,48,84,30,1.7
1,Feb,61,45,78,26,3.02


## Slicing Records

In [20]:
# Attribute-style access selects a single column and returns it as a Series
df.avg_low

0     42
1     45
2     48
3     50
4     53
5     56
6     58
7     59
8     57
9     54
10    48
11    42
Name: avg_low, dtype: int64

In [16]:
df['avg_low']  # bracket access with the column name returns the same column

0     42
1     45
2     48
3     50
4     53
5     56
6     58
7     59
8     57
9     54
10    48
11    42
Name: avg_low, dtype: int64

In [5]:
df[2:4]  # slice several rows: positions 2 and 3 (the stop position 4 is excluded)

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
2,Mar,65,48,84,25,2.34
3,Apr,67,50,92,28,1.02


In [8]:
# Indexing: use .loc for label-based indexing and .iloc for position-based indexing.
# No column selector is given here, so all columns are returned.

# Positions are 0-based and the stop of an .iloc slice is excluded:
# iloc[5:10] returns the rows at positions 5 through 9.
df.iloc[5:10]

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
5,Jun,75,56,107,41,0.48
6,Jul,77,58,105,44,0.0
7,Aug,77,59,102,43,0.03
8,Sep,77,57,103,40,0.17
9,Oct,73,54,96,34,0.81


In [9]:
# Select a row range and a column range by label.
# .loc slices include both endpoints, so row 10 and 'record_low' are part of the result.
df.loc[5:10, 'avg_high' : 'record_low']

Unnamed: 0,avg_high,avg_low,record_high,record_low
5,75,56,107,41
6,77,58,105,44
7,77,59,102,43
8,77,57,103,40
9,73,54,96,34
10,64,48,84,30


In [10]:
# Select several columns by passing a list of column names; the result keeps the order given
df[['avg_low','avg_high']]

Unnamed: 0,avg_low,avg_high
0,42,58
1,45,61
2,48,65
3,50,67
4,53,71
5,56,75
6,58,77
7,59,77
8,57,77
9,54,73


In [12]:
# A plain [start:stop] slice selects rows only; here the rows at positions 2 and 3
df[2:4]

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
2,Mar,65,48,84,25,2.34
3,Apr,67,50,92,28,1.02


In [23]:
# Keep only the rows whose avg_precipitation is below 1 (boolean-mask filter)
best_precip = df[df['avg_precipitation'] < 1]
# Report how many rows matched, using a formatted string literal
print(f'Number of total best precipitation recorded: {len(best_precip)}')


Number of total best precipitation recorded: 6


#### Comparing two columns

In [6]:
# Row-by-row check that avg_low is at least record_low; the boolean result
# is stored on df as a new 'Criteria 1' column (this modifies df in place).
df['Criteria 1'] = df['avg_low'].ge(df['record_low'])
print(df['Criteria 1'])

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
Name: Criteria 1, dtype: bool
