# Pandas

## Panel Data

__Features__
* fast and efficient DataFrame
* default and customized indexing
* different file format support
* handling of missing data
* label-based slicing, indexing and subsetting
* large datasets can be handled easily
* groupby, aggregation

In [3]:
import pandas as pd
import numpy as np

In [None]:
# numpy
"""
2 - scalar
[1,2,3] - vector
[[1,2], [3,4]] - matrix
"""
# pandas
"""
1d - Series - homogeneous array, size-immutable
2d - DataFrame - heterogeneously typed columns, size-mutable
"""

__Series - array-like__

__DataFrame - Tabular format__

# syntax of Series

In [None]:
# pandas.Series(data, index, dtype)

In [5]:
# Build a Series from the integers 1..5; with no index given, pandas
# assigns the default positional labels 0..4.
series = pd.Series(range(1, 6))
print(series)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [7]:
# Same values as before, but labelled with a custom string index 'a'..'e'.
series1 = pd.Series(range(1, 6), index=list('abcde'))
print(series1)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [9]:
# Iterating a Series yields its values (not its labels); .items() makes the
# (label, value) pairing explicit and we print only the value.
for _label, value in series1.items():
    print(value)

1
2
3
4
5


# creating series using numpy

In [11]:
# Five samples from the standard normal distribution, labelled 'a'..'e'.
# The values differ on every run because no random seed is set.
series2 = pd.Series(np.random.standard_normal(5), index=list('abcde'))
print(series2)

a    0.369561
b    1.743040
c   -0.466904
d   -0.339494
e    1.466372
dtype: float64


# creating series using dictionary

In [13]:
# A dict maps directly onto a Series: keys become the index labels,
# values become the data.
dictionary = dict(a=1, b=2, c=3)
d1 = pd.Series(data=dictionary)
d1

a    1
b    2
c    3
dtype: int64

# [] - series
# [[]] - dataframe

In [18]:
# Positional selection: .iloc with a list of positions returns a Series
# (positions 1 and 2 -> labels 'b' and 'c').
# NOTE(review): the plain form d1[[1,2]] is ambiguous on a string-labelled
# Series (label vs. position) and is reportedly deprecated in recent pandas —
# confirm against the installed version; .iloc states the intent explicitly.
d1.iloc[[1,2]]

b    2
c    3
dtype: int64

In [None]:
# help(d1.iloc)

In [20]:
# Arithmetic between Series aligns on index labels, not on position.
# Both operands are labelled 'a'..'e', so every label has a matching pair.
series1 + series2

a    1.369561
b    3.743040
c    2.533096
d    3.660506
e    6.466372
dtype: float64

In [22]:
# The two indexes share no labels (0..4 vs 'a'..'e'), so the result covers
# the union of both indexes and every entry is NaN: no label has a value
# on both sides of the addition.
series + series1

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
dtype: float64

# Dataframe

## empty dataframe

In [24]:
# A DataFrame constructed with no arguments has neither columns nor rows.
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


In [26]:
# Column labels without any data: the frame is still empty (no rows),
# but the columns c1..c3 are already defined.
df = pd.DataFrame(columns=[f'c{n}' for n in range(1, 4)])
print(df)

Empty DataFrame
Columns: [c1, c2, c3]
Index: []


In [28]:
# Supplying both columns and an index (rows labelled 1..5) but no data
# yields a 5x4 frame in which every cell is missing (NaN).
df = pd.DataFrame(
    index=range(1, 6),
    columns=[f'c{n}' for n in range(1, 5)],
)
df

Unnamed: 0,c1,c2,c3,c4
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,


In [30]:
# A list containing one dict becomes a one-row DataFrame:
# the dict keys are the column names and the row gets label 0.
dictionary = dict(zip('abc', range(1, 4)))
pd.DataFrame([dictionary])

Unnamed: 0,a,b,c
0,1,2,3


## attributes

In [32]:
# Row labels of `df` (here the frame that was built with index=range(1, 6)).
df.index

RangeIndex(start=1, stop=6, step=1)

In [34]:
# Column labels of `df`, returned as a pandas Index.
df.columns

Index(['c1', 'c2', 'c3', 'c4'], dtype='object')

# create a dataframe using list

In [36]:
# Each inner list is one row. Without explicit column names pandas
# numbers the columns 0, 1, ... just like the rows.
mylist = [['Apple', 'Red'], ['Banana', 'yellow'], ['Orange', 'orange']]

mydata = pd.DataFrame(data=mylist)
mydata

Unnamed: 0,0,1
0,Apple,Red
1,Banana,yellow
2,Orange,orange


In [38]:
# Same rows as before, now with descriptive column names instead of 0 and 1.
mydata = pd.DataFrame(
    data=mylist,
    columns=['Fruit_name', 'color'],
)
mydata

Unnamed: 0,Fruit_name,color
0,Apple,Red
1,Banana,yellow
2,Orange,orange


# dataframe using numpy array

In [40]:
# A 3x2 integer array holding 0..5 row by row: the first column contains the
# even numbers and the second the odd ones. Each array row becomes a frame row.
mylist1 = np.arange(6).reshape(3, 2)

mydf = pd.DataFrame(data=mylist1, columns=['Even', 'Odd'])
mydf

Unnamed: 0,Even,Odd
0,0,1
1,2,3
2,4,5


# load a csv data using pandas

In [42]:
# Read the cereals CSV into a DataFrame. The path is relative to the
# notebook's working directory; note that the folder name contains a space.
df = pd.read_csv('./csv files/cereals.csv')
df

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Almond Delight,110,2,25,34.384843
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


In [None]:
# set_index returns a NEW DataFrame indexed by 'name'. The result is not
# assigned here, so `df` itself keeps its original integer index.
df.set_index('name')

# To change the original DataFrame instead of getting a copy back:
# df.set_index('name', inplace=True)

# examining the dataframe

In [44]:
# First rows of the frame; called without an argument, head() shows 5 rows.
df.head()

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Almond Delight,110,2,25,34.384843


In [46]:
# Last rows of the frame; called without an argument, tail() shows 5 rows.
df.tail()

Unnamed: 0,name,calories,protein,vitamins,rating
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the text column 'name' is left out of the result.
df.describe()

Unnamed: 0,calories,protein,vitamins,rating
count,10.0,10.0,10.0,10.0
mean,95.0,2.9,22.5,49.205817
std,25.495098,0.875595,7.905694,20.315297
min,50.0,2.0,0.0,29.509541
25%,75.0,2.0,25.0,34.08397
50%,100.0,3.0,25.0,43.079408
75%,110.0,3.75,25.0,57.897582
max,130.0,4.0,25.0,93.704912
