In [4]:
import pandas as pd
import numpy as np

2 data structures
* dataframe
* series
* A Series is similar to a one-dimensional array whereas a DataFrame is a tabular representation akin to a spreadsheet table.

# Series

In [5]:
s = pd.Series([3.1,2.3,5.4]) # gives indexing
s

0    3.1
1    2.3
2    5.4
dtype: float64

In [6]:
# custom indexing
s2 = pd.Series([3,4,5,6,5,3],
               index=[1,2,3,4,5,6])
s2

1    3
2    4
3    5
4    6
5    5
6    3
dtype: int64

In [7]:
# 3 properties of series
print(s.values)
print(s.index)
print(s.dtype)

[3.1 2.3 5.4]
RangeIndex(start=0, stop=3, step=1)
float64


In [8]:
# Series from dictionary objects
d1 = {'animal':'cat','color':'red','height':'small'}
print(d1)
s3= pd.Series(d1)
print(s3)
print(s3.index)
print(s3.values)

# Series from tuples
# Series from ndArray

{'animal': 'cat', 'color': 'red', 'height': 'small'}
animal      cat
color       red
height    small
dtype: object
Index(['animal', 'color', 'height'], dtype='object')
['cat' 'red' 'small']


In [9]:
# Accessing elements
print(s3[2])

small


In [10]:
# find number of animale
s3 = pd.Series([1.2,2.5,-2.2,3.1,-0.8,-3.2], 
            index = ['Jan 1','Jan 2','Jan 3','Jan 4','Jan 5','Jan 6',])
print(s3.size) # no of elemnts
print(s3.shape) # dimensions
print(s3.count()) # no of non null values

6
(6,)
6


In [11]:
colors = pd.Series(['red', 'blue', 'blue', 'yellow', 'red', 'green', 'blue', np.nan])
print('colors =\n', colors, '\n')
print('colors.value_counts() =\n', colors.value_counts())

colors =
 0       red
1      blue
2      blue
3    yellow
4       red
5     green
6      blue
7       NaN
dtype: object 

colors.value_counts() =
 blue      3
red       2
yellow    1
green     1
dtype: int64


# Data frame

In [13]:
cars = {'make': ['Ford', 'Honda', 'Toyota', 'Tesla'],
       'model': ['Taurus', 'Accord', 'Camry', 'Model S'],
       'MSRP': [27595, 23570, 23495, 68000]}          
carData = pd.DataFrame(cars)            # creating DataFrame from dictionary
carData                              # display the table

Unnamed: 0,make,model,MSRP
0,Ford,Taurus,27595
1,Honda,Accord,23570
2,Toyota,Camry,23495
3,Tesla,Model S,68000


In [15]:
print('carData.index =', carData.index)         # print the row indices
print('carData.columns =', carData.columns)  

carData.index = RangeIndex(start=0, stop=4, step=1)
carData.columns = Index(['make', 'model', 'MSRP'], dtype='object')


In [17]:
carData['Year'] = 1800
carData

Unnamed: 0,make,model,MSRP,Year
0,Ford,Taurus,27595,1800
1,Honda,Accord,23570,1800
2,Toyota,Camry,23495,1800
3,Tesla,Model S,68000,1800


In [19]:
carData['type'] = [1,2,5,3]
carData


Unnamed: 0,make,model,MSRP,Year,type
0,Ford,Taurus,27595,1800,1
1,Honda,Accord,23570,1800,2
2,Toyota,Camry,23495,1800,5
3,Tesla,Model S,68000,1800,3


In [21]:
# from tuples
values = [(1,2,3),(2,3,5),(3,2,4),(2,3,4)]
colNames = ['v1','v2','v3']
d5 = pd.DataFrame(values,columns = colNames)
d5

Unnamed: 0,v1,v2,v3
0,1,2,3
1,2,3,5
2,3,2,4
3,2,3,4


In [25]:
#from nd array
arr1 = np.random.rand(3,5)
colNames2 = ['a','b','c','d','e']
d6 = pd.DataFrame(arr1, columns = colNames2) 
d6                 

Unnamed: 0,a,b,c,d,e
0,0.507706,0.31941,0.919487,0.833578,0.973463
1,0.85044,0.59491,0.21357,0.458786,0.014377
2,0.401212,0.681104,0.241822,0.640196,0.630444


In [29]:
# filtering and selection

print(d6[d6.e >0.2])

          a         b         c         d         e
0  0.507706  0.319410  0.919487  0.833578  0.973463
2  0.401212  0.681104  0.241822  0.640196  0.630444


In [32]:
# Aritmetic operations
# Transpose
print(d6.T)
# addition,multiplication
print(d6+2)

print((d6+2)*10)


          0         1         2
a  0.507706  0.850440  0.401212
b  0.319410  0.594910  0.681104
c  0.919487  0.213570  0.241822
d  0.833578  0.458786  0.640196
e  0.973463  0.014377  0.630444
          a         b         c         d         e
0  2.507706  2.319410  2.919487  2.833578  2.973463
1  2.850440  2.594910  2.213570  2.458786  2.014377
2  2.401212  2.681104  2.241822  2.640196  2.630444
           a          b          c          d          e
0  25.077057  23.194103  29.194867  28.335779  29.734632
1  28.504402  25.949098  22.135698  24.587865  20.143767
2  24.012119  26.811045  22.418218  26.401958  26.304436


In [37]:
# Maximum minimum values per column
print(d6.max())
print(d6.min())
print(d6.min(axis=1)) # min value of each row
print(d6.mean(axis =1)) # mean value of each row

a    0.850440
b    0.681104
c    0.919487
d    0.833578
e    0.973463
dtype: float64
a    0.401212
b    0.319410
c    0.213570
d    0.458786
e    0.014377
dtype: float64
0    0.319410
1    0.014377
2    0.241822
dtype: float64
0    0.710729
1    0.426417
2    0.518955
dtype: float64


In [38]:
# calculate max- min per column
f = lambda x : x.max() - x.min()
print(d6.apply(f))

a    0.449228
b    0.361694
c    0.705917
d    0.374791
e    0.959086
dtype: float64


In [41]:
# calculate max- min per row
f = lambda x: x.max() -x.min()
print(d6.apply(f,axis=1))

0    0.654053
1    0.836063
2    0.439283
dtype: float64
