In [2]:
import pandas as pd

In [3]:
obj = pd.Series([1,3,6,9])
print(obj)

0    1
1    3
2    6
3    9
dtype: int64


### Index and values in Pandas

In [4]:
print(obj.values)
print(obj.index)

[1 3 6 9]
RangeIndex(start=0, stop=4, step=1)


In [5]:
#we can provide index of our choice
obj = pd.Series([100,300,400,700], index = ['a','b','c','d'])
print(obj)

a    100
b    300
c    400
d    700
dtype: int64


In [6]:
# we can also give name the to Series
obj = pd.Series([100,300,400,700], index = ['a','b','c','d'], name = '4 months sale in K')
print(obj)

a    100
b    300
c    400
d    700
Name: 4 months sale in K, dtype: int64


### canteen data of one week as example

In [7]:
sandwich = pd.Series([100,30,50,34,45,92,0],
                     index = ['mon','tue','wed','thu','fri','sat','sun'])
print(sandwich)

mon    100
tue     30
wed     50
thu     34
fri     45
sat     92
sun      0
dtype: int64


In [8]:
# we can access data either by locatoin or index
print(sandwich[1])
print(sandwich['tue'])

30
30


In [9]:
# to get data from multiple locations
print(sandwich[[3,5]])
print(sandwich[['thu','sat']])

thu    34
sat    92
dtype: int64
thu    34
sat    92
dtype: int64


In [10]:
#can select some conditional data
print(sandwich[sandwich>30])

mon    100
wed     50
thu     34
fri     45
sat     92
dtype: int64


In [11]:
# we can use data from numpy to panda series

import numpy as np
ar = np.array([1,2,4,5,6,8])
ind = np.array(['a','b','c','d','e','f'])
obj2 = pd.Series(ar, index = ind)
print(obj2)

a    1
b    2
c    4
d    5
e    6
f    8
dtype: int32


In [12]:
state = ['ohio','ohio','ohio','LA','LA','LA','NYC']
data = {'state':state,
        'year':[2000,2001,2002,2003,2000,2003,2004],
        'pop': [1.2,1.3,2.3,2.5,1.2,1.6,5.6]
       }
state_pop_df = pd.DataFrame(data,index = ['1st','2nd','3rd','4th','5th','6th','7th'])
print(state_pop_df)

    state  year  pop
1st  ohio  2000  1.2
2nd  ohio  2001  1.3
3rd  ohio  2002  2.3
4th    LA  2003  2.5
5th    LA  2000  1.2
6th    LA  2003  1.6
7th   NYC  2004  5.6


In [13]:
# we can change the arrangement of the dataframe
state_pop_df = pd.DataFrame(data,columns = ['year','pop','state'])
print(state_pop_df)

   year  pop state
0  2000  1.2  ohio
1  2001  1.3  ohio
2  2002  2.3  ohio
3  2003  2.5    LA
4  2000  1.2    LA
5  2003  1.6    LA
6  2004  5.6   NYC


In [15]:
# Adding additional column
frame2 = pd.DataFrame(data,columns = ['year','pop','state','debt'],
                     index = ['one','two','three','four','five','six','seven'])
frame2.head()

Unnamed: 0,year,pop,state,debt
one,2000,1.2,ohio,
two,2001,1.3,ohio,
three,2002,2.3,ohio,
four,2003,2.5,LA,
five,2000,1.2,LA,


In [None]:
# A column in a DataFrame can be reterieved as a series either by dict-like notation or as attribute

In [16]:
print(data)

{'state': ['ohio', 'ohio', 'ohio', 'LA', 'LA', 'LA', 'NYC'], 'year': [2000, 2001, 2002, 2003, 2000, 2003, 2004], 'pop': [1.2, 1.3, 2.3, 2.5, 1.2, 1.6, 5.6]}


In [17]:
data['state']

['ohio', 'ohio', 'ohio', 'LA', 'LA', 'LA', 'NYC']

In [18]:
#lets try this with dataFRAME
frame2['year']  # this is dictionary like notation to extract dataframe data columns

one      2000
two      2001
three    2002
four     2003
five     2000
six      2003
seven    2004
Name: year, dtype: int64

In [19]:
# lets try attribute style 
frame2.year

one      2000
two      2001
three    2002
four     2003
five     2000
six      2003
seven    2004
Name: year, dtype: int64

In [20]:
frame2 = pd.DataFrame(data,columns = ['year','pop','state','debt'],
                     index = ['one','two','three','four','five','six','seven'])
print(len(frame2)) # finding number of rows in dataframe

7


In [23]:
ln = len(frame2)
rng = np.arange(ln)
print(rng)

[0 1 2 3 4 5 6]


In [24]:
frame2['debt'] = rng
frame2

Unnamed: 0,year,pop,state,debt
one,2000,1.2,ohio,0
two,2001,1.3,ohio,1
three,2002,2.3,ohio,2
four,2003,2.5,LA,3
five,2000,1.2,LA,4
six,2003,1.6,LA,5
seven,2004,5.6,NYC,6


# Function Application and Mapping

In [26]:
frame = frame = pd.DataFrame(np.random.randn(4,3),
                            columns = list('bde'),
                            index = ['Utah','Ohio','Texas','Oregon'])
print(frame)
print(np.abs(frame))

               b         d         e
Utah   -0.704553  1.948404 -0.444879
Ohio    0.053271  0.507766 -0.133718
Texas   0.381706  0.807391 -2.024940
Oregon  0.398183  0.731834  0.421827
               b         d         e
Utah    0.704553  1.948404  0.444879
Ohio    0.053271  0.507766  0.133718
Texas   0.381706  0.807391  2.024940
Oregon  0.398183  0.731834  0.421827


In [27]:
print(frame['d'].min())
print(frame['d'].max())

0.5077662397270193
1.9484043856230175


In [29]:
print(frame['d'].max()-frame['d'].min())

1.4406381458959983


In [30]:
f = lambda x:x.max()-x.min()
df = frame.apply(f) #applied on a column
print(df,type(df))

b    1.102736
d    1.440638
e    2.446767
dtype: float64 <class 'pandas.core.series.Series'>


In [32]:
# now apply on a row
df = frame.apply(f, axis = 1)
print(df)

Utah      2.652957
Ohio      0.641485
Texas     2.832331
Oregon    0.333650
dtype: float64


# Sorting and Ranking

In [34]:
frame = pd.DataFrame(np.arange(8).reshape((2,4)),
                    index = ['three','one'],
                    columns = ['d','a','b','c'])
print(frame)

       d  a  b  c
three  0  1  2  3
one    4  5  6  7


In [36]:
print(frame.sort_index(axis = 1, ascending = False))

       d  c  b  a
three  0  3  2  1
one    4  7  6  5


In [38]:
# to sort in terms of rows
print(frame.sort_index())

       d  a  b  c
one    4  5  6  7
three  0  1  2  3


In [39]:
# to sort columnwise in ascending order
print(frame.sort_index(axis = 1, ascending = True))

       a  b  c  d
three  1  2  3  0
one    5  6  7  4


In [40]:
# using loc method in python we use indexes to get specific data but usin iloc we can use numbers to get
#data from specific location of dataframe


In [41]:
# can use fill_value = 0 to overcome the missing value issue
# or can use any other value than 0 in fill_value

# Reindex method

In [43]:
obj3 = pd.Series(['blue','purple','yellow'], index = [0,3,6])
print(obj3)



0      blue
3    purple
6    yellow
dtype: object


In [44]:
# to create more index
obj3 = obj3.reindex(range(9))
obj3

0      blue
1       NaN
2       NaN
3    purple
4       NaN
5       NaN
6    yellow
7       NaN
8       NaN
dtype: object

In [49]:
# to overcom eth eproblem of NaN we can use 'ffill method' , it will use previous values to fill the upcoming NaN values
obj3 = pd.Series(['blue','purple','yellow'], index = [0,3,6])
print(obj3)


obj3 = obj3.reindex(range(9), method = "ffill")
obj3

0      blue
3    purple
6    yellow
dtype: object


0      blue
1      blue
2      blue
3    purple
4    purple
5    purple
6    yellow
7    yellow
8    yellow
dtype: object

In [58]:
#Column name changing using reindex
states = pd.DataFrame(np.arange(9).reshape((3,3)),
                      index = ['a','c','d'], columns = ['Ohio','Texas','California'])
print(states)

states_names = ['UTAH','OHIO','CALIFORNIA','TEXAS']

states = states.reindex(columns = states_names, fill_value = np.random.randint(0,10))
print(states)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
   UTAH  OHIO  CALIFORNIA  TEXAS
a     7     7           7      7
c     7     7           7      7
d     7     7           7      7


# Selection with loc and iloc

In [59]:
data = {'Name': ['John', 'Alice', 'Bob', 'Emily'],
        'Age': [25, 28, 30, 22],
        'City': ['New York', 'London', 'Paris', 'Tokyo']}
df = pd.DataFrame(data)

# Accessing rows and columns using iloc
print(df.iloc[1])         # Access row at position 1
print(df.iloc[2:4])       # Access rows at positions 2 and 3
print(df.iloc[:, 0])     # Access the first column for all rows
print(df.iloc[0:2, 1])    # Access rows 0 and 1, and the second column

Name     Alice
Age         28
City    London
Name: 1, dtype: object
    Name  Age   City
2    Bob   30  Paris
3  Emily   22  Tokyo
0     John
1    Alice
2      Bob
3    Emily
Name: Name, dtype: object
0    25
1    28
Name: Age, dtype: int64
