### A Revision to pandas DataFrames

In [1]:
import pandas as pd
import numpy as np

In [2]:
array_a = np.array([[3, 2, 1], [6, 3, 2]])
array_a

array([[3, 2, 1],
       [6, 3, 2]])

In [3]:
pd.DataFrame(array_a)

Unnamed: 0,0,1,2
0,3,2,1
1,6,3,2


In [4]:
type(pd.DataFrame(array_a))

pandas.core.frame.DataFrame

In [5]:
df = pd.DataFrame(array_a, columns = ['Column 1', 'Column 2', 'Column 3'])
df

Unnamed: 0,Column 1,Column 2,Column 3
0,3,2,1
1,6,3,2


In [6]:
df = pd.DataFrame(array_a, columns = ['Column 1', 'Column 2', 'Column 3'], index = ['Row 1', 'Row 2'])
df

Unnamed: 0,Column 1,Column 2,Column 3
Row 1,3,2,1
Row 2,6,3,2


In [7]:
# Can we change to "StringID"?
data = pd.read_csv('Lending-company.csv', index_col = 'LoanID')
lending_co_data = data.copy()
lending_co_data.head()

Unnamed: 0_level_0,StringID,Product,CustomerGender,Location,Region,TotalPrice
LoanID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,id_1,Product B,Male,Location 2,Region 2,16600.0
2,id_2,Product B,Male,Location 3,,16600.0
3,id_3,Product C,Female,Location 5,Region 5,15600.0
4,id_4,Product B,Male,Location 6,Region 1,16600.0
5,id_5,Product D,Female,Location 7,Region 2,20250.0


In [8]:
type(lending_co_data)

pandas.core.frame.DataFrame

### Common Attributes for Working with DataFrames

In [9]:
# Can we change to "StringID"?
data = pd.read_csv('Lending-company.csv', index_col = 'LoanID')
lending_co_data = data.copy()
lending_co_data.head()

Unnamed: 0_level_0,StringID,Product,CustomerGender,Location,Region,TotalPrice
LoanID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,id_1,Product B,Male,Location 2,Region 2,16600.0
2,id_2,Product B,Male,Location 3,,16600.0
3,id_3,Product C,Female,Location 5,Region 5,15600.0
4,id_4,Product B,Male,Location 6,Region 1,16600.0
5,id_5,Product D,Female,Location 7,Region 2,20250.0


In [10]:
lending_co_data.index

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043],
           dtype='int64', name='LoanID', length=1043)

In [11]:
type(lending_co_data.index)

pandas.core.indexes.numeric.Int64Index

In [12]:
lending_co_data.columns

Index(['StringID', 'Product', 'CustomerGender', 'Location', 'Region',
       'TotalPrice'],
      dtype='object')

In [13]:
type(lending_co_data.columns)

pandas.core.indexes.base.Index

In [14]:
lending_co_data.axes

[Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
             ...
             1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043],
            dtype='int64', name='LoanID', length=1043),
 Index(['StringID', 'Product', 'CustomerGender', 'Location', 'Region',
        'TotalPrice'],
       dtype='object')]

In [15]:
lending_co_data.dtypes

StringID           object
Product            object
CustomerGender     object
Location           object
Region             object
TotalPrice        float64
dtype: object

In [16]:
lending_co_data.values

array([['id_1', 'Product B', 'Male', 'Location 2', 'Region 2', 16600.0],
       ['id_2', 'Product B', 'Male', 'Location 3', nan, 16600.0],
       ['id_3', 'Product C', 'Female', 'Location 5', 'Region 5', 15600.0],
       ...,
       ['id_1041', 'Product B', 'Male', 'Location 23', 'Region 4',
        16600.0],
       ['id_1042', 'Product C', 'NotSpecified', 'Location 52',
        'Region 6', 15600.0],
       ['id_1043', 'Product B', 'Female', 'Location 142', 'Region 6',
        16600.0]], dtype=object)

In [17]:
type(lending_co_data.values)

numpy.ndarray

In [18]:
lending_co_data.to_numpy()

array([['id_1', 'Product B', 'Male', 'Location 2', 'Region 2', 16600.0],
       ['id_2', 'Product B', 'Male', 'Location 3', nan, 16600.0],
       ['id_3', 'Product C', 'Female', 'Location 5', 'Region 5', 15600.0],
       ...,
       ['id_1041', 'Product B', 'Male', 'Location 23', 'Region 4',
        16600.0],
       ['id_1042', 'Product C', 'NotSpecified', 'Location 52',
        'Region 6', 15600.0],
       ['id_1043', 'Product B', 'Female', 'Location 142', 'Region 6',
        16600.0]], dtype=object)

In [19]:
type(lending_co_data.to_numpy())

numpy.ndarray

In [20]:
lending_co_data.shape

(1043, 6)

In [21]:
len(lending_co_data.columns)

6

In [22]:
location_data = pd.read_csv('Lending-company.csv', usecols = ['Location'], squeeze = True)
location_data

0         Location 2
1         Location 3
2         Location 5
3         Location 6
4         Location 7
            ...     
1038     Location 39
1039     Location 50
1040     Location 23
1041     Location 52
1042    Location 142
Name: Location, Length: 1043, dtype: object

In [23]:
type(location_data)

pandas.core.series.Series

In [24]:
location_data.shape

(1043,)

### Data Selection in pandas DataFrames

In [25]:
import pandas as pd

In [26]:
data = pd.read_csv('Lending-company.csv', index_col = 'StringID')
lending_co_data = data.copy()
lending_co_data.head()

Unnamed: 0_level_0,LoanID,Product,CustomerGender,Location,Region,TotalPrice
StringID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id_1,1,Product B,Male,Location 2,Region 2,16600.0
id_2,2,Product B,Male,Location 3,,16600.0
id_3,3,Product C,Female,Location 5,Region 5,15600.0
id_4,4,Product B,Male,Location 6,Region 1,16600.0
id_5,5,Product D,Female,Location 7,Region 2,20250.0


In [27]:
lending_co_data.Product

StringID
id_1       Product B
id_2       Product B
id_3       Product C
id_4       Product B
id_5       Product D
             ...    
id_1039    Product B
id_1040    Product B
id_1041    Product B
id_1042    Product C
id_1043    Product B
Name: Product, Length: 1043, dtype: object

In [28]:
lending_co_data.Location

StringID
id_1         Location 2
id_2         Location 3
id_3         Location 5
id_4         Location 6
id_5         Location 7
               ...     
id_1039     Location 39
id_1040     Location 50
id_1041     Location 23
id_1042     Location 52
id_1043    Location 142
Name: Location, Length: 1043, dtype: object

In [29]:
lending_co_data['Product']

StringID
id_1       Product B
id_2       Product B
id_3       Product C
id_4       Product B
id_5       Product D
             ...    
id_1039    Product B
id_1040    Product B
id_1041    Product B
id_1042    Product C
id_1043    Product B
Name: Product, Length: 1043, dtype: object

In [30]:
lending_co_data['Location']

StringID
id_1         Location 2
id_2         Location 3
id_3         Location 5
id_4         Location 6
id_5         Location 7
               ...     
id_1039     Location 39
id_1040     Location 50
id_1041     Location 23
id_1042     Location 52
id_1043    Location 142
Name: Location, Length: 1043, dtype: object

In [31]:
lending_co_data['location']

KeyError: 'location'

In [32]:
type(lending_co_data['Location'])

pandas.core.series.Series

In [33]:
lending_co_data[['Location']]

Unnamed: 0_level_0,Location
StringID,Unnamed: 1_level_1
id_1,Location 2
id_2,Location 3
id_3,Location 5
id_4,Location 6
id_5,Location 7
...,...
id_1039,Location 39
id_1040,Location 50
id_1041,Location 23
id_1042,Location 52


In [34]:
type(lending_co_data[['Location']])

pandas.core.frame.DataFrame

In [35]:
lending_co_data[['Location', 'Product']].head()

Unnamed: 0_level_0,Location,Product
StringID,Unnamed: 1_level_1,Unnamed: 2_level_1
id_1,Location 2,Product B
id_2,Location 3,Product B
id_3,Location 5,Product C
id_4,Location 6,Product B
id_5,Location 7,Product D


In [36]:
prod_loc = ['Location', 'Product']
lending_co_data[prod_loc].head()

Unnamed: 0_level_0,Location,Product
StringID,Unnamed: 1_level_1,Unnamed: 2_level_1
id_1,Location 2,Product B
id_2,Location 3,Product B
id_3,Location 5,Product C
id_4,Location 6,Product B
id_5,Location 7,Product D


In [37]:
lending_co_data['Product', 'Location']

KeyError: ('Product', 'Location')

### Data Selection - Indexing Data with .iloc[]

In [38]:
import pandas as pd

In [39]:
data = pd.read_csv('Lending-company.csv', index_col = 'StringID')
lending_co_data = data.copy()
lending_co_data.head()

Unnamed: 0_level_0,LoanID,Product,CustomerGender,Location,Region,TotalPrice
StringID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id_1,1,Product B,Male,Location 2,Region 2,16600.0
id_2,2,Product B,Male,Location 3,,16600.0
id_3,3,Product C,Female,Location 5,Region 5,15600.0
id_4,4,Product B,Male,Location 6,Region 1,16600.0
id_5,5,Product D,Female,Location 7,Region 2,20250.0


In [40]:
lending_co_data[1]

KeyError: 1

In [41]:
lending_co_data[0,1]

KeyError: (0, 1)

In [None]:
lending_co_data['Product']

In [42]:
lending_co_data.iloc[1]

LoanID                     2
Product            Product B
CustomerGender          Male
Location          Location 3
Region                   NaN
TotalPrice             16600
Name: id_2, dtype: object

In [43]:
lending_co_data.iloc[1, 3]

'Location 3'

In [44]:
lending_co_data.iloc[1,:]

LoanID                     2
Product            Product B
CustomerGender          Male
Location          Location 3
Region                   NaN
TotalPrice             16600
Name: id_2, dtype: object

In [45]:
lending_co_data.iloc[:, 3]

StringID
id_1         Location 2
id_2         Location 3
id_3         Location 5
id_4         Location 6
id_5         Location 7
               ...     
id_1039     Location 39
id_1040     Location 50
id_1041     Location 23
id_1042     Location 52
id_1043    Location 142
Name: Location, Length: 1043, dtype: object

In [46]:
type(lending_co_data.iloc[1, 3])

str

In [47]:
type(lending_co_data.iloc[1, :])

pandas.core.series.Series

In [48]:
type(lending_co_data.iloc[:, 3])

pandas.core.series.Series

In [49]:
lending_co_data.iloc[[1, 3], :]

Unnamed: 0_level_0,LoanID,Product,CustomerGender,Location,Region,TotalPrice
StringID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id_2,2,Product B,Male,Location 3,,16600.0
id_4,4,Product B,Male,Location 6,Region 1,16600.0


In [50]:
lending_co_data.iloc[:, [3, 1]]

Unnamed: 0_level_0,Location,Product
StringID,Unnamed: 1_level_1,Unnamed: 2_level_1
id_1,Location 2,Product B
id_2,Location 3,Product B
id_3,Location 5,Product C
id_4,Location 6,Product B
id_5,Location 7,Product D
...,...,...
id_1039,Location 39,Product B
id_1040,Location 50,Product B
id_1041,Location 23,Product B
id_1042,Location 52,Product C


### Data Selection - Indexing Data with .loc[]

In [51]:
import pandas as pd

In [52]:
data = pd.read_csv('Lending-company.csv', index_col = 'StringID')
lending_co_data = data.copy()
lending_co_data

Unnamed: 0_level_0,LoanID,Product,CustomerGender,Location,Region,TotalPrice
StringID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id_1,1,Product B,Male,Location 2,Region 2,16600.0
id_2,2,Product B,Male,Location 3,,16600.0
id_3,3,Product C,Female,Location 5,Region 5,15600.0
id_4,4,Product B,Male,Location 6,Region 1,16600.0
id_5,5,Product D,Female,Location 7,Region 2,20250.0
...,...,...,...,...,...,...
id_1039,1039,Product B,Female,Location 39,Region 6,16600.0
id_1040,1040,Product B,Male,Location 50,Region 1,
id_1041,1041,Product B,Male,Location 23,Region 4,16600.0
id_1042,1042,Product C,NotSpecified,Location 52,Region 6,15600.0


In [53]:
lending_co_data.loc['LoanID_3']

KeyError: 'LoanID_3'

In [54]:
lending_co_data.loc['LoanID_3', :]

KeyError: 'LoanID_3'

In [None]:
lending_co_data.loc['LoanID_3', 'Region']

In [55]:
lending_co_data['Location']

StringID
id_1         Location 2
id_2         Location 3
id_3         Location 5
id_4         Location 6
id_5         Location 7
               ...     
id_1039     Location 39
id_1040     Location 50
id_1041     Location 23
id_1042     Location 52
id_1043    Location 142
Name: Location, Length: 1043, dtype: object

In [56]:
lending_co_data.loc['Location']

KeyError: 'Location'

In [57]:
lending_co_data.loc[:, 'Location']

StringID
id_1         Location 2
id_2         Location 3
id_3         Location 5
id_4         Location 6
id_5         Location 7
               ...     
id_1039     Location 39
id_1040     Location 50
id_1041     Location 23
id_1042     Location 52
id_1043    Location 142
Name: Location, Length: 1043, dtype: object

In [58]:
lending_co_data.loc[:, 'Locations']

KeyError: 'Locations'

### A Few Comments on Using .loc[] and .iloc[]

In [59]:
import pandas as pd

In [60]:
data = pd.read_csv('Lending-company.csv', index_col = 'LoanID')
lending_co_data = data.copy()
lending_co_data

Unnamed: 0_level_0,StringID,Product,CustomerGender,Location,Region,TotalPrice
LoanID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,id_1,Product B,Male,Location 2,Region 2,16600.0
2,id_2,Product B,Male,Location 3,,16600.0
3,id_3,Product C,Female,Location 5,Region 5,15600.0
4,id_4,Product B,Male,Location 6,Region 1,16600.0
5,id_5,Product D,Female,Location 7,Region 2,20250.0
...,...,...,...,...,...,...
1039,id_1039,Product B,Female,Location 39,Region 6,16600.0
1040,id_1040,Product B,Male,Location 50,Region 1,
1041,id_1041,Product B,Male,Location 23,Region 4,16600.0
1042,id_1042,Product C,NotSpecified,Location 52,Region 6,15600.0


In [61]:
lending_co_data.shape

(1043, 6)

In [62]:
lending_co_data.iloc[1043, :]

IndexError: single positional indexer is out-of-bounds

In [63]:
lending_co_data.iloc[10000, :]

IndexError: single positional indexer is out-of-bounds

In [None]:
lending_co_data.iloc[:, 14]

In [None]:
lending_co_data.iloc[:, 13]

In [64]:
lending_co_data.iloc[:, -1]

LoanID
1       16600.0
2       16600.0
3       15600.0
4       16600.0
5       20250.0
         ...   
1039    16600.0
1040        NaN
1041    16600.0
1042    15600.0
1043    16600.0
Name: TotalPrice, Length: 1043, dtype: float64

In [65]:
lending_co_data.head()

Unnamed: 0_level_0,StringID,Product,CustomerGender,Location,Region,TotalPrice
LoanID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,id_1,Product B,Male,Location 2,Region 2,16600.0
2,id_2,Product B,Male,Location 3,,16600.0
3,id_3,Product C,Female,Location 5,Region 5,15600.0
4,id_4,Product B,Male,Location 6,Region 1,16600.0
5,id_5,Product D,Female,Location 7,Region 2,20250.0


In [66]:
# incorrect (single indexer required)
lending_co_data['TotalPrice'].iloc[0, :]

IndexingError: Too many indexers

In [None]:
lending_co_data['TotalPrice'].iloc[0]

In [67]:
# avoid for index columns composed of labels/integers
lending_co_data['TotalPrice'][0]

KeyError: 0

In [68]:
# AVOID
lending_co_data['TotalPrice'][1]

16600.0

In [69]:
lending_co_data['TotalPrice'].loc[1]

16600.0

In [70]:
data = pd.read_csv('Lending-company.csv', index_col = 'StringID')
lending_co_data = data.copy()
lending_co_data.head()

Unnamed: 0_level_0,LoanID,Product,CustomerGender,Location,Region,TotalPrice
StringID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id_1,1,Product B,Male,Location 2,Region 2,16600.0
id_2,2,Product B,Male,Location 3,,16600.0
id_3,3,Product C,Female,Location 5,Region 5,15600.0
id_4,4,Product B,Male,Location 6,Region 1,16600.0
id_5,5,Product D,Female,Location 7,Region 2,20250.0


In [71]:
lending_co_data['TotalPrice'].iloc[0]

16600.0

In [72]:
lending_co_data['TotalPrice'].loc['LoanID_1']

KeyError: 'LoanID_1'

In [73]:
# AVOID
lending_co_data['TotalPrice'][0]

16600.0

In [74]:
# AVOID
lending_co_data['TotalPrice'][1]

16600.0

In [75]:
# AVOID
lending_co_data['TotalPrice']['LoanID_1']

KeyError: 'LoanID_1'

In [76]:
# AVOID
lending_co_data.iloc[0][5]

16600.0

In [77]:
lending_co_data.iloc[0, 5]

16600.0

In [78]:
# AVOID
lending_co_data.iloc[[0, 5]]

Unnamed: 0_level_0,LoanID,Product,CustomerGender,Location,Region,TotalPrice
StringID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id_1,1,Product B,Male,Location 2,Region 2,16600.0
id_6,6,Product B,Female,Location 10,Region 2,


In [79]:
lending_co_data.iloc[[0, 5], :]

Unnamed: 0_level_0,LoanID,Product,CustomerGender,Location,Region,TotalPrice
StringID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id_1,1,Product B,Male,Location 2,Region 2,16600.0
id_6,6,Product B,Female,Location 10,Region 2,


In [80]:
# AVOID
lending_co_data['TotalPrice']['LoanID_1']

KeyError: 'LoanID_1'

In [None]:
lending_co_data.loc['LoanID_1', 'TotalPrice']

In [81]:
# AVOID
lending_co_data.loc[['LoanID_1', 'LoanID_6']]

KeyError: "None of [Index(['LoanID_1', 'LoanID_6'], dtype='object', name='StringID')] are in the [index]"

In [85]:
lending_co_data.loc[['LoanID_1', 'LoanID_6'], :]

KeyError: "None of [Index(['LoanID_1', 'LoanID_6'], dtype='object', name='StringID')] are in the [index]"

In [None]:
# AVOID
lending_co_data.TotalPrice['LoanID_1']

In [86]:
# AVOID
lending_co_data['TotalPrice'].iloc[[0, 5]]

StringID
id_1    16600.0
id_6        NaN
Name: TotalPrice, dtype: float64

In [84]:
lending_co_data.loc[:, 'TotalPrice'].iloc[[0,5]]

StringID
id_1    16600.0
id_6        NaN
Name: TotalPrice, dtype: float64