# Introduction to Pandas

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Seed NumPy's global generator so that "Restart Kernel -> Run All" always
# produces the same pseudo-random numbers in this notebook.
np.random.seed(42)

# 1-D array of 10 floats drawn uniformly from [0, 1)
array = np.random.random(10)

array

In [None]:
# Wrap the NumPy array in a pandas Series (no explicit index is passed)
serie = pd.Series(array)

serie

In [None]:
# Row labels of the Series
serie.index

In [None]:
# The underlying data of the Series, without the labels
serie.values

In [None]:
# Look up a single element by its index label (here the label 3)
serie[3]

In [None]:
# Five column labels: 'col1' ... 'col5'
colnames = [f'col{i}' for i in range(1, 6)]

# 10 rows x 5 columns of floats drawn uniformly from [0, 1)
df = pd.DataFrame(data=np.random.random(size=(10, 5)), columns=colnames)
df

In [None]:
# Row labels of the DataFrame, converted to a plain Python list
df.index.to_list()

In [None]:
# The cell values of the DataFrame, without row or column labels
df.values

In [None]:
# Single brackets with one column name return a Series
series = df['col1']

series

In [None]:
# Beware: selecting several columns needs a list inside the brackets
# (note the double [[ ]]); the result is a DataFrame
df[['col1', 'col2', 'col3']]

In [None]:
# Without the inner list, 'col1', 'col2', 'col3' is passed as ONE tuple key,
# which is not a column of df, so pandas raises KeyError. The error is the
# point of this cell: catch it and show it so that "Run All" keeps going.
try:
    df['col1', 'col2', 'col3']
except KeyError as err:
    print(f'KeyError: {err}')

In [None]:
# A one-element list still returns a DataFrame (compare with df['col1'], a Series)
df[['col1']]

In [None]:
# Ten values that become the rows of the 'SalePrice' column
lst = [
    208500, 181500, 223500, 140000, 250000,
    143000, 307000, 200000, 129900, 118000,
]

# A {column name: values} dict builds the same single-column DataFrame
price_df = pd.DataFrame({'SalePrice': lst})
price_df

In [None]:
# Column labels, in the same order as the fields of each row below
colnames = ['LotSize', 'Neighborhood', 'YearBuilt', 'Quality', 'SalePrice']

# One inner list per house, i.e. one row of the table
lst_lst = [
    [8450, 'CollgCr', 2003, 7, 208500],
    [9600, 'Veenker', 1976, 6, 181500],
    [11250, 'CollgCr', 2001, 7, 223500],
    [9550, 'Crawfor', 1915, 7, 140000],
    [14260, 'NoRidge', 2000, 8, 250000],
    [14115, 'Mitchel', 1993, 5, 143000],
    [10084, 'Somerst', 2004, 8, 307000],
    [10382, 'NWAmes', 1973, 7, 200000],
    [6120, 'OldTown', 1931, 7, 129900],
    [7420, 'BrkSide', 1939, 5, 118000],
]

df = pd.DataFrame(data=lst_lst, columns=colnames)

df

In [None]:
# Column labels of the DataFrame
df.columns

In [None]:
# Data type of each column
df.dtypes

In [None]:
# One entry per house: the key is the house name and the value is a list of
# five fields, in the same order as `colnames`
# (LotSize, Neighborhood, YearBuilt, Quality, SalePrice).
house_dict = {'Baker House': [7420, 'BrkSide', 1939, 5, 123054],
              'Beazley House': [14115, 'Mitchel', 1993, 5, 143000],
              'Dominguez House': [14260, 'NoRidge', 2000, 8, 250000],
              'Hamilton House': [6120, 'OldTown', 1931, 7, 129900],
              'James House': [11250, 'CollgCr', 2001, 7, 223500],
              'Martinez House': [9600, 'Veenker', 1976, 6, 181500],
              'Roberts House': [9550, 'Crawfor', 1915, 7, 140000],
              'Smith House': [8450, 'CollgCr', 2003, 7, 208500],
              'Snyder House': [10084, 'Somerst', 2004, 8, 307000],
              'Zuckerman House': [10382, 'NWAmes', 1973, 7, 200000]}

In [None]:
# Built directly from the dict, every key (house name) becomes a column
df = pd.DataFrame(house_dict)
df

In [None]:
# Each column now mixes numbers and strings, so check which dtype pandas assigned
df.dtypes

In [None]:
# Transpose: swap rows and columns (df itself is not modified)
df.T

In [None]:
# After transposing, the row labels are the house names (the dict keys)
df.T.index

In [None]:
# The columns of the transposed frame are the original row labels
# (no names have been given to them yet)
df.T.columns

In [None]:
# Transpose so that each house becomes a row, then label the five columns.
# The frame is rebuilt from house_dict instead of from `df` itself so the cell
# is idempotent: re-running `df = df.T` would flip the table back, and the
# column assignment below would then fail (10 columns vs. 5 names).
df = pd.DataFrame(house_dict).T
df.columns = colnames

df

In [36]:
# orient='index' turns every dict key into a row label (instead of a column
# name); columns= names the five fields in the same call.
df = pd.DataFrame.from_dict(house_dict, orient='index', columns=colnames)

df

Unnamed: 0,LotSize,Neighborhood,YearBuilt,Quality,SalePrice
Baker House,7420,BrkSide,1939,5,123054
Beazley House,14115,Mitchel,1993,5,143000
Dominguez House,14260,NoRidge,2000,8,250000
Hamilton House,6120,OldTown,1931,7,129900
James House,11250,CollgCr,2001,7,223500
Martinez House,9600,Veenker,1976,6,181500
Roberts House,9550,Crawfor,1915,7,140000
Smith House,8450,CollgCr,2003,7,208500
Snyder House,10084,Somerst,2004,8,307000
Zuckerman House,10382,NWAmes,1973,7,200000


In [None]:
# Total price of all houses sold
print(df['SalePrice'].sum())

# Average lot size of houses sold
print(df['LotSize'].mean())

# The latest year a house in the data set was built
print(df['YearBuilt'].max())

# The earliest year a house in the data set was built
print(df['YearBuilt'].min())

In [None]:
# Rows ordered by SalePrice, highest first
df.sort_values(by='SalePrice', ascending=False)

In [None]:
# The LotSize column only, smallest value first
df['LotSize'].sort_values(ascending=True)

In [37]:
series = df['LotSize']

# sort_values() returns a sorted result; `series` itself keeps its original order
series.sort_values()

Hamilton House      6120
Baker House         7420
Smith House         8450
Roberts House       9550
Martinez House      9600
Snyder House       10084
Zuckerman House    10382
James House        11250
Beazley House      14115
Dominguez House    14260
Name: LotSize, dtype: int64

In [38]:
# Still in the original row order: sort_values() did not modify `series`
series

Baker House         7420
Beazley House      14115
Dominguez House    14260
Hamilton House      6120
James House        11250
Martinez House      9600
Roberts House       9550
Smith House         8450
Snyder House       10084
Zuckerman House    10382
Name: LotSize, dtype: int64

In [42]:
# Row labels of the Series: the house names
series.index

Index(['Baker House', 'Beazley House', 'Dominguez House', 'Hamilton House',
       'James House', 'Martinez House', 'Roberts House', 'Smith House',
       'Snyder House', 'Zuckerman House'],
      dtype='object')

In [43]:
# The data as a NumPy array, without the labels
series.values

array([ 7420, 14115, 14260,  6120, 11250,  9600,  9550,  8450, 10084,
       10382])

In [41]:
# Double brackets: a DataFrame with a single column
df[['LotSize']]

Unnamed: 0,LotSize
Baker House,7420
Beazley House,14115
Dominguez House,14260
Hamilton House,6120
James House,11250
Martinez House,9600
Roberts House,9550
Smith House,8450
Snyder House,10084
Zuckerman House,10382


## Bonus

In [47]:
# The full table, for reference
df

Unnamed: 0,LotSize,Neighborhood,YearBuilt,Quality,SalePrice
Baker House,7420,BrkSide,1939,5,123054
Beazley House,14115,Mitchel,1993,5,143000
Dominguez House,14260,NoRidge,2000,8,250000
Hamilton House,6120,OldTown,1931,7,129900
James House,11250,CollgCr,2001,7,223500
Martinez House,9600,Veenker,1976,6,181500
Roberts House,9550,Crawfor,1915,7,140000
Smith House,8450,CollgCr,2003,7,208500
Snyder House,10084,Somerst,2004,8,307000
Zuckerman House,10382,NWAmes,1973,7,200000


In [45]:
# Plain assignment: series_1 is a second name for the same Series object, not a copy
series_1 = series

series_1

Baker House         7420
Beazley House      14115
Dominguez House    14260
Hamilton House      6120
James House        11250
Martinez House      9600
Roberts House       9550
Smith House         8450
Snyder House       10084
Zuckerman House    10382
Name: LotSize, dtype: int64

### filter

In [46]:
# series (watch the index)
# Boolean mask: True for every entry greater than 10000
lotsize_filter = series_1 > 10000

# Index the same Series the mask was built from, so the mask's labels are
# guaranteed to line up with the data being filtered.
series_1[lotsize_filter]

Beazley House      14115
Dominguez House    14260
James House        11250
Snyder House       10084
Zuckerman House    10382
Name: LotSize, dtype: int64

In [50]:
# print() shows the plain-text representation of the DataFrame
print(df)

                 LotSize Neighborhood  YearBuilt  Quality  SalePrice
Baker House         7420      BrkSide       1939        5     123054
Beazley House      14115      Mitchel       1993        5     143000
Dominguez House    14260      NoRidge       2000        8     250000
Hamilton House      6120      OldTown       1931        7     129900
James House        11250      CollgCr       2001        7     223500
Martinez House      9600      Veenker       1976        6     181500
Roberts House       9550      Crawfor       1915        7     140000
Smith House         8450      CollgCr       2003        7     208500
Snyder House       10084      Somerst       2004        8     307000
Zuckerman House    10382       NWAmes       1973        7     200000


In [51]:
# display() renders the DataFrame with the notebook's rich table output
display(df)

Unnamed: 0,LotSize,Neighborhood,YearBuilt,Quality,SalePrice
Baker House,7420,BrkSide,1939,5,123054
Beazley House,14115,Mitchel,1993,5,143000
Dominguez House,14260,NoRidge,2000,8,250000
Hamilton House,6120,OldTown,1931,7,129900
James House,11250,CollgCr,2001,7,223500
Martinez House,9600,Veenker,1976,6,181500
Roberts House,9550,Crawfor,1915,7,140000
Smith House,8450,CollgCr,2003,7,208500
Snyder House,10084,Somerst,2004,8,307000
Zuckerman House,10382,NWAmes,1973,7,200000


In [55]:
# dataframe
# Row masks: one boolean per house
lotsize_filter = df['LotSize'].gt(10000)
quality_filter = df['Quality'].gt(7)

# Element-wise counterparts of the Python boolean keywords:
#   and -> &
#   or  -> |
#   not -> ~
display(
    df[lotsize_filter & quality_filter],     # both conditions hold
    df[lotsize_filter | quality_filter],     # at least one condition holds
    df[~(lotsize_filter & quality_filter)],  # NOT (both conditions hold)
)

Unnamed: 0,LotSize,Neighborhood,YearBuilt,Quality,SalePrice
Dominguez House,14260,NoRidge,2000,8,250000
Snyder House,10084,Somerst,2004,8,307000


Unnamed: 0,LotSize,Neighborhood,YearBuilt,Quality,SalePrice
Beazley House,14115,Mitchel,1993,5,143000
Dominguez House,14260,NoRidge,2000,8,250000
James House,11250,CollgCr,2001,7,223500
Snyder House,10084,Somerst,2004,8,307000
Zuckerman House,10382,NWAmes,1973,7,200000


Unnamed: 0,LotSize,Neighborhood,YearBuilt,Quality,SalePrice
Baker House,7420,BrkSide,1939,5,123054
Beazley House,14115,Mitchel,1993,5,143000
Hamilton House,6120,OldTown,1931,7,129900
James House,11250,CollgCr,2001,7,223500
Martinez House,9600,Veenker,1976,6,181500
Roberts House,9550,Crawfor,1915,7,140000
Smith House,8450,CollgCr,2003,7,208500
Zuckerman House,10382,NWAmes,1973,7,200000


In [57]:
# Select two columns (a list inside the brackets returns a DataFrame)
df[['LotSize', 'Quality']]

Unnamed: 0,LotSize,Quality
Baker House,7420,5
Beazley House,14115,5
Dominguez House,14260,8
Hamilton House,6120,7
James House,11250,7
Martinez House,9600,6
Roberts House,9550,7
Smith House,8450,7
Snyder House,10084,8
Zuckerman House,10382,7


In [58]:
# Pick the columns first, then keep only the rows where both masks are True
df[['LotSize', 'Quality']][lotsize_filter & quality_filter]

Unnamed: 0,LotSize,Quality
Dominguez House,14260,8
Snyder House,10084,8


In [59]:
# Same result the other way round: filter the rows first, then pick the columns
df[lotsize_filter & quality_filter][['LotSize', 'Quality']]

Unnamed: 0,LotSize,Quality
Dominguez House,14260,8
Snyder House,10084,8


### map (with `apply`)

In [60]:
# The Series used as input for the apply() example in the next cell
series_1

Baker House         7420
Beazley House      14115
Dominguez House    14260
Hamilton House      6120
James House        11250
Martinez House      9600
Roberts House       9550
Smith House         8450
Snyder House       10084
Zuckerman House    10382
Name: LotSize, dtype: int64

In [61]:
# Series.apply calls the function once per element: every value is doubled
series_1.apply(lambda x: x * 2)

Baker House        14840
Beazley House      28230
Dominguez House    28520
Hamilton House     12240
James House        22500
Martinez House     19200
Roberts House      19100
Smith House        16900
Snyder House       20168
Zuckerman House    20764
Name: LotSize, dtype: int64

In [62]:
# DataFrame.apply calls the function once per column (each column arrives as a
# Series), so both selected columns are doubled. The result is only displayed,
# not assigned: `df` itself is left unchanged by this cell.
# NOTE(review): the saved outputs of the later cells show LotSize and Quality
# already doubled inside `df`, and the execution counter jumps from 62 to 68 -
# presumably a cell that assigned this result back was run and then deleted;
# verify with Restart Kernel -> Run All.
df[['LotSize', 'Quality']].apply(lambda x: x * 2)

Unnamed: 0,LotSize,Quality
Baker House,14840,10
Beazley House,28230,10
Dominguez House,28520,16
Hamilton House,12240,14
James House,22500,14
Martinez House,19200,12
Roberts House,19100,14
Smith House,16900,14
Snyder House,20168,16
Zuckerman House,20764,14


In [68]:
# The LotSize column; it is the input of the two column assignments that follow
series_to_add = df['LotSize']

In [71]:
# Assigning to a column name that does not exist yet creates a new column;
# here it holds the square of each LotSize value
df['my_new_column'] = series_to_add.apply(lambda x: x ** 2)

In [72]:
# First rows of the table, now including my_new_column
df.head()

Unnamed: 0,LotSize,Neighborhood,YearBuilt,Quality,SalePrice,my_new_column
Baker House,14840,BrkSide,1939,10,123054,220225600
Beazley House,28230,Mitchel,1993,10,143000,796932900
Dominguez House,28520,NoRidge,2000,16,250000,813390400
Hamilton House,12240,OldTown,1931,14,129900,149817600
James House,22500,CollgCr,2001,14,223500,506250000


In [73]:
# Assigning to an existing column name replaces its values;
# the lambda ignores its input, so every row of Quality becomes 0
df['Quality'] = series_to_add.apply(lambda x: 0)

In [74]:
# Quality is now 0 in every row
df

Unnamed: 0,LotSize,Neighborhood,YearBuilt,Quality,SalePrice,my_new_column
Baker House,14840,BrkSide,1939,0,123054,220225600
Beazley House,28230,Mitchel,1993,0,143000,796932900
Dominguez House,28520,NoRidge,2000,0,250000,813390400
Hamilton House,12240,OldTown,1931,0,129900,149817600
James House,22500,CollgCr,2001,0,223500,506250000
Martinez House,19200,Veenker,1976,0,181500,368640000
Roberts House,19100,Crawfor,1915,0,140000,364810000
Smith House,16900,CollgCr,2003,0,208500,285610000
Snyder House,20168,Somerst,2004,0,307000,406748224
Zuckerman House,20764,NWAmes,1973,0,200000,431143696
