<a href="https://colab.research.google.com/github/tsnow2010/DATA601/blob/main/DATA601_Notes/DATA601_Week_7_Notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 7 Notes

### Series (like a vector) to DataFrame (like a matrix)

In [1]:
import pandas as pd
import numpy as np

population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}

area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995,'Maryland': 12407} #area in square miles


#Make a series from population_dict
population = pd.Series(population_dict)

#Make a series from area_dict
area = pd.Series(area_dict)

#put those series into an 'outer' dictionary
states = pd.DataFrame({'population': population,
                       'area': area})
#The keys of the 'outer' dictionary keys became the column names
#while the inner dictionary becomes the row names (index)

In [2]:
states

Unnamed: 0,population,area
California,38332521.0,423967
Florida,19552860.0,170312
Illinois,12882135.0,149995
Maryland,,12407
New York,19651127.0,141297
Texas,26448193.0,695662


In [3]:
states.values

array([[3.8332521e+07, 4.2396700e+05],
       [1.9552860e+07, 1.7031200e+05],
       [1.2882135e+07, 1.4999500e+05],
       [          nan, 1.2407000e+04],
       [1.9651127e+07, 1.4129700e+05],
       [2.6448193e+07, 6.9566200e+05]])

In [4]:
states.index

Index(['California', 'Florida', 'Illinois', 'Maryland', 'New York', 'Texas'], dtype='object')

In [5]:
states.sort_index(axis=1,ascending=True) # Orders the columns alphabetically

Unnamed: 0,area,population
California,423967,38332521.0
Florida,170312,19552860.0
Illinois,149995,12882135.0
Maryland,12407,
New York,141297,19651127.0
Texas,695662,26448193.0


In [6]:
states.sort_index(axis=1,ascending=False) # Orders the columns alphabetically (by name) in reverse
# Changing axis=0 would order rows

Unnamed: 0,population,area
California,38332521.0,423967
Florida,19552860.0,170312
Illinois,12882135.0,149995
Maryland,,12407
New York,19651127.0,141297
Texas,26448193.0,695662


In [7]:
states.sort_values(by='Texas',axis=1) # Sorts columns based on values in columns at Texas

Unnamed: 0,area,population
California,423967,38332521.0
Florida,170312,19552860.0
Illinois,149995,12882135.0
Maryland,12407,
New York,141297,19651127.0
Texas,695662,26448193.0


In [8]:
# Selecting a column this way

states['area']

Unnamed: 0,area
California,423967
Florida,170312
Illinois,149995
Maryland,12407
New York,141297
Texas,695662


In [9]:
# Select a cell, order is column -> row
states['area']['Texas']

695662

In [10]:
# Splicing
states['California':'Illinois']

Unnamed: 0,population,area
California,38332521.0,423967
Florida,19552860.0,170312
Illinois,12882135.0,149995


In [11]:
# Selecting by loc

states.loc['Texas'] # gives row at row, 'Texas'

Unnamed: 0,Texas
population,26448193.0
area,695662.0


In [12]:
# "I want column 'area' and all rows associated with it.
states.loc[:,'area']

Unnamed: 0,area
California,423967
Florida,170312
Illinois,149995
Maryland,12407
New York,141297
Texas,695662


In [13]:
# Another way to get (1) cell, but this is faster than iloc or loc
states.at['Texas','population']

26448193.0

In [14]:
# Yet another way to get (1) cell
states.loc['Texas','population']

26448193.0

In [15]:
states.iloc[3] # gives row at index 3, think "index location"

Unnamed: 0,Maryland
population,
area,12407.0


In [16]:
states.iloc[2:3,0:2] # Another splicing option

Unnamed: 0,population,area
Illinois,12882135.0,149995


In [17]:
# In summary, at,iat,loc,iloc all get a scalar value.  But the at equivalents are fastest.

In [18]:
# Tells you where this is true/false.
states['population']>12882135 # You can also do the entire DataFrame

Unnamed: 0,population
California,True
Florida,True
Illinois,False
Maryland,False
New York,True
Texas,True


In [19]:
#Add another column to df
states_pl = states.copy()
states_pl["political leaning"]=pd.Categorical(['Red','Blue','Red','Blue','Red','Blue'])
states_pl

Unnamed: 0,population,area,political leaning
California,38332521.0,423967,Red
Florida,19552860.0,170312,Blue
Illinois,12882135.0,149995,Red
Maryland,,12407,Blue
New York,19651127.0,141297,Red
Texas,26448193.0,695662,Blue


In [20]:
# Using .isin() function
states_pl['political leaning'].isin(['Red'])

Unnamed: 0,political leaning
California,True
Florida,False
Illinois,True
Maryland,False
New York,True
Texas,False


In [21]:
# How to index with dates
cal = pd.Series(pd.Categorical(['T','W','TH','F','S','SU']),index=pd.date_range('2024-10-08',periods=6))
cal


Unnamed: 0,0
2024-10-08,T
2024-10-09,W
2024-10-10,TH
2024-10-11,F
2024-10-12,S
2024-10-13,SU


In [22]:
# Can add col to states
states_pl['days'] = cal

In [23]:
states_pl

Unnamed: 0,population,area,political leaning,days
California,38332521.0,423967,Red,
Florida,19552860.0,170312,Blue,
Illinois,12882135.0,149995,Red,
Maryland,,12407,Blue,
New York,19651127.0,141297,Red,
Texas,26448193.0,695662,Blue,


In [24]:
# Try replacing all values in a column
states_pl.loc[:,'area'] = np.array([0]*len(states_pl))
states_pl

Unnamed: 0,population,area,political leaning,days
California,38332521.0,0,Red,
Florida,19552860.0,0,Blue,
Illinois,12882135.0,0,Red,
Maryland,,0,Blue,
New York,19651127.0,0,Red,
Texas,26448193.0,0,Blue,


In [25]:
# When adding col's to a DataFrame, if the indices don't match up, the rows are dropped!

In [26]:
#Adding a row to DataFrame
states_pl.loc['Pennsylvania'] = [0,0,'Red',5]

# Below also changes a cell
states_pl.loc['Pennsylvania'] = {'population':1000000}
states_pl

Unnamed: 0,population,area,political leaning,days
California,38332521.0,0.0,Red,
Florida,19552860.0,0.0,Blue,
Illinois,12882135.0,0.0,Red,
Maryland,,0.0,Blue,
New York,19651127.0,0.0,Red,
Texas,26448193.0,0.0,Blue,
Pennsylvania,1000000.0,,,


In [27]:
no_states = states_pl.copy()
no_states.reindex(index=[0,1,2,3,4,5,6,7], columns=list(no_states.columns))
print(no_states)

              population  area political leaning days
California    38332521.0   0.0               Red  NaN
Florida       19552860.0   0.0              Blue  NaN
Illinois      12882135.0   0.0               Red  NaN
Maryland             NaN   0.0              Blue  NaN
New York      19651127.0   0.0               Red  NaN
Texas         26448193.0   0.0              Blue  NaN
Pennsylvania   1000000.0   NaN               NaN  NaN


In [28]:
no_states.reset_index()
no_states

Unnamed: 0,population,area,political leaning,days
California,38332521.0,0.0,Red,
Florida,19552860.0,0.0,Blue,
Illinois,12882135.0,0.0,Red,
Maryland,,0.0,Blue,
New York,19651127.0,0.0,Red,
Texas,26448193.0,0.0,Blue,
Pennsylvania,1000000.0,,,


In [29]:
no_states.dropna(how='any',axis=0) # also takes thresh=<num of NaN not allowed
no_states

Unnamed: 0,population,area,political leaning,days
California,38332521.0,0.0,Red,
Florida,19552860.0,0.0,Blue,
Illinois,12882135.0,0.0,Red,
Maryland,,0.0,Blue,
New York,19651127.0,0.0,Red,
Texas,26448193.0,0.0,Blue,
Pennsylvania,1000000.0,,,


In [31]:
no_states.dropna(subset=['days'])
no_states

Unnamed: 0,population,area,political leaning,days
California,38332521.0,0.0,Red,
Florida,19552860.0,0.0,Blue,
Illinois,12882135.0,0.0,Red,
Maryland,,0.0,Blue,
New York,19651127.0,0.0,Red,
Texas,26448193.0,0.0,Blue,
Pennsylvania,1000000.0,,,


In [32]:
no_states.fillna(0)

  no_states.fillna(0)


Unnamed: 0,population,area,political leaning,days
California,38332521.0,0.0,Red,0
Florida,19552860.0,0.0,Blue,0
Illinois,12882135.0,0.0,Red,0
Maryland,0.0,0.0,Blue,0
New York,19651127.0,0.0,Red,0
Texas,26448193.0,0.0,Blue,0
Pennsylvania,1000000.0,0.0,0,0


In [39]:
no_states.infer_objects(copy=False) # This prevented "downcasting", whatever that means
no_states.dropna(how='any',axis=1)
no_states

Unnamed: 0,population,area,political leaning,days
California,38332521.0,0.0,Red,
Florida,19552860.0,0.0,Blue,
Illinois,12882135.0,0.0,Red,
Maryland,,0.0,Blue,
New York,19651127.0,0.0,Red,
Texas,26448193.0,0.0,Blue,
Pennsylvania,1000000.0,,,


In [40]:
no_states.isna()

Unnamed: 0,population,area,political leaning,days
California,False,False,False,True
Florida,False,False,False,True
Illinois,False,False,False,True
Maryland,True,False,False,True
New York,False,False,False,True
Texas,False,False,False,True
Pennsylvania,False,True,True,True


In [41]:
no_states['population'].mean()

19644472.666666668

In [49]:
# DataFrame with dates as an index and random numbers.
dates = pd.date_range('2024-10-08',periods=6) # Can use this later for indexing.
dates_df = pd.DataFrame(np.random.randint(1,10,size=(6,1)),index=dates, columns=['lucky number'])
dates_df

Unnamed: 0,lucky number
2024-10-08,2
2024-10-09,5
2024-10-10,7
2024-10-11,2
2024-10-12,3
2024-10-13,3


In [55]:
test = pd.Series([5,4,3,2,1,0], index=dates) # MUST HAVE SAME INDEX or it won't work!
test
dates_df.sub(test,axis=0)


Unnamed: 0,lucky number
2024-10-08,-3
2024-10-09,1
2024-10-10,4
2024-10-11,0
2024-10-12,2
2024-10-13,3


In [56]:
# Does cumulative summation on the rows
dates_df.apply(np.cumsum)  # You can use a lambda function here (instead of np.cumsum)

Unnamed: 0,lucky number
2024-10-08,2
2024-10-09,7
2024-10-10,14
2024-10-11,16
2024-10-12,19
2024-10-13,22


In [57]:
dates_df.apply(lambda x: x.max() - x.min())

Unnamed: 0,0
lucky number,5
