In [10]:
import pandas as pd 
import numpy as np

## Pandas Data Structures 

In [50]:
## Series: a one-dimensional labelled array
# With no explicit index, pandas assigns the positions 0..N-1 as labels.
ser1 = pd.Series(data=['x1', 'x2', 'x3'])

# Custom labels are supplied through `index`.
ind = list('abc')
ser2 = pd.Series(data=['x1', 'x2', 'x3'], index=ind)

# Entries can be looked up by label ...
ser2['a']
# ... which a plain numpy array cannot do: it only has implicit integer positions.
nump1 = np.array([1, 2, 3])

# Mental model: a Series behaves like a fixed-length ordered dict (label -> value).
ser3 = dict(a=1, b=2)

sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
states = ['California', 'Ohio', 'Oregon', 'Texas']
# Building from a dict with an explicit index keeps only the requested labels:
# 'California' is missing from sdata so it becomes NaN, and 'Utah' is left out.
ser4 = pd.Series(data=sdata, index=states)
# Arithmetic aligns on labels, not positions; a label missing on either side gives NaN.
ser4 + pd.Series({state: -count for state, count in sdata.items()})

## Inspecting the index / values, then naming the Series and its index
ser4.index
ser4.values
ser4.name = 'pop'
ser4.index.name = 'state'


## DataFrame
## @note Think of it as a dict of Series that all share the same index
## Two common constructors: a) dict of equal-length lists, b) dict of dicts
## a) each key is a column; the row labels come from `index`
dict1 = pd.DataFrame(
    data={"CO": [300, 200, 150], "CA": [100, 200, 300]},
    index=[2000, 2001, 2002],
).rename_axis(index="Year", columns="State")
## b) @note outer keys become columns, inner keys become the row index
dict2 = pd.DataFrame(
    data={
        "CA": {"2000": 100, "2001": 200},
        "CO": {"2000": 150, "2001": 250},
    }
)

## Column selection
dict2.loc[:, "CA"]
## Row selection by label (row indexing is covered in more detail later)
dict2.loc["2000"]




CA    100
CO    150
Name: 2000, dtype: int64

## Essential Functionalities

In [146]:
## Reindexing
## Conform rows / columns to a new set of labels; .loc does plain label selection
reindex = pd.Series([1, 2, 3], index=list("abc"))
# Rows are reordered; 'd' is not in the original Series, so it is added as NaN
reindex.reindex(index=["b", "c", "a", "d"])

reindex = pd.DataFrame(data={"a": [1, 2, 3], "b": [3, 4, 5]}, index=['x1', 'x2', 'x3'])
# @note the `columns=` keyword targets the column axis
reindex.reindex(columns=["b", "a"])
# Same column order plus a row subset, written as plain label selection
reindex.loc[['x2', 'x3'], ['b', 'a']]


## Dropping entries from an axis
## drop() returns a new object by default; `reindex` itself is left untouched
reindex.drop(index='x1')    # result without the 'x1' row
reindex.drop(columns='a')   # result without the 'a' column
## Pass inplace=True to modify the object itself instead

## Indexing, Selection and Filtering
## Works like numpy indexing, except labels can be used instead of only integers
reindex.loc['x2':'x3', 'a']
reindex['a']  # reindex[['a', 'c']] would fail: there is no 'c' column

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
# Boolean mask: keep only the rows whose 'three' value is greater than 5
data[data['three'] > 5]

## Selection with loc vs. iloc @important
## @note: loc == label indexing and iloc == integer (positional) indexing
# @note a slice cannot be placed inside a list (.loc[["Ohio":"Colorado"]] is a
# syntax error); write the slice directly instead:
# data.loc['Ohio':'Colorado', 'one'] (label slices include both end points)
data.loc[['Ohio', 'Colorado'], 'one']
data.iloc[[0, 1], 0]

## Integer indexing
## Plain [] on a Series looks up labels, so use iloc whenever a position is meant
test = pd.Series([1, 2, 3], index=range(4, 7))
# test[1]  # Fails with KeyError: the labels are 4, 5, 6 - there is no label 1
test.iloc[1]

## With non-integer labels, older pandas let test[2] silently fall back to the
## position, but that fallback is deprecated (FutureWarning in pandas 2.x).
## iloc states the intent explicitly and behaves the same on every version.
test = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
test.iloc[2]  # Last value, 3

## Function application and mapping
# A fixed seed keeps the random frame (and every output derived from it)
# identical on each Restart & Run All
frame = pd.DataFrame(np.random.default_rng(seed=0).standard_normal((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
## Apply a function to each row / col
## Ex. of calculating max - min
frame.apply(lambda x: x.max() - x.min(), axis=1)  # axis=1: once per row, i.e. across the columns

## Can also return multiple values
def f(x):
    """Return the smallest and largest entry of ``x`` as a two-element Series.

    Parameters
    ----------
    x : pandas.Series or other list-like
        The values to summarise, e.g. one row or column handed over by
        ``DataFrame.apply``.

    Returns
    -------
    pandas.Series
        Two entries labelled ``'min'`` and ``'max'``. When returned from
        ``DataFrame.apply`` these labels become the columns of the result.
    """
    values = pd.Series(x)
    # Series.min()/max() skip NaN, whereas the builtin min()/max() give a result
    # that depends on where the NaN sits in the data.
    return pd.Series([values.min(), values.max()], index=['min', 'max'])
# Wrapping f in a lambda adds nothing, so this form is only kept for reference:
#   frame.apply(lambda x: f(x), axis = 1)
frame.apply(f, axis='columns')  # pass the function itself; it is called once per row


## Sorting and Ranking
## sort_index orders by labels, sort_values orders by the data in a column
print(frame)
frame.sort_index(ascending=False)
frame.sort_values(by='b', ascending=False)
# Rank within each row; method='first' breaks ties by order of appearance
frame.rank(method='first', axis='columns')
    


               b         d         e
Utah   -0.843499  0.101592 -1.735070
Ohio   -0.777616  0.319189 -0.553669
Texas   0.003353  2.263836 -0.171703
Oregon  0.803797  0.701730 -0.166434


Unnamed: 0,b,d,e
Utah,2.0,3.0,1.0
Ohio,1.0,3.0,2.0
Texas,2.0,3.0,1.0
Oregon,3.0,2.0,1.0
