## Pandas basics

In [1]:
import numpy as np
import pandas as pd

### The pandas Series object

In [2]:
# Tip: press Shift+Tab inside the call to see the parameters of pd.Series
# Build a Series, a1, from a plain Python list (the index defaults to 0, 1, 2, ...)
a1 = pd.Series(data=['F', 'GM', 'RACE', 'TSLA'])
a1

0       F
1      GM
2    RACE
3    TSLA
dtype: object

In [3]:
# Build a NumPy array of tickers, array1; the next cell turns it into a Series
array1 = np.array(['F', 'GM', 'RACE', 'TSLA'])
array1

array(['F', 'GM', 'RACE', 'TSLA'], dtype='<U4')

In [4]:
# Create the Series a1 from the NumPy array array1 (this replaces the list-based a1)
a1 = pd.Series(data=array1)
a1

0       F
1      GM
2    RACE
3    TSLA
dtype: object

In [5]:
# Access elements of the Series by slicing: everything from position 2 to the end
a1[2:]

2    RACE
3    TSLA
dtype: object

In [6]:
#create a Series object, a1, from a dictionary object, dict1

In [7]:
# Map index labels (keys) to tickers (values)
dict1 = dict(a="F", b="GM", c="RACE", d="TSLA")
dict1

{'a': 'F', 'b': 'GM', 'c': 'RACE', 'd': 'TSLA'}

In [8]:
# Build the Series from the dictionary: the keys become the index labels,
# the values become the data
a1 = pd.Series(data=dict1)
a1

a       F
b      GM
c    RACE
d    TSLA
dtype: object

In [9]:
# Same result as the dictionary route, but supplying the index labels explicitly
b1 = pd.Series(
    data=['F', 'GM', 'RACE', 'TSLA'],
    index=['a', 'b', 'c', 'd'],
)
b1

a       F
b      GM
c    RACE
d    TSLA
dtype: object

In [10]:
# Access elements of the Series by slicing on index labels: from label "c" to the end
b1["c":]

c    RACE
d    TSLA
dtype: object

### DataFrames

In [11]:
# Fix the seed so the draws are reproducible, then generate a 4x5 matrix
# of standard-normal random numbers
np.random.seed(0)
d1 = np.random.standard_normal(size=(4, 5))
d1

array([[ 1.76405235,  0.40015721,  0.97873798,  2.2408932 ,  1.86755799],
       [-0.97727788,  0.95008842, -0.15135721, -0.10321885,  0.4105985 ],
       [ 0.14404357,  1.45427351,  0.76103773,  0.12167502,  0.44386323],
       [ 0.33367433,  1.49407907, -0.20515826,  0.3130677 , -0.85409574]])

In [12]:
# Wrap the matrix in a DataFrame: one row per ticker, one named column per variable
df = pd.DataFrame(
    data=d1,
    index=['F', 'GM', 'RACE', 'TSLA'],
    columns=['sale', 'ni', 'at', 'lt', 'seq'],
)
df

Unnamed: 0,sale,ni,at,lt,seq
F,1.764052,0.400157,0.978738,2.240893,1.867558
GM,-0.977278,0.950088,-0.151357,-0.103219,0.410599
RACE,0.144044,1.454274,0.761038,0.121675,0.443863
TSLA,0.333674,1.494079,-0.205158,0.313068,-0.854096


In [13]:
# Add a new column; the single value 2020 is repeated on every row
df['year'] = 2020
df

Unnamed: 0,sale,ni,at,lt,seq,year
F,1.764052,0.400157,0.978738,2.240893,1.867558,2020
GM,-0.977278,0.950088,-0.151357,-0.103219,0.410599,2020
RACE,0.144044,1.454274,0.761038,0.121675,0.443863,2020
TSLA,0.333674,1.494079,-0.205158,0.313068,-0.854096,2020


In [14]:
# A second reproducible 4x5 matrix of standard-normal draws, using a different seed
np.random.seed(1)
d2 = np.random.standard_normal(size=(4, 5))
d2

array([[ 1.62434536, -0.61175641, -0.52817175, -1.07296862,  0.86540763],
       [-2.3015387 ,  1.74481176, -0.7612069 ,  0.3190391 , -0.24937038],
       [ 1.46210794, -2.06014071, -0.3224172 , -0.38405435,  1.13376944],
       [-1.09989127, -0.17242821, -0.87785842,  0.04221375,  0.58281521]])

In [15]:
# Reminder: a1 holds the tickers as its values, labelled a-d
a1

a       F
b      GM
c    RACE
d    TSLA
dtype: object

In [16]:
# Build the second frame from d2. The row labels are the *values* of a1 (the tickers);
# a1's own a-d labels are not used, so the values are passed explicitly as an array.
# Keyword arguments make it clear which list is the index and which is the columns.
df2 = pd.DataFrame(
    data=d2,
    index=a1.to_numpy(),
    columns=['sale', 'ni', 'at', 'lt', 'seq'],
)
df2

Unnamed: 0,sale,ni,at,lt,seq
F,1.624345,-0.611756,-0.528172,-1.072969,0.865408
GM,-2.301539,1.744812,-0.761207,0.319039,-0.24937
RACE,1.462108,-2.060141,-0.322417,-0.384054,1.133769
TSLA,-1.099891,-0.172428,-0.877858,0.042214,0.582815


In [17]:
# Tag every row of the second frame with its year
df2['year'] = 2019
df2

Unnamed: 0,sale,ni,at,lt,seq,year
F,1.624345,-0.611756,-0.528172,-1.072969,0.865408,2019
GM,-2.301539,1.744812,-0.761207,0.319039,-0.24937,2019
RACE,1.462108,-2.060141,-0.322417,-0.384054,1.133769,2019
TSLA,-1.099891,-0.172428,-0.877858,0.042214,0.582815,2019


In [18]:
# Stack the two frames row-wise; each ticker label now appears twice in the index.
# NOTE: df is rebound to the result, so re-running this cell appends df2 again.
df = pd.concat(objs=[df, df2], axis=0)
df

Unnamed: 0,sale,ni,at,lt,seq,year
F,1.764052,0.400157,0.978738,2.240893,1.867558,2020
GM,-0.977278,0.950088,-0.151357,-0.103219,0.410599,2020
RACE,0.144044,1.454274,0.761038,0.121675,0.443863,2020
TSLA,0.333674,1.494079,-0.205158,0.313068,-0.854096,2020
F,1.624345,-0.611756,-0.528172,-1.072969,0.865408,2019
GM,-2.301539,1.744812,-0.761207,0.319039,-0.24937,2019
RACE,1.462108,-2.060141,-0.322417,-0.384054,1.133769,2019
TSLA,-1.099891,-0.172428,-0.877858,0.042214,0.582815,2019


### Indexes in Pandas


In [19]:
# Move the ticker labels out of the index into an ordinary column (named 'index')
# and give the rows a fresh 0, 1, 2, ... index. reset_index returns a new frame,
# which is bound back to df instead of mutating with inplace=True.
df = df.reset_index()
df

Unnamed: 0,index,sale,ni,at,lt,seq,year
0,F,1.764052,0.400157,0.978738,2.240893,1.867558,2020
1,GM,-0.977278,0.950088,-0.151357,-0.103219,0.410599,2020
2,RACE,0.144044,1.454274,0.761038,0.121675,0.443863,2020
3,TSLA,0.333674,1.494079,-0.205158,0.313068,-0.854096,2020
4,F,1.624345,-0.611756,-0.528172,-1.072969,0.865408,2019
5,GM,-2.301539,1.744812,-0.761207,0.319039,-0.24937,2019
6,RACE,1.462108,-2.060141,-0.322417,-0.384054,1.133769,2019
7,TSLA,-1.099891,-0.172428,-0.877858,0.042214,0.582815,2019


In [20]:
# Give the former index column a meaningful name. rename returns a new frame,
# which is bound back to df instead of mutating with inplace=True.
df = df.rename(columns={'index': 'ticker'})
df


Unnamed: 0,ticker,sale,ni,at,lt,seq,year
0,F,1.764052,0.400157,0.978738,2.240893,1.867558,2020
1,GM,-0.977278,0.950088,-0.151357,-0.103219,0.410599,2020
2,RACE,0.144044,1.454274,0.761038,0.121675,0.443863,2020
3,TSLA,0.333674,1.494079,-0.205158,0.313068,-0.854096,2020
4,F,1.624345,-0.611756,-0.528172,-1.072969,0.865408,2019
5,GM,-2.301539,1.744812,-0.761207,0.319039,-0.24937,2019
6,RACE,1.462108,-2.060141,-0.322417,-0.384054,1.133769,2019
7,TSLA,-1.099891,-0.172428,-0.877858,0.042214,0.582815,2019


In [21]:
# Sort by ticker, then by year within each ticker (ascending by default).
# The row labels travel with their rows, so the index is no longer in order.
# sort_values returns a new frame, which is bound back to df instead of using inplace=True.
df = df.sort_values(by=['ticker', 'year'])
df

Unnamed: 0,ticker,sale,ni,at,lt,seq,year
4,F,1.624345,-0.611756,-0.528172,-1.072969,0.865408,2019
0,F,1.764052,0.400157,0.978738,2.240893,1.867558,2020
5,GM,-2.301539,1.744812,-0.761207,0.319039,-0.24937,2019
1,GM,-0.977278,0.950088,-0.151357,-0.103219,0.410599,2020
6,RACE,1.462108,-2.060141,-0.322417,-0.384054,1.133769,2019
2,RACE,0.144044,1.454274,0.761038,0.121675,0.443863,2020
7,TSLA,-1.099891,-0.172428,-0.877858,0.042214,0.582815,2019
3,TSLA,0.333674,1.494079,-0.205158,0.313068,-0.854096,2020


In [22]:
# Same sort keys, but descending on both ticker and year.
# sort_values returns a new frame, which is bound back to df instead of using inplace=True.
df = df.sort_values(by=['ticker', 'year'], ascending=False)
df

Unnamed: 0,ticker,sale,ni,at,lt,seq,year
3,TSLA,0.333674,1.494079,-0.205158,0.313068,-0.854096,2020
7,TSLA,-1.099891,-0.172428,-0.877858,0.042214,0.582815,2019
2,RACE,0.144044,1.454274,0.761038,0.121675,0.443863,2020
6,RACE,1.462108,-2.060141,-0.322417,-0.384054,1.133769,2019
1,GM,-0.977278,0.950088,-0.151357,-0.103219,0.410599,2020
5,GM,-2.301539,1.744812,-0.761207,0.319039,-0.24937,2019
0,F,1.764052,0.400157,0.978738,2.240893,1.867558,2020
4,F,1.624345,-0.611756,-0.528172,-1.072969,0.865408,2019


### DataFrame attributes

In [23]:
# The `columns` attribute holds the column labels; convert it to a plain Python list
df.columns.tolist()

['ticker', 'sale', 'ni', 'at', 'lt', 'seq', 'year']

In [24]:
# Row labels, in the frame's current (sorted) row order
df.index

Index([3, 7, 2, 6, 1, 5, 0, 4], dtype='int64')

In [25]:
# Data type of each column
df.dtypes

ticker     object
sale      float64
ni        float64
at        float64
lt        float64
seq       float64
year        int64
dtype: object

In [26]:
# (number of rows, number of columns)
df.shape

(8, 7)

In [27]:
# Printed summary: index range, columns, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, 3 to 4
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ticker  8 non-null      object 
 1   sale    8 non-null      float64
 2   ni      8 non-null      float64
 3   at      8 non-null      float64
 4   lt      8 non-null      float64
 5   seq     8 non-null      float64
 6   year    8 non-null      int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 512.0+ bytes


In [28]:
# Data type of a single column
df['at'].dtype

dtype('float64')

In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) for one column
df['at'].describe()

count    8.000000
mean    -0.138299
std      0.674228
min     -0.877858
25%     -0.586431
50%     -0.263788
75%      0.076742
max      0.978738
Name: at, dtype: float64

In [30]:
# Summary statistics for the numeric columns, reported at a custom set of percentiles
df.describe(percentiles=[0.01, 0.10, 0.25, 0.50, 0.75, 0.90, 0.99])

Unnamed: 0,sale,ni,at,lt,seq,year
count,8.0,8.0,8.0,8.0,8.0,8.0
mean,0.118689,0.399886,-0.138299,0.184581,0.525068,2019.5
std,1.481767,1.299628,0.674228,0.946787,0.829243,0.534522
min,-2.301539,-2.060141,-0.877858,-1.072969,-0.854096,2019.0
1%,-2.217423,-1.958754,-0.869693,-1.024745,-0.811765,2019.0
10%,-1.460385,-1.046272,-0.796202,-0.590729,-0.430788,2019.0
25%,-1.007931,-0.28226,-0.586431,-0.173428,0.245606,2019.0
50%,0.238859,0.675123,-0.263788,0.081944,0.513339,2019.5
75%,1.502667,1.464225,0.076742,0.314561,0.932498,2020.0
90%,1.666257,1.569299,0.826348,0.895595,1.353906,2020.0


### Accessing DataFrames

In [31]:
# First 3 rows
df.head(3)

Unnamed: 0,ticker,sale,ni,at,lt,seq,year
3,TSLA,0.333674,1.494079,-0.205158,0.313068,-0.854096,2020
7,TSLA,-1.099891,-0.172428,-0.877858,0.042214,0.582815,2019
2,RACE,0.144044,1.454274,0.761038,0.121675,0.443863,2020


In [32]:
# First 2 rows
df.head(2)

Unnamed: 0,ticker,sale,ni,at,lt,seq,year
3,TSLA,0.333674,1.494079,-0.205158,0.313068,-0.854096,2020
7,TSLA,-1.099891,-0.172428,-0.877858,0.042214,0.582815,2019


In [33]:
# Last rows of the frame; with no argument, tail() shows 5
df.tail()

Unnamed: 0,ticker,sale,ni,at,lt,seq,year
6,RACE,1.462108,-2.060141,-0.322417,-0.384054,1.133769,2019
1,GM,-0.977278,0.950088,-0.151357,-0.103219,0.410599,2020
5,GM,-2.301539,1.744812,-0.761207,0.319039,-0.24937,2019
0,F,1.764052,0.400157,0.978738,2.240893,1.867558,2020
4,F,1.624345,-0.611756,-0.528172,-1.072969,0.865408,2019


In [34]:
# A plain [] slice on a DataFrame selects rows by position (here the first two),
# just like the slicing seen in previous lectures
df[:2]

Unnamed: 0,ticker,sale,ni,at,lt,seq,year
3,TSLA,0.333674,1.494079,-0.205158,0.313068,-0.854096,2020
7,TSLA,-1.099891,-0.172428,-0.877858,0.042214,0.582815,2019


In [35]:
# We cannot, however, slice rows and columns together with the plain [] notation.
# The line below is left commented out on purpose: it raises an error if run.
#df[:2,:2]

In [36]:
# Instead, use the integer-position indexer, .iloc[] (square brackets: it is indexed, not called)
df.iloc[:2]

Unnamed: 0,ticker,sale,ni,at,lt,seq,year
3,TSLA,0.333674,1.494079,-0.205158,0.313068,-0.854096,2020
7,TSLA,-1.099891,-0.172428,-0.877858,0.042214,0.582815,2019


In [37]:
# Rows and columns together: first 2 rows, columns from position 2 onward
df.iloc[:2, 2:]

Unnamed: 0,ni,at,lt,seq,year
3,1.494079,-0.205158,0.313068,-0.854096,2020
7,-0.172428,-0.877858,0.042214,0.582815,2019


In [38]:
# We can also use the label-based indexer, .loc[] (square brackets, like .iloc[]).
# 3:3 is a slice on index *labels*, so this picks the row labelled 3,
# which is the first row of the sorted frame, not the fourth.
df.loc[3:3, ['ticker','sale']]

Unnamed: 0,ticker,sale
3,TSLA,0.333674


In [39]:
# All rows (:), only the listed columns
df.loc[:, ['ticker','ni']]

Unnamed: 0,ticker,ni
3,TSLA,1.494079
7,TSLA,-0.172428
2,RACE,1.454274
6,RACE,-2.060141
1,GM,0.950088
5,GM,1.744812
0,F,0.400157
4,F,-0.611756


In [40]:
# Shorthand: a list of column names inside [] selects those columns,
# giving the same result as the .loc call above
df[['ticker','ni']]

Unnamed: 0,ticker,ni
3,TSLA,1.494079
7,TSLA,-0.172428
2,RACE,1.454274
6,RACE,-2.060141
1,GM,0.950088
5,GM,1.744812
0,F,0.400157
4,F,-0.611756
