In [54]:
#import pandas as panel data
import pandas as pd 
#import numpy as np
import numpy as np

In [11]:
#creating a series object
s = pd.Series(
        np.random.randn(5),
        index=['a','b','c','d','e'],
        name='example')
s

a    1.176892
b    0.594525
c    0.046952
d   -0.728211
e    0.152048
Name: example, dtype: float64

In [8]:
pd.Series(5, index=['a','b','c','d','e'])

a    5
b    5
c    5
d    5
e    5
dtype: int64

In [12]:
#First element of a series
s[0]

1.1768919299990448

In [16]:
#Slicing a series
s[:3]

a    1.176892
b    0.594525
c    0.046952
Name: example, dtype: float64

In [15]:
#Indexes in a particular order
s[[4, 3, 1]]

e    0.152048
d   -0.728211
b    0.594525
Name: example, dtype: float64

In [17]:
#Get at underlying data behind series
s.values

array([ 1.17689193,  0.59452477,  0.04695247, -0.72821073,  0.15204761])

In [18]:
#Selecting where the index is something
s['e'] = 500
s

a      1.176892
b      0.594525
c      0.046952
d     -0.728211
e    500.000000
Name: example, dtype: float64

In [19]:
#Index into a series using an array of T/F
s[[True, True, False, False, True]]

a      1.176892
b      0.594525
e    500.000000
Name: example, dtype: float64

In [20]:
#Which of our series elements is greater than 0?
s > 0

a     True
b     True
c     True
d    False
e     True
Name: example, dtype: bool

In [21]:
#or the extremely common
s[s > 0], s > 0

(a      1.176892
 b      0.594525
 c      0.046952
 e    500.000000
 Name: example, dtype: float64,
 a     True
 b     True
 c     True
 d    False
 e     True
 Name: example, dtype: bool)

In [24]:
#and you can mutate the data too
s[s < 0.5] *= -1
s

a      1.176892
b      0.594525
c     -0.046952
d      0.728211
e    500.000000
Name: example, dtype: float64

In [25]:
#Add them
s + s

a       2.353784
b       1.189050
c      -0.093905
d       1.456421
e    1000.000000
Name: example, dtype: float64

In [None]:
#Exponentiate them
np.exp(s)

In [26]:
#columnar aggregate computations
s.mean()

100.49053499384031

In [27]:
s.abs()

a      1.176892
b      0.594525
c      0.046952
d      0.728211
e    500.000000
Name: example, dtype: float64

In [28]:
#if the indexes dont match up you will get nans
s + s[s > 0]

a       2.353784
b       1.189050
c            NaN
d       1.456421
e    1000.000000
Name: example, dtype: float64

In [42]:
#constructing a data frame
d = {'one' : pd.Series([1.,2.,3.], index=['a','b','c']),
     'two' : pd.Series([1.,2.,3.,4.], index=['a','b','c','d'])}

df = pd.DataFrame(d)

df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [43]:
#passing np arrays and scalars

d = {'one' : 'Mellow',
    'two' : np.array([1.,2.,3.,4.])}

df = pd.DataFrame(d)
df

Unnamed: 0,one,two
0,Mellow,1.0
1,Mellow,2.0
2,Mellow,3.0
3,Mellow,4.0


In [44]:
df.columns = ['1', '2']
df.index = ['a', 'b','c','d']
df

Unnamed: 0,1,2
a,Mellow,1.0
b,Mellow,2.0
c,Mellow,3.0
d,Mellow,4.0


In [45]:
d = {'one' : 'Hellow',
    'two' : np.array([1.,2.,3.,4.])}

df = pd.DataFrame(d)

df

Unnamed: 0,one,two
0,Hellow,1.0
1,Hellow,2.0
2,Hellow,3.0
3,Hellow,4.0


In [47]:
d = {'one' : 'Hellow',
     'two' : np.array([1., 2., 3., 4.])}

df = pd.DataFrame(d)
df.index = ['a', 'b','c','d']

#named series
df['one']

a    Hellow
b    Hellow
c    Hellow
d    Hellow
Name: one, dtype: object

In [None]:
del df['one']

In [48]:
df['three'] = df['two'] + df['two']
df['four'] = 'four'
df['five'] = df['four'][:2]

In [49]:
df

Unnamed: 0,one,two,three,four,five
a,Hellow,1.0,2.0,four,four
b,Hellow,2.0,4.0,four,four
c,Hellow,3.0,6.0,four,
d,Hellow,4.0,8.0,four,


In [None]:
#get a column
df['two']

In [None]:
#or more
df[['five','two']]

In [None]:
# select by indexes and column names
df.loc['a', 'two']

In [50]:
df.loc['d':'a':-1, 'two':'three']

Unnamed: 0,two,three
d,4.0,8.0
c,3.0,6.0
b,2.0,4.0
a,1.0,2.0


In [None]:
#select rows and columns by their ordering 
df.iloc[1:3, 0]

In [51]:
df.iloc[1:3]

Unnamed: 0,one,two,three,four,five
b,Hellow,2.0,4.0,four,four
c,Hellow,3.0,6.0,four,


In [52]:
#Dataframewise operations
df.copy()

Unnamed: 0,one,two,three,four,five
a,Hellow,1.0,2.0,four,four
b,Hellow,2.0,4.0,four,four
c,Hellow,3.0,6.0,four,
d,Hellow,4.0,8.0,four,


In [56]:
df.two.astype(np.int)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df.two.astype(np.int)


a    1
b    2
c    3
d    4
Name: two, dtype: int64

In [57]:
#Rows to columns, and vice versa
df.T

Unnamed: 0,a,b,c,d
one,Hellow,Hellow,Hellow,Hellow
two,1.0,2.0,3.0,4.0
three,2.0,4.0,6.0,8.0
four,four,four,four,four
five,four,four,,


In [None]:
#Visualize: gives top row
df.head(2)

In [59]:
#random sample of data
df.sample(2)

Unnamed: 0,one,two,three,four,five
a,Hellow,1.0,2.0,four,four
c,Hellow,3.0,6.0,four,


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     4 non-null      object 
 1   two     4 non-null      float64
 2   three   4 non-null      float64
 3   four    4 non-null      object 
 4   five    2 non-null      object 
dtypes: float64(2), object(3)
memory usage: 364.0+ bytes


In [60]:
df.describe(include='all')

Unnamed: 0,one,two,three,four,five
count,4,4.0,4.0,4,2
unique,1,,,1,1
top,Hellow,,,four,four
freq,4,,,4,2
mean,,2.5,5.0,,
std,,1.290994,2.581989,,
min,,1.0,2.0,,
25%,,1.75,3.5,,
50%,,2.5,5.0,,
75%,,3.25,6.5,,


In [61]:
for i in range(20):
    df[i] = i
    
df.head()

Unnamed: 0,one,two,three,four,five,0,1,2,3,4,...,10,11,12,13,14,15,16,17,18,19
a,Hellow,1.0,2.0,four,four,0,1,2,3,4,...,10,11,12,13,14,15,16,17,18,19
b,Hellow,2.0,4.0,four,four,0,1,2,3,4,...,10,11,12,13,14,15,16,17,18,19
c,Hellow,3.0,6.0,four,,0,1,2,3,4,...,10,11,12,13,14,15,16,17,18,19
d,Hellow,4.0,8.0,four,,0,1,2,3,4,...,10,11,12,13,14,15,16,17,18,19


In [62]:
df.head().T

Unnamed: 0,a,b,c,d
one,Hellow,Hellow,Hellow,Hellow
two,1.0,2.0,3.0,4.0
three,2.0,4.0,6.0,8.0
four,four,four,four,four
five,four,four,,
0,0,0,0,0
1,1,1,1,1
2,2,2,2,2
3,3,3,3,3
4,4,4,4,4


In [63]:
pd.set_option('display.max_rows', 100)
pd.set_option('precision', 7)

In [64]:
s

a      1.1768919
b      0.5945248
c     -0.0469525
d      0.7282107
e    500.0000000
Name: example, dtype: float64

In this assignment I learned that pd is equivalent to panel data. Pandas has two data structures a series and a dataframe. A series is like a column in excel. A series has three important parts: the data, the index, and the name. The only reason to name a series is if you're creating a data frame. Indexing into a series using an array of True or False allows to perform selection. I learned that taking the mean of a series is a columnar aggregate computation. When you've got highly- dimensional data what you use is a data frame. 