In [735]:
import numpy as np
import pandas as pd

## SERIES

In [736]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])

print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


Creating a series based on a dict. 

If the passed index contains values not in the keys of the dict, than the passed data of the new series will have NaN values for the new indices.

The series only contains values that match the keys() and index. The rest are excluded, new indices have values that are set as NaN

In [737]:
power_levels = {'goku': 10000, 'piccolo': 8000, 'gohan': 6000}
print(power_levels, '\n')
s = pd.Series(power_levels, )
print(s)
print(f"\nIndex: {s.index}\n")

power_levels = {'goku': 10000, 'piccolo': 8000, 'gohan': 6000}
s = pd.Series(power_levels, index = ['goku', 'piccolo', 'gohan', 'beerus'])
print(s)
print(f"\nIndex: {s.index}\n")

power_levels = {1: 10000, 2: 8000, 3: 6000}
s = pd.Series(power_levels, index = ['goku', 'piccolo', 'gohan', 'beerus'])
print(s)
print(f"\nIndex: {s.index}")

{'goku': 10000, 'piccolo': 8000, 'gohan': 6000} 

goku       10000
piccolo     8000
gohan       6000
dtype: int64

Index: Index(['goku', 'piccolo', 'gohan'], dtype='object')

goku       10000.0
piccolo     8000.0
gohan       6000.0
beerus         NaN
dtype: float64

Index: Index(['goku', 'piccolo', 'gohan', 'beerus'], dtype='object')

goku      NaN
piccolo   NaN
gohan     NaN
beerus    NaN
dtype: float64

Index: Index(['goku', 'piccolo', 'gohan', 'beerus'], dtype='object')


Creating a series based on ndarrays

In [738]:
arr = np.array([1, 2, 3, 2, 3, 4])
# s = pd.Series(arr, index = [1, 2, 3]) # error because length of index vals do not match than len(arr)
s = pd.Series(arr, index = [10, 11, 12, 13, 14, 15])
print(s)

10    1
11    2
12    3
13    2
14    3
15    4
dtype: int64


Series based on scalar values.

Adding multiple index vals to a single scalar extends the scalar to the indices in the series.

In [739]:
s = pd.Series(1.0, index = [1, 2, 3])
print(s)

1    1.0
2    1.0
3    1.0
dtype: float64


You can also create Series based on list objects

In [740]:
series = pd.Series(data = range(5), index = list('VWXYZ'), name = 'series!')
series

V    0
W    1
X    2
Y    3
Z    4
Name: series!, dtype: int64

In [741]:
s = pd.Series([1, 'gohan'])
print(s.dtype) # prints object as we have ambiguous values in the series
print(s.apply(type)) # prints all types in the series.

object
0    <class 'int'>
1    <class 'str'>
dtype: object


RangeIndex returns an `index` object.

You can set the index directly as s.index but it raises an error if the index vals don't match the length.

Otherwise, if it's larger / smaller, you can use `series.reindex` which adds NaN vals to new index vals or truncates the vals if the index is smaller than the length.

In [742]:
index = pd.RangeIndex(start = 100, stop = 106)

#s.index = index
#print(s, '\n')

index2 = pd.RangeIndex(start = 100, stop = 103)
s2 = s.reindex(index = index2)
print(s2)

100    NaN
101    NaN
102    NaN
dtype: object


DataFrames are sets of Pandas Series into a single array. They have index values like Series and can be reindexed as well.

In [743]:
data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
data = pd.DataFrame(data = data, index = index2)

print(f"Data:\n{data}\n")

index = pd.RangeIndex(start = 100, stop = 104)
data2 = data.reindex(index = index)

print(f"ReIndexed Data:\n{data2}")

#data.index = index # won't work, the `index` is larger than the length of `data`

Data:
     0  1  2
100  1  2  3
101  4  5  6
102  7  8  9

ReIndexed Data:
       0    1    2
100  1.0  2.0  3.0
101  4.0  5.0  6.0
102  7.0  8.0  9.0
103  NaN  NaN  NaN


Can also print length, size, shape of dataframes.

Len refers to the length of the first dimensionshap. To get the length of the ith dim, just get the ith element in the df.shape. If it's second dim, just transpose data2 and take len(data2)

In [744]:
print(f"Len: {len(data2)}")
print(f"Size: {data2.size}")
print(f"Shape: {data2.shape}")

Len: 4
Size: 12
Shape: (4, 3)


Returns the raw array of Pandas, similar to an Ndarray but with extensions.

Can be indexed like an np.array.

In [745]:
s = pd.Series([1, 2, 3])
print(s.array)
print(f"\nArr at index 1: {s.array[1]}")

<NumpyExtensionArray>
[np.int64(1), np.int64(2), np.int64(3)]
Length: 3, dtype: int64

Arr at index 1: 2


In [746]:
s = pd.Series([True, True, False])
s1 = s.to_numpy(dtype = int)
s2 = s.to_numpy()
print(s1)
print(s2)

[1 1 0]
[ True  True False]


Raw indexing based on indices will yield an error if index does not exist.

Using series.get to index will return None or a specified default value rather than raising an error.

In [747]:
print(s.get(4))
print(s.get(4, default = 'does not exist dummy!'))
#print(s[4]) # error

None
does not exist dummy!


Series support np like operations. But their operations align based on the index label, not the positional index.

In [748]:
s = pd.Series([1, 2, 3])
d = pd.Series([2, 2, 2])
print(f"Add:\n{s + d}\n")
print(f"Mul:\n{s*d}")



Add:
0    3
1    4
2    5
dtype: int64

Mul:
0    2
1    4
2    6
dtype: int64


If index labels aren't present in both series, the corresponding index label will have it's value set as NaN

In [749]:
s = pd.Series([1, 2])
d = pd.Series([2, 2, 2])
print(f"Add:\n{s + d}\n")

Add:
0    3.0
1    4.0
2    NaN
dtype: float64



*> Note that a series will only be 1 dimensional. Convert to a Df if you want the additional dimension*

In [750]:
print("Original Series")
print(s.shape)
print(d.shape)

print("\nDataframes")
print(s.to_frame().shape)
print(d.to_frame().shape)

print(f"\nS:\n{s.to_frame()}")
print(f"\nD.T:\n{d.to_frame().T}")

s.to_frame() @ d.to_frame().T

Original Series
(2,)
(3,)

Dataframes
(2, 1)
(3, 1)

S:
   0
0  1
1  2

D.T:
   0  1  2
0  2  2  2


Unnamed: 0,0,1,2
0,2,2,2
1,4,4,4


In [751]:
s = {'one': pd.Series([1, 2, 3, 4], index = ['a', 'b', 'c', 'd'])}
s = pd.Series(s)
print(s)
print()
print(s['one'])

one    a    1
b    2
c    3
d    4
dtype: int64
dtype: object

a    1
b    2
c    3
d    4
dtype: int64


## DATAFRAME

In [752]:
df = pd.DataFrame([[1, 2, 3], [3, 4, 5], [4, 5]])
df

Unnamed: 0,0,1,2
0,1,2,3.0
1,3,4,5.0
2,4,5,


In [753]:
d = {
    "one": pd.Series([1, 2, 3, 4, 5, 6], index = ['a', 'b', 'c', 'd', 'e', 'f']),
    "two": pd.Series([1, 2, 3, 4, 5, 6], index = ['a', 'b', 'c','d', 'e', 'f'])    
}

df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1,1
b,2,2
c,3,3
d,4,4
e,5,5
f,6,6


In [754]:
d = {
    'one': [1, 2, 3],
    'two': [2, 3, 4]
}

df = pd.DataFrame(d, index = ['two', 'three', 'four'], columns = ['ID', "PIN"])
df

Unnamed: 0,ID,PIN
two,,
three,,
four,,


In [755]:
d = {
    'one': [1, 2, 3],
    'two': [2, 3, 4]
}

df = pd.DataFrame(d, index = ['two', 'three', 'four'])
df.columns = ['ID', 'PIN']
df

Unnamed: 0,ID,PIN
two,1,2
three,2,3
four,3,4


In [756]:
rng = np.random.default_rng(seed = 1)
d = rng.normal((2, 2))
d = pd.DataFrame(d)

In [757]:
d = {"RNG1": rng.normal(size = (3)), "RNG2": rng.normal(size = (3))}
d = pd.DataFrame(d, index = [0, 1, 2])
d

Unnamed: 0,RNG1,RNG2
0,0.330437,0.446375
1,-1.303157,-0.536953
2,0.905356,0.581118


Here, each element of data2 is a dict, representing a row of the dataframe.
Each column is denoted by the key of each dict. Each dict represents a row of the dataframe


The second dict in the list has no 'd' but the first does, therefore in the column of 'd', the second row's value for that column gets marked as NaN.

In [758]:
data2 = [{"a": 1, "b": 2, 'c': 1, 'd': 5}, {"a": 5, "b": 10, "c": 20}]
pd.DataFrame(data2)

Unnamed: 0,a,b,c,d
0,1,2,1,5.0
1,5,10,20,


In [759]:
dict = {u'2012-07-01': 391,
 u'2012-07-02': 392,
 u'2012-07-03': 392,
 u'2012-07-04': 392,
 u'2012-07-05': 392,
 u'2012-07-06': 392}

print(pd.DataFrame(list(dict.items())))
print()
print(pd.DataFrame(dict.items()))

print()
print(list(dict.items()))
print()
print(dict.items())


            0    1
0  2012-07-01  391
1  2012-07-02  392
2  2012-07-03  392
3  2012-07-04  392
4  2012-07-05  392
5  2012-07-06  392

            0    1
0  2012-07-01  391
1  2012-07-02  392
2  2012-07-03  392
3  2012-07-04  392
4  2012-07-05  392
5  2012-07-06  392

[('2012-07-01', 391), ('2012-07-02', 392), ('2012-07-03', 392), ('2012-07-04', 392), ('2012-07-05', 392), ('2012-07-06', 392)]

dict_items([('2012-07-01', 391), ('2012-07-02', 392), ('2012-07-03', 392), ('2012-07-04', 392), ('2012-07-05', 392), ('2012-07-06', 392)])


In [760]:
data = {'A1': {'A': [1, 2, 3], 'B': [4, 5, 6]}, 'B2': {'A': [7, 6], 'B': [9, 2, 3]}}
dframe = pd.DataFrame.from_dict(data)
print(dframe)

          A1         B2
A  [1, 2, 3]     [7, 6]
B  [4, 5, 6]  [9, 2, 3]


By default, each row in the dataframe is a series.

In [761]:
s1 = pd.Series([1, 2, 3], index = ['one', 'two', 'three'])
s2 = pd.Series([3, 4, 5], index = ['one', 'two', 'three'])

pd.DataFrame(data = (s1, s2))

Unnamed: 0,one,two,three
0,1,2,3
1,3,4,5


In [762]:
df = pd.DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], index = ['one', 'two', 'three'], columns = ['a', 'b', 'c'])
print(df, '\n')
#del df['a']
#df['a'] = 0
#df.loc['one', 'a'] = 0
#df.at['one', 'a'] = 0
df.iat[0, 0] = 0
print(df.loc['one'])

       a  b  c
one    1  1  1
two    2  2  2
three  3  3  3 

a    0
b    1
c    1
Name: one, dtype: int64


In [763]:
df = df.T
df.pop('one')
df = df.T
print(df)

       a  b  c
two    2  2  2
three  3  3  3


In [764]:
data = pd.read_csv('data/iris.csv')
data = data.assign(sepal_lw_ratio = data['sepal_length'] / data['sepal_width'])
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_lw_ratio
0,5.1,3.5,1.4,0.2,setosa,1.457143
1,4.9,3.0,1.4,0.2,setosa,1.633333
2,4.7,3.2,1.3,0.2,setosa,1.468750
3,4.6,3.1,1.5,0.2,setosa,1.483871
4,5.0,3.6,1.4,0.2,setosa,1.388889
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,2.233333
146,6.3,2.5,5.0,1.9,virginica,2.520000
147,6.5,3.0,5.2,2.0,virginica,2.166667
148,6.2,3.4,5.4,2.3,virginica,1.823529


In [765]:
data['species']

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object

In [766]:
print(f'orig:\n\n{data.loc[0]}')
print()
print(f'.loc {data.loc[0, 'sepal_length']}')
print(f'.iloc {data.iloc[0, 0]}')
print()

orig:

sepal_length           5.1
sepal_width            3.5
petal_length           1.4
petal_width            0.2
species             setosa
sepal_lw_ratio    1.457143
Name: 0, dtype: object

.loc 5.1
.iloc 5.1



In [767]:
print(data[0:5])
print(data.iloc[0:5, 0:2])

   sepal_length  sepal_width  petal_length  petal_width species  \
0           5.1          3.5           1.4          0.2  setosa   
1           4.9          3.0           1.4          0.2  setosa   
2           4.7          3.2           1.3          0.2  setosa   
3           4.6          3.1           1.5          0.2  setosa   
4           5.0          3.6           1.4          0.2  setosa   

   sepal_lw_ratio  
0        1.457143  
1        1.633333  
2        1.468750  
3        1.483871  
4        1.388889  
   sepal_length  sepal_width
0           5.1          3.5
1           4.9          3.0
2           4.7          3.2
3           4.6          3.1
4           5.0          3.6


In [768]:
print(np.sum(data.iloc[:5, :4], axis =1))
print()
print(np.asarray(np.sum(data.iloc[:5, :4], axis = 1)))

0    10.2
1     9.5
2     9.4
3     9.4
4    10.2
dtype: float64

[10.2  9.5  9.4  9.4 10.2]


In [769]:
dframe = pd.DataFrame(data = [range(5), range(5)], columns= list('ABCDE')).T
print(dframe)

dframe.drop(labels = ['A', 'B'])

   0  1
A  0  0
B  1  1
C  2  2
D  3  3
E  4  4


Unnamed: 0,0,1
C,2,2
D,3,3
E,4,4


In [770]:
dframe = dframe.rename(columns={dframe.columns[0]: 'X', dframe.columns[1]: 'Y'})

print(dframe)
print()
print(dframe.iloc[0])
dframe = dframe.T
print()
print(dframe)
print()
print(dframe.iloc[0])
print()
print('or')
print()
print(dframe.loc['X'])

   X  Y
A  0  0
B  1  1
C  2  2
D  3  3
E  4  4

X    0
Y    0
Name: A, dtype: int64

   A  B  C  D  E
X  0  1  2  3  4
Y  0  1  2  3  4

A    0
B    1
C    2
D    3
E    4
Name: X, dtype: int64

or

A    0
B    1
C    2
D    3
E    4
Name: X, dtype: int64


### Data Alignment and Arithmetic

If a DataFrame is added to another DataFrame, yet one is smaller than the other, the elements that don't line up element-wise will be set to NaN

In [771]:
df1 = pd.DataFrame(np.array([[1, 1], [1, 1]]), columns = ['A', 'B'])
df2 = pd.DataFrame(np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]), columns = ['A', 'B', 'C'])

print(df1)
print()
print(df2)

   A  B
0  1  1
1  1  1

   A  B  C
0  1  1  1
1  1  1  1
2  1  1  1


In [772]:
print(df1 + df2)

     A    B   C
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN


As default, addition is done based on the alignment of the index labels and the column labels.

In [773]:
df1 = pd.DataFrame(np.array([[1, 1], [1, 1]]), columns = ['B', 'C'])
df2 = pd.DataFrame(np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]), columns = ['A', 'B', 'C'])

print(df1)
print()
print(df2)

   B  C
0  1  1
1  1  1

   A  B  C
0  1  1  1
1  1  1  1
2  1  1  1


In [774]:
print(df1 + df2)

    A    B    C
0 NaN  2.0  2.0
1 NaN  2.0  2.0
2 NaN  NaN  NaN


Note how the labels of both the columns and the rows are different, such that when added we get NaN values for all besides the two that match which are [0, 'B'] and [0, 'C']

In [775]:
df1 = pd.DataFrame(np.array([[1, 1], [1, 1]]), columns = ['B', 'C'])
df2 = pd.DataFrame(np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]), columns = ['A', 'B', 'C'], index = [0, 'Y', 'Z'])

print(df1)
print()
print(df2)

   B  C
0  1  1
1  1  1

   A  B  C
0  1  1  1
Y  1  1  1
Z  1  1  1


In [776]:
print(df1 + df2)

    A    B    C
0 NaN  2.0  2.0
1 NaN  NaN  NaN
Y NaN  NaN  NaN
Z NaN  NaN  NaN


> [!NOTE]
> 
> *Arithmetic amongst DF's are done in alignment with the labels of the rows and columns*

When combining a dataframe and a series, each labvel index of the series automatically matches with the column label of the DataFrame.

In [777]:
s1 = pd.Series([1, 1])
df2 = pd.DataFrame(np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]), columns = ['A', 'B', 'C']).T
print(s1)
print()
print(df2)
print()
print(df2 + s1)

0    1
1    1
dtype: int64

   0  1  2
A  1  1  1
B  1  1  1
C  1  1  1

     0    1   2
A  2.0  2.0 NaN
B  2.0  2.0 NaN
C  2.0  2.0 NaN


In [778]:
s1 = pd.Series([1, 2, 3], index = ['a', 'b', 'c'])
s2 = pd.Series([1, 3, 5], index = ['b', 'a', 'c'])

np.remainder(s1, s2)

a    1
b    0
c    3
dtype: int64

In [779]:
data = pd.read_csv('data/iris.csv').T
print(data.to_string())

                 0       1       2       3       4       5       6       7       8       9       10      11      12      13      14      15      16      17      18      19      20      21      22      23      24      25      26      27      28      29      30      31      32      33      34      35      36      37      38      39      40      41      42      43      44      45      46      47      48      49          50          51          52          53          54          55          56          57          58          59          60          61          62          63          64          65          66          67          68          69          70          71          72          73          74          75          76          77          78          79          80          81          82          83          84          85          86          87          88          89          90          91          92          93          94          95          96          97          98 

# Play

In [780]:
series = pd.Series(data = np.arange(start = 10,  stop = 50, step = 10), index = ['a', 'b', 'c', 'd'], name = 'Hello')

print(f"Series '{series.name}':")
print(series)
print()

print(f"{series.name} at index 'c': {series.loc['c']}")

Series 'Hello':
a    10
b    20
c    30
d    40
Name: Hello, dtype: int64

Hello at index 'c': 30


In [781]:
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago']
}

dataframe = pd.DataFrame(data)
dataframe.attrs['name'] = 'People'

print(f"Dataframe {dataframe.attrs['name']}:")
print(dataframe)
print()

print(f"{dataframe.attrs['name']} at column 'Age'")
print(dataframe.loc[:, 'Age'])

Dataframe People:
      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago

People at column 'Age'
0    25
1    30
2    35
Name: Age, dtype: int64


In [782]:
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6]
}, index=['x', 'y', 'z'])

print(df)
print()
print(f"ILOC:\n{df.loc['y']}\n")
print(f"ILOC:\n{df.iloc[1]}")

   A  B
x  1  4
y  2  5
z  3  6

ILOC:
A    2
B    5
Name: y, dtype: int64

ILOC:
A    2
B    5
Name: y, dtype: int64


In [783]:
# 4. df.shape will return (100, 5) and df.head(3) will return the first 3 rows.

df = pd.DataFrame(np.random.randn(100, 5))

print("DataFrame")
print(df)

print("Shape:")
print(df.shape)
print()
print("First 3 Rows")
print(df.head(3))



DataFrame
           0         1         2         3         4
0  -0.116503  0.831895  1.234120  0.754091  0.432358
1  -0.679768 -0.255101 -1.342611 -0.169630  0.016976
2   2.095852  0.334302 -0.865249  0.163043  0.365603
3   0.326044 -0.166129 -0.179732  0.771494  1.454527
4   0.144160  1.230908 -1.147613 -0.142578  0.795627
..       ...       ...       ...       ...       ...
95  2.857114  1.799255  0.837647  0.377799 -0.505069
96  1.145158 -1.408684 -2.579466 -0.019864  0.314572
97 -0.173368  0.578501  0.182681  2.021466 -1.612565
98  1.645869 -0.356312 -0.222742  1.537413 -0.829323
99 -0.202511  1.500822 -1.555864  0.341969  0.091389

[100 rows x 5 columns]
Shape:
(100, 5)

First 3 Rows
          0         1         2         3         4
0 -0.116503  0.831895  1.234120  0.754091  0.432358
1 -0.679768 -0.255101 -1.342611 -0.169630  0.016976
2  2.095852  0.334302 -0.865249  0.163043  0.365603


In [784]:
# s.head(2) will return the first 2 values. s.tail(1) will return the last value



s = pd.Series([7, 8, 9], index=['A', 'B', 'C'])

print('Series:')
print(s)
print()

print(f"First two values: \n{s.head(2)}\n")
print(f"Last value:\n{s.tail(1)}\n")


Series:
A    7
B    8
C    9
dtype: int64

First two values: 
A    7
B    8
dtype: int64

Last value:
C    9
dtype: int64



# More basic functionality

In [785]:
df = pd.DataFrame(np.random.randn(5, 5))
print(df)
print(f"\nShape{df.shape}\n")

# df.shape = (3, 3) # Attribute error, DataFrame.shape is read only

print(f"DF with new column names")

df.columns = ['a', 'b', 'c', 'd', 'e']
print(df)

          0         1         2         3         4
0  0.369494  1.963846  0.179076 -0.455118  0.999962
1  0.106848 -0.103569  0.565814  0.367478 -0.330119
2  0.644167  1.308286  0.787749  1.160945  0.845189
3 -0.053971  1.339710 -0.306350 -0.413817  1.204890
4  0.890573 -0.244748 -0.405191  1.391446  0.738002

Shape(5, 5)

DF with new column names
          a         b         c         d         e
0  0.369494  1.963846  0.179076 -0.455118  0.999962
1  0.106848 -0.103569  0.565814  0.367478 -0.330119
2  0.644167  1.308286  0.787749  1.160945  0.845189
3 -0.053971  1.339710 -0.306350 -0.413817  1.204890
4  0.890573 -0.244748 -0.405191  1.391446  0.738002


When converting a df to ndarray using to_numpy(), editing the resulting ndarray provided by to_numpy() will change the original dataframe. Only true for homogeneous array / dataframes.

In [786]:
df = pd.DataFrame(np.random.randn(5, 5))

print('DF:')
print(df)
print()

print('NDARRAY OF DF')
print(df.to_numpy())
print()

npdf = df.to_numpy()
npdf[0, :] = 10

print('MODIFED VERSION OF NDARRAY')
print(npdf)
print()

print('MODIFIED DF:')
print(df)

DF:
          0         1         2         3         4
0  0.832903 -1.501105  1.112483  1.094856 -0.220738
1 -2.138615  0.818698  0.474559  0.884259  0.505790
2 -0.546287  1.022558 -1.783485  0.183639 -1.284571
3 -0.473128 -0.137765  0.074343 -0.400110 -0.365764
4 -0.493099  0.919478  0.397552  1.189579 -1.861261

NDARRAY OF DF
[[ 0.83290322 -1.50110507  1.11248286  1.09485573 -0.2207383 ]
 [-2.13861548  0.81869821  0.47455911  0.88425881  0.50578967]
 [-0.54628724  1.02255807 -1.78348526  0.18363936 -1.28457097]
 [-0.47312782 -0.13776473  0.07434318 -0.40011007 -0.36576401]
 [-0.49309881  0.91947814  0.39755161  1.18957948 -1.86126136]]

MODIFED VERSION OF NDARRAY
[[10.         10.         10.         10.         10.        ]
 [-2.13861548  0.81869821  0.47455911  0.88425881  0.50578967]
 [-0.54628724  1.02255807 -1.78348526  0.18363936 -1.28457097]
 [-0.47312782 -0.13776473  0.07434318 -0.40011007 -0.36576401]
 [-0.49309881  0.91947814  0.39755161  1.18957948 -1.86126136]]

MODIFIED

In [787]:
df = pd.DataFrame(np.random.randn(5, 5))
df.loc[5, 5] = 'hello'

print('DF:')
print(df)
print()

print('NDARRAY OF DF')
print(df.to_numpy())
print()

npdf = df.to_numpy()
npdf[0, :] = 10

print('MODIFED VERSION OF NDARRAY')
print(npdf)
print()

print('MODIFIED DF:')
print(df)

DF:
          0         1         2         3         4      5
0 -0.313884 -0.125907 -1.578457  0.098918  1.693509    NaN
1  1.426816  0.654617 -0.603562 -3.035202  0.152815    NaN
2 -0.083070 -0.027167 -0.551952 -2.047868  0.378545    NaN
3 -0.424706 -1.312694 -0.177202 -1.502278 -0.242833    NaN
4  0.109339  0.918594 -1.864341  0.551352  0.712183    NaN
5       NaN       NaN       NaN       NaN       NaN  hello

NDARRAY OF DF
[[-0.3138840111351728 -0.12590721220834603 -1.5784574831660478
  0.09891781163386655 1.6935085284753604 nan]
 [1.426815969183286 0.6546170697254089 -0.6035620116444785
  -3.0352017575067034 0.15281478193940334 nan]
 [-0.08307040583427681 -0.027167372540137083 -0.551952013089347
  -2.0478683382543985 0.37854462088275825 nan]
 [-0.4247062857162334 -1.3126940505344034 -0.17720165660043438
  -1.50227843510936 -0.24283264962228288 nan]
 [0.10933929837581002 0.918594353656509 -1.8643406439645296
  0.5513519295384942 0.7121825907813073 nan]
 [nan nan nan nan nan 'hello

In [788]:
df = pd.DataFrame([[1, 2, 3],[4, 5, 6]])
series = pd.Series([1, 1, 1])


print(df)
print()
print(series)

df + series

   0  1  2
0  1  2  3
1  4  5  6

0    1
1    1
2    1
dtype: int64


Unnamed: 0,0,1,2
0,2,3,4
1,5,6,7


In default operations, without functions, the elements in the series are added elementwise to the columns. Will result in NaN values if the df has less columns than the len(series)

In [789]:
df = pd.DataFrame([[1, 2, 3],[4, 5, 6]]).T
series = pd.Series([1, 1, 1])


print(df)
print()
print(series)

df + series

   0  1
0  1  4
1  2  5
2  3  6

0    1
1    1
2    1
dtype: int64


Unnamed: 0,0,1,2
0,2,5,
1,3,6,
2,4,7,


In [790]:
df = pd.DataFrame([[1, 2, 3],[4, 5, 6]])
series = pd.Series([1, 1, 1])


print(df)
print()
print(series)
print()
print(df.sub(series))
print()
print(df.sub(series,  axis = 0))

   0  1  2
0  1  2  3
1  4  5  6

0    1
1    1
2    1
dtype: int64

   0  1  2
0  0  1  2
1  3  4  5

     0    1    2
0  0.0  1.0  2.0
1  3.0  4.0  5.0
2  NaN  NaN  NaN


In [791]:
#print(df.add(series, axis = 0, fill_value = 'WTF0'))

df = pd.DataFrame([[1, 2, 3],[4, 5, 6]])
df2 = pd.DataFrame([[1, 1, 1], [1, 1, 1], [2, 2, 2]])

print(df2.add(df).fillna('wtf'))

     0    1    2
0  2.0  3.0  4.0
1  5.0  6.0  7.0
2  wtf  wtf  wtf


In [792]:
df2 = pd.DataFrame([[1, 1, 1], [6, 1, 1], [9, 2, 2]])

print(df2)
print()
print(df2.empty) # is the dataframe empty?
print()
print((df2 > 4).any(axis = 1))
print()
print((df2 > 4).all(axis = 1))

   0  1  2
0  1  1  1
1  6  1  1
2  9  2  2

False

0    False
1     True
2     True
dtype: bool

0    False
1    False
2    False
dtype: bool


In [793]:
series = pd.Series([1, 2, 3, 4, 5, 6, 7])
print(series.cumprod())
print()
print(series.cumsum())

0       1
1       2
2       6
3      24
4     120
5     720
6    5040
dtype: int64

0     1
1     3
2     6
3    10
4    15
5    21
6    28
dtype: int64


In [794]:
df = pd.DataFrame(np.random.randn(3, 3))
print(df)

          0         1         2
0  1.490002 -0.777403  2.374111
1  0.946467 -0.349183 -1.783298
2 -2.168071 -0.369750  1.088318


In [823]:
df = pd.DataFrame(np.random.randn(3, 3))
print(df)
print()
df.columns = df.columns.astype(str)
print(df.query('`2` > `1`'))


          0         1         2
0  0.489743  0.871131  0.225641
1  0.582761 -1.534673  1.372107
2 -1.594214 -0.594730 -0.287107

          0         1         2
1  0.582761 -1.534673  1.372107
2 -1.594214 -0.594730 -0.287107
