In [None]:
import pandas as pd

Pandas adds a couple of useful twists to NumPy's ufuncs:

for unary operations like negation and trigonometric functions, these ufuncs will preserve index and column labels in the output, and for binary operations such as addition and multiplication, Pandas will automatically align indices when passing the objects to the ufunc.

This means that keeping the context of data and combining data from different sources—both potentially error-prone tasks with raw NumPy arrays—become essentially foolproof ones with Pandas.

In [None]:
import numpy as np

In [None]:
# Fixed seed so the random draws taken from this generator are reproducible.
rng = np.random.RandomState(seed=42)

In [None]:
ser = pd.Series(rng.randint(0,10,4))

In [None]:
ser

In [None]:
df = pd.DataFrame(rng.randint(0,10,(3,4)), columns=['A','B','C','D'])

In [None]:
df

If we apply a NumPy ufunc on either of these objects, the result will be another Pandas object with the indices preserved:

In [None]:
np.exp(ser)

In [None]:
np.sin(df*np.pi/4)

For binary operations on two Series or DataFrame objects, Pandas will align indices in the process of performing the operation. This is very convenient when you are working with incomplete data, as we’ll see in some of the examples that follow.

In [None]:
# City populations labelled by city name; the Series itself is named 'POPULATION-2'.
population2 = pd.Series(
    [30000, 40000, 70000],
    index=['Khartoum', 'Addis', 'Cairo'],
    name='POPULATION-2',
)

In [None]:
population2

In [None]:
# City areas labelled by city name; the Series itself is named 'AREA-2'.
area2 = pd.Series(
    [444, 333, 888],
    index=['Addis', 'Khartoum', 'Doha'],
    name='AREA-2',
)

In [None]:
area2

In [None]:
population2/area2


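Labels that appear in only one of the two Series get the value NaN ("Not a Number"), which is how Pandas marks missing data.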

In [None]:
# Two integer Series whose indices share only the labels 1 and 2.
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])
# The sum is aligned on the index, so labels 0 and 3 have a value on one side only.
A + B

In [None]:
A

In [None]:
B

In [None]:
A+B

If using NaN values is not the desired behavior, we can modify the fill value using appropriate object methods in place of the operators. For example, calling A.add(B)
is equivalent to calling A + B, but allows optional explicit specification of the fill value for any elements in A or B that might be missing:

In [None]:
A.add(B, fill_value=0)

In [None]:
A

In [None]:
B

In [None]:
A = pd.DataFrame(rng.randint(0,20,(2,2)), columns = list('AB'))
A

In [None]:
B = pd.DataFrame(rng.randint(0,10,(3,3)), columns=list('BAC'))
B

In [None]:
A+B

In [None]:
A.stack().mean()

In [None]:
# Assign fill before displaying it: it was previously first assigned in a later
# cell, so running the notebook top to bottom raised NameError here.
fill = A.stack().mean()
fill

In [None]:
# Stack A's values into one Series and take their mean to use as the fill value.
fill = A.stack().mean()
# fill_value stands in for an entry missing from A or B *before* the addition,
# so the result at that position is fill + the other operand. It is added, not
# written over the NaN in the result.
A.add(B, fill_value=fill)

In [None]:
A = rng.randint(10, size=(3,4))
A

In [None]:
A[0]

In [None]:
A-A[0]

In [None]:
A

In [None]:
df = pd.DataFrame(A, columns=list('QRST'))

In [None]:
df

In [None]:
df.iloc[0]

In [None]:
df - df.iloc[0]

If you would instead like to operate column-wise, you can use the object methods mentioned earlier, while specifying the axis keyword:

In [None]:
df

In [None]:
df.subtract(df['R'], axis=0)

In [None]:
df

In [None]:
halfrow = df.iloc[0,::2]

In [None]:
halfrow

In [None]:
df - halfrow

In [None]:
df

________________________________

________________________________

### Handling Missing Data:
None and NaN

In [None]:
# Mixing None with integers leaves NumPy no native numeric dtype to use,
# so the array is created with dtype=object.
vals1 = np.array([1,None, 3,4])
vals1

In [None]:
# Time the same sum over an object-dtype array and an int-dtype array so the
# two timings can be compared. %timeit is an IPython line magic, so this cell
# only runs under IPython/Jupyter.
for x in ['object','int']:
    print('dtype = ',x)
    %timeit np.arange(1E6, dtype=x).sum()
    print()
    

In [None]:
# NOTE: vals1 contains None, so this sum attempts int + None and is expected
# to raise TypeError. The error is the point of the demonstration.
vals1.sum()

In [None]:
vals2 = np.array([1,np.nan, 3, 4])

In [None]:
type(vals2)

In [None]:
vals2.dtype

You should be aware that NaN is a bit like a data virus—it infects any other object it touches. Regardless of the operation, the result of arithmetic with NaN will be another NaN:

In [None]:
1 + np.nan

In [None]:
0 * np.nan

NumPy does provide some special aggregations that will ignore these missing values:

In [None]:
# vals2 contains np.nan, so the ordinary sum propagates it and returns nan.
vals2.sum()

In [None]:
np.nansum(vals2)

In [None]:
np.nanmin(vals2)

In [None]:
np.nanmax(vals2)

Keep in mind that NaN is specifically a floating-point value; there is no equivalent NaN value for integers, strings, or other types.

In [None]:
pd.Series([1,np.nan,2,None])

#Pandas automatically type-casts when NA values are present

In [None]:
x = pd.Series(range(2), dtype=int)
x

In [None]:
x[0] = None
x

- isnull()
- notnull()
- dropna()
- fillna()

In [None]:
# Mixed-type Series with two missing entries: np.nan and None. The last
# element must be the None object, not the string 'None', which Pandas treats
# as ordinary (non-null) text.
data = pd.Series([1, np.nan, 'hello', None])
data

In [None]:
data.isnull()

In [None]:
data[data.notnull()]

## To drop or to fill: that is the question

dropna()
fillna()

In [None]:
data

In [None]:
data.dropna()

In [None]:
# The three rows must be wrapped in one outer list. Passed as three separate
# positional arguments they are read as data, index and columns, which raises
# a shape-mismatch ValueError.
df = pd.DataFrame([[1, np.nan, 2], [2, 3, 5], [np.nan, 4, 6]])
df

In [None]:
# 3x3 frame built from a list of rows; the first and last rows each hold one NaN.
df = pd.DataFrame(
    data=[
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)
df

We cannot drop single values from a DataFrame; we can only drop full rows or full columns.

In [None]:
df.dropna()
# By default, dropna() will drop all rows in which any null value is present:

In [None]:
df

In [None]:
df.dropna(axis='columns')

#axis=1 drops all columns containing a null value:

In [None]:
df.dropna(axis=1)

In [None]:
df

In [None]:
df[3]=np.nan

In [None]:
df

In [None]:
df.dropna(axis='columns', how='all')

#You can also specify how='all', which will only drop rows/columns that are all null values:

For finer-grained control, the thresh parameter lets you specify a minimum number of non-null values for the row/column to be kept:

In the example below, the first and last rows are dropped because each contains only two non-null values.

In [None]:
df

In [None]:
df.dropna(axis='rows', thresh=3)

### Filling null values

In [None]:
# Labelled Series with two missing entries: np.nan at 'b' and None at 'd'.
data = pd.Series(
    data=[1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
data

####  We can fill NA entries with a single value, such as zero:

In [None]:
data.fillna(0)

#### We can specify a forward-fill to propagate the previous value forward:

In [None]:
# Forward-fill: each missing entry takes the last valid value before it.
# ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated
# in recent pandas releases.
data.ffill()

#### Or we can specify a back-fill to propagate the next values backward:

In [None]:
# Back-fill: each missing entry takes the next valid value after it.
# bfill() replaces fillna(method='bfill'), whose `method` keyword is deprecated
# in recent pandas releases.
data.bfill()

#### For DataFrames, the options are similar, but we can also specify an axis along which the fills take place:

In [None]:
df

In [None]:
# Forward-fill along each row (axis=1): a missing entry takes the value from the
# column to its left, and stays NaN when no earlier value exists in that row.
# ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated
# in recent pandas releases.
df.ffill(axis=1)