# Data preprocessing

The name pandas is derived from "panel data" and is also a play on "Python data analysis". The library provides a rich set of features for exploring and manipulating data, making it the go-to toolkit for many data scientists.

See also: https://pandas.pydata.org/

In [None]:
import numpy as np
import pandas as pd
# Print the installed pandas version so the outputs below can be matched to a specific release.
print(pd.__version__)

Creating some pandas series...

In [None]:
# Two Series holding the same values; the labels "a" and "b" are swapped in
# the second one, so position and label no longer agree between them.
ser_a = pd.Series({"a": 1, "b": 2, "c": 3, "d": 4})
ser_b = pd.Series({"b": 1, "a": 2, "c": 3, "d": 4})

# The addition pairs values by index label, not by position.
ser_a.add(ser_b)

... doing some element-wise operations

In [None]:
# Arithmetic between two Series is aligned on the index labels, not on the
# position of the values.
# Only the last expression of a cell is rendered, so the four results are
# gathered into a single DataFrame to make all of them visible at once.
pd.DataFrame(
    {
        "ser_a + ser_b": ser_a + ser_b,
        "ser_a - ser_b": ser_a - ser_b,
        "ser_a * ser_b": ser_a * ser_b,
        "ser_a / ser_b": ser_a / ser_b,
    }
)

... doing some aggregation 

In [None]:
ser_c = pd.Series([1, np.nan, 3, 4], index=["a", "b", "c", "d"])
print(ser_c.dtype)        # Which dtype has `ser_c`?

# Aggregations skip NaN unless `skipna=False` is passed.
# A bare expression is rendered only when it is the last line of a cell, so
# the results are collected into one Series and printed explicitly.
ser_c_stats = pd.Series(
    {
        "count": ser_c.count(),                          # => 3
        "sum": ser_c.sum(),                              # => 8
        "mean": ser_c.mean(),                            # => 2.67
        "mean(skipna=False)": ser_c.mean(skipna=False),  # => nan
        "max": ser_c.max(),                              # => 4
        "min": ser_c.min(),                              # => 1
        "idxmax": ser_c.idxmax(),                        # => "d"
    }
)
print(ser_c_stats)

# Mixing numbers and strings in the same Series
ser_d = pd.Series([1, "a", 3, 4], index=["a", "b", "c", "d"])
print(ser_d.dtype)        # which dtype has `ser_d`?

# Series with repeated values and one missing value, reused by the next cells
ser_e = pd.Series([1, 1, 1, np.nan, 3, 4])
ser_e

Checking unique values

In [None]:
# Only the last expression of a cell is rendered, so the first two results
# are printed explicitly.
print(ser_e.unique())   # => [ 1., nan,  3.,  4.]  (NaN is listed as a value)
print(ser_e.nunique())  # => 3                     (NaN is not counted)
ser_e.value_counts()

In [None]:
# Label the figure so it can be read on its own; the trailing `;` keeps the
# notebook from echoing the return value of the last call.
ax = ser_e.hist(bins=10)
ax.set(title="Distribution of ser_e", xlabel="value", ylabel="count");

Checking null values

In [None]:
# Start from an empty frame and attach three columns in one chained call:
#   "ser_e" - the original values
#   "isna"  - True where the value is NaN
#   "notna" - False where the value is NaN
df = pd.DataFrame().assign(
    ser_e=ser_e,
    isna=ser_e.isna(),
    notna=ser_e.notna(),
)
df

In [None]:
# Replace the missing value with the average of the observed values.
# NaN is skipped when averaging, so the mean of `ser_e` is 2.
df["mean"] = ser_e.fillna(value=ser_e.mean())
df

In [None]:
# Fill the missing value with the previous (not NaN) value.
# `Series.ffill()` is used instead of `fillna(method="ffill")` because the
# `method` argument of `fillna` is deprecated since pandas 2.1.
df["ffill"] = ser_e.ffill()
df

In [None]:
# Fill the missing value with the following (not NaN) value.
# `Series.bfill()` is used instead of `fillna(method="bfill")` because the
# `method` argument of `fillna` is deprecated since pandas 2.1.
df["bfill"] = ser_e.bfill()
df

In [None]:
# Show the Series without its NaN entries; the result is not assigned, so `ser_e` keeps its missing value.
ser_e.dropna()

Creating pandas dataframes

In [None]:
# Build a 5 x 10 frame of random digits (0-9).
# The generator is seeded so every run produces the same table.
import random

random.seed(3)
df = pd.DataFrame(
    [[random.randint(0, 9) for _ in range(10)] for _ in range(5)],
    index=list(range(5)),
    columns=list("abcdefghij"),
)
df

Doing some operations with pandas dataframes

In [None]:
# Only the last expression of a cell is rendered automatically, so every
# intermediate result is printed explicitly.

# Select a column (i.e., a series)
print(df['a'])

# Add another column (element-wise product of two existing columns)
df['k'] = df['a'] * df['b']

# Get the headers (i.e., the column names)
print(df.columns)

# Get just the first two rows
print(df.head(2))

# Get just the last two rows
print(df.tail(2))

# Sort the rows by column 'a' (descending), breaking ties with 'b' (ascending);
# the sorted frame is a new object, `df` itself keeps its row order
print(df.sort_values(by=['a', 'b'], ascending=[False, True]))

# To get some statistics (e.g., count, mean, std, min, etc.)
df.describe()

Why are the `mean` values around 4.0 to 5.0?