## Data preparation

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Handling Missing Values in DataFrames

In [3]:
# Toy frame with gaps in every column; row 0 has no values at all.
df = pd.DataFrame(
    {
        "id": [np.nan, 2, 3, 4, 5],
        "grade": [np.nan, "b", np.nan, "c", np.nan],
        "award": [np.nan, "gold", "silver", "bronze", np.nan],
    }
)
display(df)

Unnamed: 0,id,grade,award
0,,,
1,2.0,b,gold
2,3.0,,silver
3,4.0,c,bronze
4,5.0,,


In [4]:
# Dropping rows or columns with missing values

# Keep only the rows in which every column has a value.
display(df.dropna(axis="index", how="any"))

Unnamed: 0,id,grade,award
1,2.0,b,gold
3,4.0,c,bronze


In [5]:
# Drop a row only if "grade" and "award" are both missing; "id" is not considered.
display(
    df.dropna(
        subset=["grade", "award"],
        how="all",
    )
)

Unnamed: 0,id,grade,award
1,2.0,b,gold
2,3.0,,silver
3,4.0,c,bronze


In [6]:
# Imputing missing values
# Per-column replacements; columns without an entry ("id") are left as they are.
# fillna returns a new frame here, so df itself still contains the gaps.
values = dict(grade="e", award="iron")
display(df.fillna(value=values))

Unnamed: 0,id,grade,award
0,,e,iron
1,2.0,b,gold
2,3.0,e,silver
3,4.0,c,bronze
4,5.0,e,iron


In [7]:
# Mean-impute the numeric "id" column.
# The filled Series is assigned back instead of calling fillna(inplace=True) on
# df["id"]: the in-place form acts on an intermediate object, raises a
# chained-assignment warning in recent pandas, and leaves df unchanged when
# Copy-on-Write is enabled.
df["id"] = df["id"].fillna(df["id"].mean())
display(df)

Unnamed: 0,id,grade,award
0,3.5,,
1,2.0,b,gold
2,3.0,,silver
3,4.0,c,bronze
4,5.0,,


In [8]:
# Impute "award" with its most frequent value.
# mode() returns a sorted Series (several entries when there is a tie), so [0]
# takes the first of them. The result is assigned back rather than filled with
# inplace=True, which would act on an intermediate object and not reliably
# update df (chained assignment / Copy-on-Write).
df["award"] = df["award"].fillna(df["award"].mode()[0])
display(df)

Unnamed: 0,id,grade,award
0,3.5,,bronze
1,2.0,b,gold
2,3.0,,silver
3,4.0,c,bronze
4,5.0,,bronze


In [9]:
# Binning in DataFrames

# Equal-width binning (using cut)

# Seeded generator: the sample, and therefore the bin edges, are the same on every run.
rng = np.random.default_rng(0)
df = pd.DataFrame({"values": rng.random(100)})
# retbins=True additionally returns the array of edges delimiting the 10 bins.
res, bins = pd.cut(df["values"],10,retbins=True)
display(bins)

array([0.02592671, 0.12148423, 0.21609564, 0.31070705, 0.40531846,
       0.49992987, 0.59454127, 0.68915268, 0.78376409, 0.8783755 ,
       0.97298691])

In [10]:
# Show the interval (bin) that pd.cut assigned to each value.
display(res)

0        (0.5, 0.595]
1      (0.121, 0.216]
2        (0.5, 0.595]
3      (0.784, 0.878]
4      (0.121, 0.216]
           ...       
95     (0.311, 0.405]
96    (0.0259, 0.121]
97     (0.878, 0.973]
98       (0.405, 0.5]
99     (0.311, 0.405]
Name: values, Length: 100, dtype: category
Categories (10, interval[float64]): [(0.0259, 0.121] < (0.121, 0.216] < (0.216, 0.311] < (0.311, 0.405] ... (0.595, 0.689] < (0.689, 0.784] < (0.784, 0.878] < (0.878, 0.973]]

In [11]:
# Bin a second, independent sample with the edges stored in `bins`.
# A seeded generator (different seed, so the data differ from the first sample)
# keeps the result reproducible.
df2 = pd.DataFrame({"values": np.random.default_rng(1).random(100)})
# Values lying outside the range covered by `bins` are returned as NaN.
new_res = pd.cut(df2["values"],bins)
display(new_res)

0                NaN
1       (0.405, 0.5]
2     (0.216, 0.311]
3       (0.5, 0.595]
4     (0.595, 0.689]
           ...      
95    (0.595, 0.689]
96    (0.878, 0.973]
97      (0.5, 0.595]
98    (0.878, 0.973]
99      (0.405, 0.5]
Name: values, Length: 100, dtype: category
Categories (10, interval[float64]): [(0.0259, 0.121] < (0.121, 0.216] < (0.216, 0.311] < (0.311, 0.405] ... (0.595, 0.689] < (0.689, 0.784] < (0.784, 0.878] < (0.878, 0.973]]

In [12]:
# Equal-frequency binning (using qcut): ten quantile bins labelled 'a' to 'j'.
# Note that this rebinds res and bins.
res, bins = pd.qcut(
    x=df["values"],
    q=10,
    labels=[letter for letter in "abcdefghij"],
    retbins=True,
)
display(bins)

array([0.02687282, 0.11522762, 0.25156469, 0.29144621, 0.3740765 ,
       0.44879065, 0.55849515, 0.64784105, 0.70127939, 0.81884693,
       0.97298691])

In [13]:
# Show the quantile-bin label that pd.qcut assigned to each value.
display(res)

0     g
1     b
2     f
3     j
4     b
     ..
95    d
96    a
97    j
98    e
99    d
Name: values, Length: 100, dtype: category
Categories (10, object): ['a' < 'b' < 'c' < 'd' ... 'g' < 'h' < 'i' < 'j']

In [14]:
# min-max normalization
# 100 standard-normal draws from a seeded generator, so the sample is identical on every run.
df = pd.DataFrame({"values": np.random.default_rng(2).standard_normal(100)})
df.head()

Unnamed: 0,values
0,-0.495266
1,1.782823
2,-0.859281
3,-0.836725
4,2.230421


In [15]:
# Smallest value of the column (lower bound for min-max scaling).
# NOTE(review): this rebinds the builtin name `min` in the notebook namespace.
min = df["values"].min()
display(min)

-2.1367918715430485

In [16]:
# Largest value of the column (upper bound for min-max scaling).
# NOTE(review): this rebinds the builtin name `max` in the notebook namespace.
max = df["values"].max()
display(max)

2.502397467242966

In [17]:
# Rescale "values" linearly onto [0, 1]: the smallest value maps to 0, the largest to 1.
# The bounds are read from the column itself and applied as one vectorised
# expression, so no Python-level loop is needed and re-running the cell on
# already scaled data leaves it unchanged.
lo, hi = df["values"].min(), df["values"].max()
df["values"] = (df["values"] - lo) / (hi - lo)
df.head()

Unnamed: 0,values
0,0.353839
1,0.844892
2,0.275374
3,0.280236
4,0.941374


In [18]:
# z-normalization
# 100 standard-normal draws from a seeded generator, so the sample is identical on every run.
df = pd.DataFrame({"values": np.random.default_rng(3).standard_normal(100)})
df.head()

Unnamed: 0,values
0,0.059072
1,-0.201043
2,-0.163893
3,-0.307853
4,-0.01533


In [19]:
# Arithmetic mean of the column (the centre used for z-normalization).
mean = df["values"].mean()
display(mean)

-0.035790384741349816

In [20]:
# Standard deviation of the column (the scale used for z-normalization).
# pandas computes the sample standard deviation (ddof=1) by default.
std = df["values"].std()
display(std)

1.0811228926348126

In [21]:
# Standardise "values": subtract the column mean and divide by the column's
# (sample) standard deviation.
# Both statistics are taken from the column as it is now and applied as one
# vectorised expression instead of a per-element lambda, so a re-run never
# works with stale statistics.
col_mean, col_std = df["values"].mean(), df["values"].std()
df["values"] = (df["values"] - col_mean) / col_std
df.head()

Unnamed: 0,values
0,0.087745
1,-0.152853
2,-0.11849
3,-0.251648
4,0.018925


In [22]:
# Selection of top-ranked categorical features

In [23]:
# Small labelled data set: three candidate feature columns plus the binary target "class".
df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4, 5],
        "grade": ["b", "b", "a", "c", "a"],
        "award": ["gold", "gold", "silver", "bronze", "bronze"],
        "class": [1, 1, 1, 0, 0],
    }
)
display(df)

Unnamed: 0,id,grade,award,class
0,1,b,gold,1
1,2,b,gold,1
2,3,a,silver,1
3,4,c,bronze,0
4,5,a,bronze,0


In [24]:
# Make every column (the target "class" included) categorical, so that a
# group-by on "class" can report a count for each class value, zeros included.
df = df.astype("category")

# For each feature column, build one array per feature value holding the number
# of rows in each class. observed=False is passed explicitly: the zero counts
# are part of the result, and the pandas default for categorical groupers is
# being changed to drop unobserved categories.
res = [
    (
        col,
        [
            g.groupby("class", observed=False).size().to_numpy()
            for (_, g) in df.groupby(col, observed=False)
        ],
    )
    for col in df.columns.drop("class")
]
display(res)

[('id',
  [array([0, 1], dtype=int64),
   array([0, 1], dtype=int64),
   array([0, 1], dtype=int64),
   array([1, 0], dtype=int64),
   array([1, 0], dtype=int64)]),
 ('grade',
  [array([1, 1], dtype=int64),
   array([0, 2], dtype=int64),
   array([1, 0], dtype=int64)]),
 ('award',
  [array([2, 0], dtype=int64),
   array([0, 2], dtype=int64),
   array([0, 1], dtype=int64)])]

In [25]:
def score(values): # simple scoring function
    """Return the largest max-minus-min spread found among the arrays in `values`.

    `values` is an iterable of numeric arrays; for each one the difference
    between its largest and smallest entry is computed, and the biggest of
    those differences is returned.
    """
    spreads = []
    for counts in values:
        spreads.append(np.max(counts) - np.min(counts))
    return np.max(spreads)

In [26]:
# Pair every feature name with the score of its per-value class counts.
scores = [(name, score(counts)) for name, counts in res]
display(scores)

[('id', 1), ('grade', 2), ('award', 2)]

In [27]:
# Rank the features from highest to lowest score. The sort is stable, so
# features with equal scores keep their original relative order.
sorted_scores = list(scores)
sorted_scores.sort(key=lambda pair: pair[1], reverse=True)
display(sorted_scores)

[('grade', 2), ('award', 2), ('id', 1)]

In [28]:
# Names of the two best-ranked features (the score itself is not needed here).
filtered = [name for name, _ in sorted_scores[:2]]
display(filtered)

['grade', 'award']

In [29]:
# Keep only the selected feature columns, in ranked order.
new_df = df[filtered]
display(new_df)

Unnamed: 0,grade,award
0,b,gold
1,b,gold
2,a,silver
3,c,bronze
4,a,bronze
