# Data Preprocessing

## 1 Binning

In [9]:
import pandas as pd
import numpy as np

In [10]:
df_ages = pd.DataFrame({'age': np.random.randint(0, 100, 10)})

In [11]:
df_ages

Unnamed: 0,age
0,32
1,13
2,19
3,95
4,26
5,95
6,62
7,39
8,76
9,17


In [12]:
df_ages['age_bins'] = pd.cut(x=df_ages['age'], bins=10)


In [13]:
df_ages['age_bins'] = pd.cut(x=df_ages['age'], bins=[21, 29, 33,37, 49])


In [14]:
df_ages

Unnamed: 0,age,age_bins
0,32,"(29.0, 33.0]"
1,13,
2,19,
3,95,
4,26,"(21.0, 29.0]"
5,95,
6,62,
7,39,"(37.0, 49.0]"
8,76,
9,17,


In [15]:
df_ages.age_bins.unique()

[(29.0, 33.0], NaN, (21.0, 29.0], (37.0, 49.0]]
Categories (4, interval[int64, right]): [(21, 29] < (29, 33] < (33, 37] < (37, 49]]

In [16]:
df_ages['age_by_decade'] = pd.cut(x=df_ages['age'], bins=[0, 5, 12, 20,50,100], labels=['infant', 'child', 'teenager','adult','senior'])

In [17]:
df_ages

Unnamed: 0,age,age_bins,age_by_decade
0,32,"(29.0, 33.0]",adult
1,13,,teenager
2,19,,teenager
3,95,,senior
4,26,"(21.0, 29.0]",adult
5,95,,senior
6,62,,senior
7,39,"(37.0, 49.0]",adult
8,76,,senior
9,17,,teenager


In [18]:
#write a code to count the number of instances for each decade
#eg 30s-1 

## 2 Missing value imputation


In [33]:
#Mean based imputation
from sklearn.impute import SimpleImputer

X= np.array([1,np.nan,3,3,9])
print(X)
imp = SimpleImputer(missing_values=np.nan, strategy='median')
X_imp=(imp.fit_transform(X.reshape(-1,1)))
X_imp

[ 1. nan  3.  3.  9.]


array([[1.],
       [3.],
       [3.],
       [3.],
       [9.]])

In [34]:
X=np.array([[1, 2], [np.nan, 3], [9, np.nan]])
X

array([[ 1.,  2.],
       [nan,  3.],
       [ 9., nan]])

In [35]:
print(imp.fit_transform(X))

[[1.  2. ]
 [5.  3. ]
 [9.  2.5]]


In [37]:
#Mode based imputation
import pandas as pd
from sklearn.impute import SimpleImputer
df = pd.DataFrame([["a", "x"],
                   [np.nan, "y"],
                   ["a", np.nan],
                   ["b", "y"]], dtype="category")

imp = SimpleImputer(strategy="most_frequent")
print(imp.fit_transform(df))


[['a' 'x']
 ['a' 'y']
 ['a' 'y']
 ['b' 'y']]


In [14]:
df2 = pd.DataFrame([["a", "x"],
                   [np.nan, "x"],
                   ["b", np.nan],
                   ["b", "y"]], dtype="category")
print(imp.transform(df2))

[['a' 'x']
 ['a' 'x']
 ['b' 'y']
 ['b' 'y']]


## 3 Standardization

In [15]:
from sklearn import preprocessing
import numpy as np
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],                    
                    [ 0.,  1., -1.]])
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [16]:
X_scaled.mean(axis=0)
#axis= 0 speicfy column while axis =1 speicy row

array([0., 0., 0.])

## 4 Min-Max Normalization / 0-1 Normalization

In [17]:
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_train_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

## 5 One Hot Encoding

In [49]:
import pandas as pd
df = pd.read_csv("weather_data.csv")
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [50]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(df['event'].values)

In [51]:
m=vectorizer.transform(df.event)

In [52]:
print(vectorizer.get_feature_names_out())

['rain' 'snow' 'sunny']


In [53]:
m

<6x3 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [55]:
print(m)

  (0, 0)	1
  (1, 2)	1
  (2, 1)	1
  (3, 1)	1
  (4, 0)	1
  (5, 2)	1
