# Encoding numerical data:
    1.binning: continuous value to discreate value in a range of intervals
    2.binarization: continous value to binary

# Types of binning:
    1.unsupervised:
        1a-equal width
        1b-equal frequency
        1c- kmeans binning
    2.supervised:
        2a-decesion tree binning
    3.custom binning 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [2]:
df=pd.read_csv("//Users//udayladdha//Desktop//Data science//DataSets//train.csv",usecols=["Age","Fare","Survived"])
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [3]:
df.isnull().sum()

Survived      0
Age         177
Fare          0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Age       714 non-null    float64
 2   Fare      891 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 21.0 KB


In [5]:
df.dropna(inplace=True)

In [6]:
df.isnull().sum()

Survived    0
Age         0
Fare        0
dtype: int64

In [7]:
df.shape

(714, 3)

# Without binning

In [8]:
x=df.iloc[:,1:]
y=df.iloc[:,0]

In [9]:
x.shape

(714, 2)

In [10]:
y.shape

(714,)

In [11]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1)

In [12]:
clf=DecisionTreeClassifier()

In [13]:
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
accuracy_score(y_pred,y_test)

0.6573426573426573

In [14]:
np.mean(cross_val_score(DecisionTreeClassifier(),x,y,cv=10,scoring="accuracy"))

0.6345070422535211

# With binning

In [15]:
k_bin_age=KBinsDiscretizer(n_bins=10,encode='ordinal',strategy="quantile")
k_bin_fare=KBinsDiscretizer(n_bins=10,encode='ordinal',strategy="quantile")

In [16]:
trf1=ColumnTransformer([
    ("first",k_bin_age,[0]),
    ("second",k_bin_fare,[1])
])

In [17]:
x_train_trf1=trf1.fit_transform(x_train)
x_test_trf1=trf1.transform(x_test)

In [18]:
trf1.named_transformers_["first"].n_bins_  # no. of bins created

array([10])

In [19]:
trf1.named_transformers_["first"].bin_edges_ # range of intervals

array([array([ 0.42, 14.  , 19.  , 22.  , 25.  , 28.  , 32.  , 36.  , 42.  ,
              50.  , 80.  ])                                                ],
      dtype=object)

In [20]:
clf=DecisionTreeClassifier()

In [21]:
clf.fit(x_train_trf1,y_train)
y_pred=(clf.predict(x_test_trf1))
accuracy_score(y_pred,y_test)

0.7132867132867133

In [22]:
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print("accuracy without binning:",accuracy_score(y_pred,y_test))


clf.fit(x_train_trf1,y_train)
y_pred=(clf.predict(x_test_trf1))
print("accuracy with binning:   ",accuracy_score(y_pred,y_test))


accuracy without binning: 0.6643356643356644
accuracy with binning:    0.7132867132867133


In [23]:
def discitizer(bins,strategy):
    k_bin_age=KBinsDiscretizer(n_bins=bins,encode='ordinal',strategy=strategy)
    k_bin_fare=KBinsDiscretizer(n_bins=bins,encode='ordinal',strategy=strategy)
    
    
    trf=ColumnTransformer([
    ("first",k_bin_age,[0]),
    ("second",k_bin_fare,[1])
    ])
    
    x_trf=trf.fit_transform(x)
    print(np.mean(cross_val_score(DecisionTreeClassifier(),x,y,cv=10,scoring="accuracy")))

In [24]:
discitizer(10,"kmeans")

0.6289514866979656


# Binarization

In [25]:
from sklearn.preprocessing import Binarizer

In [26]:
df=pd.read_csv("//Users//udayladdha//Desktop//Data science//DataSets//train.csv",usecols=["Age","Fare","SibSp","Parch","Survived"])
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare
0,0,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,1,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,0,35.0,0,0,8.05


In [27]:
df.dropna(inplace=True)

In [28]:
df.shape

(714, 5)

In [29]:
df["family"]=df["SibSp"]+df["Parch"]

In [30]:
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,family
0,0,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,1
2,1,26.0,0,0,7.925,0
3,1,35.0,1,0,53.1,1
4,0,35.0,0,0,8.05,0


In [31]:
df=df.drop(columns=df[["SibSp","Parch"]])

In [32]:
df.head()

Unnamed: 0,Survived,Age,Fare,family
0,0,22.0,7.25,1
1,1,38.0,71.2833,1
2,1,26.0,7.925,0
3,1,35.0,53.1,1
4,0,35.0,8.05,0


In [33]:
x=df.drop(columns="Survived")
y=df["Survived"]

In [34]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
x_train

Unnamed: 0,Age,Fare,family
328,31.0,20.5250,2
73,26.0,14.4542,1
253,30.0,16.1000,1
719,33.0,7.7750,0
666,25.0,13.0000,0
...,...,...,...
92,46.0,61.1750,1
134,25.0,13.0000,0
337,41.0,134.5000,0
548,33.0,20.5250,2


# Without binarization

In [35]:
dt=DecisionTreeClassifier()

In [36]:
dt.fit(x_train,y_train)
y_pred=dt.predict(x_test)
print("without binarization:",accuracy_score(y_test,y_pred))

without binarization: 0.6223776223776224


In [37]:
np.mean(cross_val_score(DecisionTreeClassifier(),x,y,cv=10,scoring="accuracy"))

0.6456768388106416

# With binarization

In [38]:
# lets check koi passenger akele travel kr rha hai ya nhi:
             # if family is 0 then passenger is travelling alone 

In [39]:
trf=ColumnTransformer([
    ("bin",Binarizer(copy=False),["family"]) # copy= false mean we are making  changes in same column not creating another column
],remainder="passthrough")

In [40]:
x_train_trf=trf.fit_transform(x_train)
x_test_trf=trf.transform(x_test)

In [41]:
pd.DataFrame(x_train_trf,columns=["family","Age","Fare"]) # if family is 1 then he is travelling with someone and if family is 0 he is travelling alone

Unnamed: 0,family,Age,Fare
0,1.0,31.0,20.5250
1,1.0,26.0,14.4542
2,1.0,30.0,16.1000
3,0.0,33.0,7.7750
4,0.0,25.0,13.0000
...,...,...,...
566,1.0,46.0,61.1750
567,0.0,25.0,13.0000
568,0.0,41.0,134.5000
569,1.0,33.0,20.5250


In [42]:
dt=DecisionTreeClassifier()

dt.fit(x_train,y_train)
y_pred=dt.predict(x_test)
print("without binarization:",accuracy_score(y_test,y_pred))


dt.fit(x_train_trf,y_train)
y_pred=dt.predict(x_test_trf)
print("with binarization:",accuracy_score(y_test,y_pred))

without binarization: 0.6363636363636364
with binarization: 0.6083916083916084


In [43]:
# ans is not accurate but this was done just to show how it works and sometime these tecnique gives us better result