# Encoding Notebook

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.datasets import load_breast_cancer, load_iris, make_moons, make_circles, make_classification
from sklearn.linear_model import LogisticRegression
from category_encoders import WOEEncoder, TargetEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer, FunctionTransformer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from mlxtend.feature_extraction import PrincipalComponentAnalysis
from mlxtend.preprocessing import standardize




from matplotlib import pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import gc; gc.enable()

from sklearn.metrics import mean_absolute_error

In [2]:
df = pd.read_csv('cleaned_columns_dropped.csv', index_col=0)

In [3]:
df.head(2)

Unnamed: 0,id,amount_tsh,gps_height,installer,longitude,latitude,wpt_name,basin,region,district_code,...,water_quality,quality_group,quantity,quantity_group,source,source_class,waterpoint_type_group,status_group,year_recorded,month_recorded
0,69572,6000.0,1390,Roman,34.938093,-9.856322,none,Lake Nyasa,Iringa,5,...,soft,good,enough,enough,spring,groundwater,communal standpipe,2,2011,3
1,8776,0.0,1399,GRUMETI,34.698766,-2.147466,Zahanati,Lake Victoria,Mara,2,...,soft,good,insufficient,insufficient,rainwater harvesting,surface,communal standpipe,2,2013,3


In [4]:
df.drop(columns=['payment_type', 'quantity_group'], inplace=True)

In [5]:
df.columns.tolist()

['id',
 'amount_tsh',
 'gps_height',
 'installer',
 'longitude',
 'latitude',
 'wpt_name',
 'basin',
 'region',
 'district_code',
 'lga',
 'ward',
 'population',
 'public_meeting',
 'scheme_management',
 'permit',
 'construction_year',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'water_quality',
 'quality_group',
 'quantity',
 'source',
 'source_class',
 'waterpoint_type_group',
 'status_group',
 'year_recorded',
 'month_recorded']

In [6]:
#target encode
# lst_te = ['wpt_name', 'basin', 'region', 'district_code', 'lga', 'ward', 'scheme_management','installer','source']
# ohe = ['extraction_type', 'extraction_type_group', 'extraction_type_class','management', 'payment', 'water_quality', 'management_group', 'quality_group', 
#       'quantity','source_class', 'waterpoint_type_group']

target_encoded = ['wpt_name', 'basin', 'region', 'district_code', 'lga', 'ward', 'scheme_management','installer','source',
'extraction_type', 'extraction_type_group', 'extraction_type_class','management', 'payment', 'water_quality', 'management_group', 'quality_group', 
      'quantity','source_class', 'waterpoint_type_group']

target = 'status_group'

encoder = TargetEncoder()
encoder_ohe = OneHotEncoder(sparse=False)

# for c in lst_te:
#     df[str(c) + '_encoded'] = encoder.fit_transform(df[c].values, df[target])
#     df.drop(columns=c, inplace=True)
    
for c in target_encoded:
    df[str(c) + '_encoded'] = encoder.fit_transform(df[c].values, df[target])
    df.drop(columns=c, inplace=True)   

    
# df_new = df[ohe]
# encoder_ohe.fit(df_new)
# x = encoder_ohe.transform(df_new)
# df1 = pd.DataFrame(x)
# df = pd.concat([df, df1], axis=1)
# df.drop(columns=ohe, inplace=True)


df.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,population,public_meeting,permit,construction_year,status_group,...,extraction_type_group_encoded,extraction_type_class_encoded,management_encoded,payment_encoded,water_quality_encoded,management_group_encoded,quality_group_encoded,quantity_encoded,source_class_encoded,waterpoint_type_group_encoded
0,69572,6000.0,1390,34.938093,-9.856322,109,True,False,1999,2,...,1.299365,1.299365,1.077369,1.572488,1.208706,1.150886,1.208706,1.376966,1.144495,1.236968
1,8776,0.0,1399,34.698766,-2.147466,280,True,True,2010,2,...,1.299365,1.299365,1.298081,0.973055,1.208706,1.150886,1.208706,1.142309,1.208208,1.236968
2,34310,25.0,686,37.460664,-3.821329,250,True,True,2009,2,...,1.299365,1.299365,1.077369,1.401113,1.208706,1.150886,1.208706,1.376966,1.208208,1.236968
3,67743,0.0,263,38.486161,-11.155298,58,True,True,1986,0,...,1.118466,1.118466,1.077369,0.973055,1.208706,1.150886,1.208706,0.056196,1.144495,1.236968
4,19728,0.0,0,31.130847,-1.825359,0,True,True,0,2,...,1.299365,1.299365,1.261848,0.973055,1.208706,1.179215,1.208706,1.250864,1.208208,1.236968


In [7]:
df.to_csv('encoded_data.csv')

In [8]:
used_cols = [c for c in df.columns.tolist() if c not in [target]]
X, y = df[used_cols], df[target]

In [9]:
clf = LGBMClassifier()

scores = cross_val_score(clf, X, y, cv=5)
print(scores.mean(), "+/-", scores.std())

0.806835032198127 +/- 0.0024627432892697283


In [15]:
# This result is just with doing target encoding
clf = XGBClassifier(objective = 'multi:softmax', booster = 'gbtree', nrounds = 'min.error.idx', 
                      num_class = 3, maximize = False, eval_metric = 'merror', eta = .1,
                      max_depth = 14, colsample_bytree = .4)

scores = cross_val_score(clf, X, y, cv=5)
print(scores.mean(), "+/-", scores.std())

0.8242759269682157 +/- 0.003614192216550845


In [12]:
df['construction_year'].unique()

array([1999, 2010, 2009, 1986,    0, 2011, 1987, 1991, 1978, 1992, 2008,
       1974, 2000, 2002, 2004, 1972, 2003, 1980, 2007, 1973, 1985, 1970,
       1995, 2006, 1962, 2005, 1997, 2012, 1996, 1977, 1983, 1984, 1990,
       1982, 1976, 1988, 1989, 1975, 1960, 1961, 1998, 1963, 1971, 1994,
       1968, 1993, 2001, 1979, 1967, 2013, 1969, 1981, 1964, 1966, 1965])

# PCA

In [None]:
used_cols = [c for c in df.columns.tolist() if c not in [target]]
X, y = df[used_cols].values, df[target].values
X = standardize(X)

pca = PrincipalComponentAnalysis(n_components=7,
                                 solver='svd')
pca.fit(X)
X_pca = pca.transform(X)