In [45]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
pd.set_option('display.max_columns',100)

In [55]:

class OneHotEncoding():
	"""One-hot encode selected DataFrame columns, one scikit-learn encoder per column.

	Every column listed in ``variable`` is replaced by indicator columns named
	``<column>_<category>``; all other columns are passed through unchanged.

	Parameters
	----------
	variable : str or list of str
		Column name(s) to encode. Anything that is not a list is wrapped in one.

	Attributes
	----------
	onehot_dic_ : dict
		Set by ``fit``. Maps ``'<column>_fitted'`` to the fitted ``OneHotEncoder``.
	heading : list of str
		Set by ``transform``. Indicator column names of the last column encoded.
	"""

	def __init__(self, variable=None):
		if not isinstance(variable, list):
			self.variable = [variable]
		else:
			self.variable = variable

	@staticmethod
	def _new_encoder():
		"""Return a dense, integer-valued OneHotEncoder on old and new scikit-learn."""
		try:
			# Newer scikit-learn renamed the keyword `sparse` to `sparse_output`.
			return OneHotEncoder(sparse_output=False, dtype=int)
		except TypeError:
			# Older releases only know the original `sparse` keyword.
			return OneHotEncoder(sparse=False, dtype=int)

	def fit(self, X, y=None):
		"""Fit one encoder per configured column.

		Parameters
		----------
		X : pandas.DataFrame
			Frame containing every column named in ``self.variable``.
		y : ignored
			Accepted for call-site compatibility; never read.

		Returns
		-------
		self
		"""
		self.onehot_dic_ = {}
		for col in self.variable:
			encoder = self._new_encoder()
			# The encoder expects a 2-D array, hence the single-column reshape.
			encoder.fit(np.array(X[col]).reshape(-1, 1))
			self.onehot_dic_[col+'_fitted'] = encoder

		return self

	def transform(self, X):
		"""Return a copy of ``X`` with each configured column replaced by its dummies.

		The input frame is not modified. Indicator columns are appended to the
		right in the order of ``self.variable`` and keep the row index of ``X``.

		Parameters
		----------
		X : pandas.DataFrame
			Frame containing every column named in ``self.variable``.

		Returns
		-------
		pandas.DataFrame
		"""
		X = X.copy()
		for col in self.variable:
			encoder = self.onehot_dic_[col+'_fitted']
			# Kept as an attribute because the original implementation exposed it.
			self.heading = [col+'_'+str(cat) for cat in encoder.categories_[0]]
			encoded = encoder.transform(np.array(X[col]).reshape(-1, 1))
			# Reuse X's index: concat on axis=1 aligns on index labels, so a default
			# RangeIndex here would misalign rows and introduce NaNs whenever X has
			# been filtered, shuffled or split.
			dummies = pd.DataFrame(data=encoded, columns=self.heading, index=X.index)
			X = pd.concat([X.drop(columns=[col]), dummies], axis=1)

		return X

In [56]:
# Load the training set; the path is relative to the notebook's working directory.
data = pd.read_csv('train.csv')
# Preview the first rows to check that the columns parsed as expected.
data.head()

Unnamed: 0,row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,applicant_ethnicity,applicant_race,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant,accepted
0,110049,1,1,2,1,78.0,3,77,31,20,2,5,1,14.0,1993.0,29.525,58433.0,68.959,918.0,942.0,5053,False,0
1,83668,1,1,1,1,322.0,2,358,32,2,2,5,1,143.0,4402.0,42.265,109352.0,100.0,773.0,1081.0,5952,True,1
2,253898,1,1,1,1,144.0,3,139,3,299,1,5,1,48.0,7829.0,44.841,98621.0,100.0,1799.0,1968.0,4576,False,1
3,97905,1,1,3,1,523.0,3,266,48,83,2,5,1,454.0,4691.0,9.089,112005.0,100.0,1561.0,1982.0,3108,True,0
4,417495,1,1,3,1,143.0,3,171,52,257,1,5,1,25.0,2408.0,30.84,75925.0,92.052,670.0,893.0,5113,False,1


In [57]:
# Per-column flag: True when the column contains at least one missing value.
data.isnull().any()

row_id                            False
loan_type                         False
property_type                     False
loan_purpose                      False
occupancy                         False
loan_amount                       False
preapproval                       False
msa_md                            False
state_code                        False
county_code                       False
applicant_ethnicity               False
applicant_race                    False
applicant_sex                     False
applicant_income                   True
population                         True
minority_population_pct            True
ffiecmedian_family_income          True
tract_to_msa_md_income_pct         True
number_of_owner-occupied_units     True
number_of_1_to_4_family_units      True
lender                            False
co_applicant                      False
accepted                          False
dtype: bool

In [58]:
# Categorical columns to expand into indicator columns.
feature = ['loan_purpose','applicant_ethnicity']
onehot = OneHotEncoding(feature)

In [59]:
# Fit one encoder per feature column. The target is passed along, but
# OneHotEncoding.fit never reads it.
onehot.fit(data[['loan_purpose','applicant_ethnicity']],data.accepted)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


<__main__.OneHotEncoding at 0x1fcafe3af60>

In [60]:
# Show the fitted encoders, keyed by '<column>_fitted'.
onehot.onehot_dic_

{'loan_purpose_fitted': OneHotEncoder(categorical_features=None, categories=None, drop=None,
               dtype=<class 'int'>, handle_unknown='error', n_values=None,
               sparse=False),
 'applicant_ethnicity_fitted': OneHotEncoder(categorical_features=None, categories=None, drop=None,
               dtype=<class 'int'>, handle_unknown='error', n_values=None,
               sparse=False)}

In [61]:
# Categories learned for loan_purpose; these become the column-name suffixes.
onehot.onehot_dic_['loan_purpose_fitted'].categories_

[array([1., 2., 3.])]

In [62]:
# Categories learned for applicant_ethnicity; these become the column-name suffixes.
onehot.onehot_dic_['applicant_ethnicity_fitted'].categories_

[array([1., 2., 3., 4.])]

In [63]:
# Encode the two feature columns. The result is only displayed, not bound to a
# name, and the whole frame is rendered.
# NOTE(review): consider assigning the result and showing .head() instead.
onehot.transform(data[['loan_purpose','applicant_ethnicity']])

Unnamed: 0,loan_purpose_1.0,loan_purpose_2.0,loan_purpose_3.0,applicant_ethnicity_1.0,applicant_ethnicity_2.0,applicant_ethnicity_3.0,applicant_ethnicity_4.0
0,0,1,0,0,1,0,0
1,1,0,0,0,1,0,0
2,1,0,0,1,0,0,0
3,0,0,1,0,1,0,0
4,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...
319995,1,0,0,0,1,0,0
319996,0,0,1,0,1,0,0
319997,0,1,0,0,1,0,0
319998,0,0,1,0,1,0,0


In [None]:
# NOTE(review): `osumi` is not defined in any cell visible here. Its displayed
# columns are `occupancy` plus the one-hot columns, so it presumably came from
# onehot.transform on those three source columns — confirm and restore that cell.
# Per-column flag: True when the column contains at least one missing value.
osumi.isnull().any()

In [None]:
# Single boolean: True only if no cell of `osumi` is missing.
osumi.notnull().values.all()

In [94]:
# Per-column check that every value is finite (no NaN or +/-inf).
np.isfinite(osumi).all()

occupancy                  True
loan_purpose_1.0           True
loan_purpose_2.0           True
loan_purpose_3.0           True
applicant_ethnicity_1.0    True
applicant_ethnicity_2.0    True
applicant_ethnicity_3.0    True
applicant_ethnicity_4.0    True
dtype: bool

In [95]:
# Preview the encoded feature frame.
osumi.head()

Unnamed: 0,occupancy,loan_purpose_1.0,loan_purpose_2.0,loan_purpose_3.0,applicant_ethnicity_1.0,applicant_ethnicity_2.0,applicant_ethnicity_3.0,applicant_ethnicity_4.0
0,1,0,1,0,0,1,0,0
1,1,1,0,0,0,1,0,0
2,1,1,0,0,1,0,0,0
3,1,0,0,1,0,1,0,0
4,1,0,0,1,1,0,0,0


In [96]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from sklearn.ensemble import GradientBoostingClassifier

# Fixed random_state keeps the fit reproducible.
gradientboost = GradientBoostingClassifier(n_estimators= 300,learning_rate=0.1,random_state=42)
# NOTE(review): `osumi` is not defined in any cell visible here, so this cell will
# fail on a fresh kernel. The fit uses the `accepted` column of the full `data`
# frame as the target, with no hold-out split visible here.
gradientboost.fit(osumi,data.accepted)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=300,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [87]:
# Peek at the raw (un-encoded) values of two of the source columns.
data[['loan_purpose','occupancy']].head()

Unnamed: 0,loan_purpose,occupancy
0,2,1
1,1,1
2,1,1
3,3,1
4,3,1


In [12]:
# Scratch LabelEncoder instance; nothing else in the visible cells uses it.
lb = LabelEncoder()

In [None]:
# NOTE(review): never executed and called with no data; LabelEncoder.fit presumably
# needs the values to encode — finish or delete this cell.
lb.fit()

In [13]:
# Scratch OneHotEncoder with default settings; separate from the `onehot` wrapper above.
hothot = OneHotEncoder()

In [None]:
# NOTE(review): never executed and called with no data; OneHotEncoder.fit presumably
# needs a 2-D feature array — finish or delete this cell.
hothot.fit()

In [None]:
# NOTE(review): repeats the earlier gradient-boosting cell, including its import.
from sklearn.ensemble import GradientBoostingClassifier

gradientboost = GradientBoostingClassifier(n_estimators= 300,learning_rate=0.1,random_state=42)
# NOTE(review): `X_train` and `y_train` are not defined in any cell visible here and
# this cell was never executed — add the train/test split cell or remove this one.
gradientboost.fit(X_train,y_train)