##  Learning pipeline

In [1]:
import pandas as pd
df = pd.read_csv('train.csv')

In [2]:
#Inspect df
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
df.shape
#12 columns with 614 rows

(614, 13)

In [4]:
#Find data types
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [5]:
#Havve both categorical and numeric variables
# Gender, married, dependents, education, self_employed, property_area, and loan status all objects
# Will have to possibly encode these columns
# Dropping Loan_ID so I can really test data
df = df.drop('Loan_ID', axis =1)

In [6]:
dfcat = (df[['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']])
dfcat.head()
#inspecting cat columns closer

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,Urban,Y
1,Male,Yes,1,Graduate,No,Rural,N
2,Male,Yes,0,Graduate,Yes,Urban,Y
3,Male,Yes,0,Not Graduate,No,Urban,Y
4,Male,No,0,Graduate,No,Urban,Y


In [7]:
#Looks like Gender, Married, Self_employed, and Loan_status can be one hot encoded as only 2 values.
#Dependents is a string and can be converted to an int
#Check unique values for Education and Property_area
#df['Dependents'].astype(str).astype(int)
#Tested using this but returned error as some non integer elements 
#Error returned invalid literal for int() with base 10: '3+'


In [8]:
#Find out how many unique values for dependents and what is causing error
#3+is causing error
df['Dependents'].value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [9]:
#Replaced 3+ with 3
df['Dependents']= df['Dependents'].replace(to_replace ="3+", value ="3")
#Check if it worked
df['Dependents'].value_counts()

0    345
1    102
2    101
3     51
Name: Dependents, dtype: int64

In [10]:
df['Dependents'] = pd.to_numeric(df['Dependents'])

In [11]:
df["Education"].value_counts()
#Gave me unique values and it's only 2 can use one hot encoding

Graduate        480
Not Graduate    134
Name: Education, dtype: int64

In [12]:
df["Property_Area"].value_counts()
#Gave me 3 unique values low cardinality and can use one hot

Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64

In [13]:
df.dtypes
#now Dependents a float

Gender                object
Married               object
Dependents           float64
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [14]:
#Split training data into a train and test so I can validate performance of model
from sklearn.model_selection import train_test_split
X= df.drop('Loan_Status', axis=1)  #Dropping loan status as that will be target.  These will be dependent variables
y =df['Loan_Status'] #Target and Independent variable
#Important part as this will give me my train test split.  Size i'm guessing will give me 20 percent test size
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.2)  

In [15]:
#Pipeline being used here to build a transformer type.  Create transformer for different variable types
#Importing Pipeline process I'm not sure how it works
from sklearn.pipeline import Pipeline
#Simpleimputer will fill in any missing values.  Nice function if there is values missing which will result in errors
from sklearn.impute import SimpleImputer
#Standardscaler standardizes a feature by subtracting the mean and then scaling to unit variance. 
#Unit variance means dividing all the values by the standard deviation
#Results in a standard deviation equal to 1
#Onehot great for low cardinality will make a column for each unique value and transforms cat data into int
from sklearn.preprocessing import StandardScaler, OneHotEncoder
#numeric transformer seems to be using transformer to work on numerical data, then simpleimputer to fill values, and 
#puts all this into pipeline, which will give an estimator  
#Pipeline applys a list of transforms and a final estimator.  Final estimator implements it
#steps is list of(name, transfor), (implementing/transform)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
#categorical_transformer used on categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [16]:
#This will work with columns with int and float dtypes in DF  Will give an index 
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
#This will work with categorical features and drop Loan Status our target
categorical_features = df.select_dtypes(include=['object']).drop(['Loan_Status'], axis=1).columns

In [17]:
numeric_features
#Worked selected only those with numeric features

Index(['Dependents', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [18]:
categorical_features
#Worked showing cat features in index form

Index(['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area'], dtype='object')

In [19]:
##Use of ColumnTransformer to apply the transformations to only the correct columns.  COOL
from sklearn.compose import ColumnTransformer
#Preprocessing using index created previously of numeric and cat and Column transformer
#Implementing the features and transformer into a a way to be fitted into pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [21]:
#Fitting classifier. Combine the preprocessor created with a classifier
#Used RandomForestClassifier is a meta estimator that fits a number of decision tree classifiers 
#on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.
from sklearn.ensemble import RandomForestClassifier
#Using random forest classifier Added preprocessed features.
#Pipeline will include steps list of name, transform and way of implementing it
rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

In [22]:
#Now calling the fit method on raw data and the preprocessing steps will be applied followed by training classifier
rf.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [25]:
#Prediction on new data will give predicted results not score
y_pred = rf.predict(X_test)

In [26]:
#This will give a score on model implementation
from sklearn.metrics import accuracy_score, log_loss
#KneighborsClassifier is a classifier implementing the k-nearest neighbors vote
from sklearn.neighbors import KNeighborsClassifier
#SVC is C-support vector classfication the fit time scales at least quadratically with the number of samples
#may be impracticl beyond tens of thousnds of samples.  For LARGE DATA SETS
#NUSVC NU SUPPORT VECOTR CLASSIFICATION similiar to SVC but uses a parameter to control the number of support vectors
from sklearn.svm import SVC, LinearSVC, NuSVC
#Decision tree classifier makes decision trees
from sklearn.tree import DecisionTreeClassifier
#Adaboost classifier is a meta estimator that begins by fitting a classifier on the original data set
#then fits additional copies of the classifier on the same dataset but where the weights of incorrectly classified
#are adjusted such that subsequent classifiers focus more on difficult cases
#Gradiant boosting is an additive model in foward stage fashion.  It allows for optimization of arbitrary differentiable
#loss functions
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
#Linear discriminant a classifier with a linear decision boundary, generated by fitting class conditional densities
#using Bayes rule.  Assumes all classes share same covariance matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#Quadratic discriminant is a classifier with a quadratic decision boundary.  Generated by fitting class conditional
#densities to the data using Bayes rule
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
    ]
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    pipe.fit(X_train, y_train)   
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
model score: 0.740
SVC(C=0.025, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
model score: 0.699
NuSVC(cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='rbf', max_iter=-1, nu=0.5, probability=True, random_state=None,
      shrinking=True, tol=0.001, verbose=False)
model score: 0.797




DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
model score: 0.675
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
model score: 0.740
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learni

In [None]:
#NUSVC gave best score .797
#Decision Tree gave worst score .675

In [27]:
#pipeline used in grid search to find best performing parameters
#Exhaustive search over specified parameter values for an estimator
#Gridsearchcv class computes accuracy metrics for an algorithm on varioius combinations of parameters, 
#over a cross-validation procedure.  Useful for finding the best parameters for a prediction algorithm
param_grid = { 
    'classifier__n_estimators': [200, 500],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth' : [4,5,6,7,8],
    'classifier__criterion' :['gini', 'entropy']}
from sklearn.model_selection import GridSearchCV
CV = GridSearchCV(rf, param_grid, n_jobs= 1)
                  
CV.fit(X_train, y_train)
print(CV.best_params_)    
print(CV.best_score_)



{'classifier__criterion': 'gini', 'classifier__max_depth': 4, 'classifier__max_features': 'auto', 'classifier__n_estimators': 200}
0.8126272912423625
