In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('dataset/german_credit_data_weka_dataset.csv')
data.head()

Unnamed: 0,checking_account_status,duration,credit_history,purpose,credit_amount,savings,present_employment,installment_rate,personal,other_debtors,...,property,age,other_installment_plans,housing,existing_credits,job,dependents,telephone,foreign_worker,customer_type
0,A11,6,A34,A43,1169.0,A65,A75,4.0,A93,A101,...,A121,67.0,A143,A152,2.0,A173,1,A192,A201,1
1,A12,48,A32,A43,5951.0,A61,A73,2.0,A92,A101,...,A121,22.0,A143,A152,1.0,A173,1,A191,A201,2
2,A14,12,A34,A46,2096.0,A61,A74,2.0,A93,A101,...,A121,49.0,A143,A152,1.0,A172,2,A191,A201,1
3,A11,42,A32,A42,7882.0,A61,A74,2.0,A93,A103,...,A122,45.0,A143,A153,1.0,A173,2,A191,A201,1
4,A11,24,A33,A40,4870.0,A61,A73,3.0,A93,A101,...,A124,53.0,A143,A153,2.0,A173,2,A191,A201,2


In [4]:
data.shape

(1000, 21)

In [6]:
data.columns

Index(['checking_account_status', 'duration', 'credit_history', 'purpose',
       'credit_amount', 'savings', 'present_employment', 'installment_rate',
       'personal', 'other_debtors', 'present_residence', 'property', 'age',
       'other_installment_plans', 'housing', 'existing_credits', 'job',
       'dependents', 'telephone', 'foreign_worker', 'customer_type'],
      dtype='object')

In [7]:
data = data.drop(['telephone', 'personal', 'present_residence', 'other_installment_plans'], axis= 1)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   checking_account_status  1000 non-null   object 
 1   duration                 1000 non-null   int64  
 2   credit_history           1000 non-null   object 
 3   purpose                  1000 non-null   object 
 4   credit_amount            1000 non-null   float64
 5   savings                  1000 non-null   object 
 6   present_employment       1000 non-null   object 
 7   installment_rate         1000 non-null   float64
 8   other_debtors            1000 non-null   object 
 9   property                 1000 non-null   object 
 10  age                      1000 non-null   float64
 11  housing                  1000 non-null   object 
 12  existing_credits         1000 non-null   float64
 13  job                      1000 non-null   object 
 14  dependents               

In [10]:
data['savings'].unique()

array(['A65', 'A61', 'A63', 'A64', 'A62'], dtype=object)

In [11]:
from sklearn.preprocessing import LabelEncoder

savings_dict = {'A65':0, 'A61':1, 'A62':2, 'A63':3, 'A64':4}
data['savings'].replace(savings_dict, inplace= True)
data.head()

Unnamed: 0,checking_account_status,duration,credit_history,purpose,credit_amount,savings,present_employment,installment_rate,other_debtors,property,age,housing,existing_credits,job,dependents,foreign_worker,customer_type
0,A11,6,A34,A43,1169.0,0,A75,4.0,A101,A121,67.0,A152,2.0,A173,1,A201,1
1,A12,48,A32,A43,5951.0,1,A73,2.0,A101,A121,22.0,A152,1.0,A173,1,A201,2
2,A14,12,A34,A46,2096.0,1,A74,2.0,A101,A121,49.0,A152,1.0,A172,2,A201,1
3,A11,42,A32,A42,7882.0,1,A74,2.0,A103,A122,45.0,A153,1.0,A173,2,A201,1
4,A11,24,A33,A40,4870.0,1,A73,3.0,A101,A124,53.0,A153,2.0,A173,2,A201,2


In [12]:
data = pd.get_dummies(data, columns=['checking_account_status',
                                     'credit_history',
                                     'purpose',
                                     'present_employment',
                                     'property',
                                     'housing',
                                     'other_debtors',
                                     'job',
                                     'foreign_worker'])
data.shape

(1000, 48)

In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [15]:
def naive_bayes(x_train, y_train):
    
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)
    
    return classifier

In [16]:
def k_nn(x_train, y_train):
    
    classifier = KNeighborsClassifier(n_neighbors= 10)
    classifier.fit(x_train, y_train)
    
    return classifier

In [17]:
def svc(x_train, y_train):
    
    classifier = SVC(kernel= 'rbf', gamma= 'scale')
    classifier.fit(x_train, y_train)
    
    return classifier

In [18]:
def decision_tree(x_train, y_train):
    
    classifier = DecisionTreeClassifier(max_depth= 6)
    classifier.fit(x_train, y_train)
    
    return classifier

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [20]:
X = data.drop('customer_type', axis= 1)
Y = data['customer_type']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2)

In [21]:
def build_train_classifier(x_train, y_train, classification_fn):
    
    model = classification_fn(x_train, y_train)
    y_pred = model.predict(x_test)
    
    train_score = model.score(x_train, y_train)
    test_score = accuracy_score(y_test, y_pred)
    
    print("trainig score is ", train_score)
    print("testing score is ", test_score)

In [22]:
build_train_classifier(x_train, y_train, naive_bayes)

trainig score is  0.7525
testing score is  0.71


In [23]:
build_train_classifier(x_train, y_train, k_nn)

trainig score is  0.70625
testing score is  0.72


In [24]:
build_train_classifier(x_train, y_train, svc)

trainig score is  0.70125
testing score is  0.74


In [25]:
build_train_classifier(x_train, y_train, decision_tree)

trainig score is  0.82625
testing score is  0.695


In [26]:
#warm start

In [28]:
x_train1, x_train2, y_train1, y_train2 = train_test_split(x_train, y_train, test_size= 0.5)

In [30]:
rfc = RandomForestClassifier(max_depth= 4, n_estimators= 2, warm_start= True)

rfc.fit(x_train1, y_train1)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=True)

In [31]:
y_pred = rfc.predict(x_test)
test_score = accuracy_score(y_test, y_pred)
print("testing score is ", test_score)

testing score is  0.635


In [32]:
rfc.n_estimators += 2
rfc.fit(x_train2, y_train2)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=4,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=True)

In [33]:
y_pred = rfc.predict(x_test)
test_score = accuracy_score(y_test, y_pred)
print("testing score is ", test_score)

testing score is  0.73
