In [1]:
#import libraries
import numpy as np 
import pandas as pd
from pgmpy.models import BayesianNetwork



In [2]:
# Load the three tab-separated data files: the training set, the set to be
# classified (used as test data below) and the file holding its true classes.
kunden_train, kunden_test, kunden_realclass = [
    pd.read_table(path)
    for path in (
        "data_dmc2002_train.txt",
        "data_dmc2002_class.txt",
        "data_dmc2002_realclass.txt",
    )
]

kunden_train.head()

Unnamed: 0,ID,payment_type,power_consumption,HHH,HGEW,MTREG0G,MTKAU0G,MTSTR0G,MTBEB0G,MTSTA0G,...,SCMWGR7,SCMWGR21,SCMWGR22,PHARM1,PHARM2,PHARM3,PHARM4,PHARM5,PHARM6,canceler
0,22891,2,138,7.0,0.0,16.0,-7.0,1.0,3.0,1.0,...,3.0,2.0,3.0,1.0,1.0,6.0,7.0,2.0,6.0,no
1,26663,1,6034,2.0,1.0,14.0,-8.0,1.0,1.0,2.0,...,5.0,4.0,2.0,6.0,2.0,3.0,4.0,4.0,6.0,no
2,2253,2,4996,2.0,0.0,16.0,-6.0,1.0,1.0,6.0,...,5.0,2.0,3.0,7.0,5.0,2.0,3.0,7.0,6.0,no
3,26769,2,1514,1.0,0.0,15.0,-7.0,1.0,1.0,2.0,...,2.0,3.0,5.0,6.0,1.0,2.0,1.0,7.0,7.0,no
4,11264,2,3115,8.0,0.0,16.0,-5.0,1.0,3.0,1.0,...,4.0,2.0,2.0,3.0,1.0,4.0,5.0,6.0,6.0,no


In [3]:
# Data cleaning & preprocessing of the training and test data.

# Columns that are not used by the Bayesian network and are removed.
UNUSED_COLUMNS = ["payment_type", "power_consumption", "HHH", "HGEW", "MTREG0G",
                  "MTKAU0G", "MTSTR0G", "MTADE0G", "MTKDI0G", "MTKLE0G", "MTKKL0G", "MTKGB0G", "MTKGL0G",
                  "PHARM1", "PHARM2", "PHARM3", "PHARM4", "PHARM5", "PHARM6",
                  'SCMWGR4', 'SCMWGR7', 'SCMWGR21', 'SCMWGR22']

# Readable names for the coded columns that are kept.
COLUMN_LABELS = {'MTBEB0G': 'Bebauungstyp', 'MTSTA0G': 'Status', 'MTBON0G': 'Bonität',
                 'MTALT0G': 'Alter', 'MTFAM0G': 'Familienstand'}


def select_and_rename(frame):
    """Return a copy of ``frame`` without the unused columns and with readable column names."""
    return frame.drop(columns=UNUSED_COLUMNS).rename(columns=COLUMN_LABELS)


# Training data: remove exact duplicate rows first. The previous
# `kunden_train.drop_duplicates` only referenced the method without calling
# it, so nothing was ever removed.
kunden_train = select_and_rename(kunden_train.drop_duplicates())

# Missing values are filled with the medians of the *training* data.
train_medians = kunden_train.median(numeric_only=True)
kunden_train = kunden_train.fillna(train_medians)

# Test data: same column selection and renaming. Rows are deliberately NOT
# de-duplicated, because the evaluation compares the predictions with
# kunden_realclass row by row and dropping rows would break that alignment.
# Missing values are filled with the training medians as well, so no
# statistic computed on the test data is used.
kunden_test = select_and_rename(kunden_test).fillna(train_medians)

# Persist the cleaned training data.
kunden_train.to_csv('Kunden')



In [4]:
# Preview the first five rows of the cleaned training data.
kunden_train.head(n=5)

Unnamed: 0,ID,Bebauungstyp,Status,Bonität,Alter,Familienstand,SCMWGR2,SCMWGR3,SCMWGR5,SCMWGR6,canceler
0,22891,3.0,1.0,2.0,5.0,2.0,3.0,5.0,4.0,3.0,no
1,26663,1.0,2.0,2.0,4.0,7.0,5.0,1.0,5.0,4.0,no
2,2253,1.0,6.0,8.0,4.0,9.0,5.0,5.0,3.0,3.0,no
3,26769,1.0,2.0,1.0,6.0,7.0,3.0,2.0,2.0,2.0,no
4,11264,3.0,1.0,8.0,4.0,3.0,3.0,5.0,2.0,1.0,no


In [5]:
# Create the Bayesian network from its directed (parent, child) edges.
network_edges = [
    ('Bonität', 'Status'),
    ('Status', 'Bebauungstyp'),
    # direct parents of the target variable
    ('Status', 'canceler'),
    ('Bebauungstyp', 'canceler'),
    ('Alter', 'canceler'),
    ('Alter', 'Familienstand'),
    # the SCMWGR* variables all point to 'Bonität'
    ('SCMWGR2', 'Bonität'),
    ('SCMWGR3', 'Bonität'),
    ('SCMWGR5', 'Bonität'),
    ('SCMWGR6', 'Bonität'),
]
model = BayesianNetwork(network_edges)

In [6]:
# Estimate the conditional probability table of 'canceler' from the
# training data by maximum likelihood and print it.
from pgmpy.estimators import MaximumLikelihoodEstimator

mle = MaximumLikelihoodEstimator(model=model, data=kunden_train)
cpd_canceler = mle.estimate_cpd('canceler')
print(cpd_canceler)






+---------------+-------------------+-----+-------------------+
| Alter         | Alter(1.0)        | ... | Alter(8.0)        |
+---------------+-------------------+-----+-------------------+
| Bebauungstyp  | Bebauungstyp(0.0) | ... | Bebauungstyp(5.0) |
+---------------+-------------------+-----+-------------------+
| Status        | Status(1.0)       | ... | Status(9.0)       |
+---------------+-------------------+-----+-------------------+
| canceler(no)  | 0.5               | ... | 0.5               |
+---------------+-------------------+-----+-------------------+
| canceler(yes) | 0.5               | ... | 0.5               |
+---------------+-------------------+-----+-------------------+


In [None]:
# Predict the 'canceler' class for every customer in the test data.
from pgmpy.inference import VariableElimination

# Learn the network parameters from the training data and set up inference.
model.fit(kunden_train)
inference = VariableElimination(model)

# Every network variable except the target is passed as observed evidence.
evidence_columns = ['Bebauungstyp', 'Status', 'Bonität', 'Alter', 'Familienstand',
                    'SCMWGR2', 'SCMWGR3', 'SCMWGR5', 'SCMWGR6']

# Collect the predictions in a plain list and build the result frame once.
# Growing a DataFrame with pd.concat inside the loop copies all previous
# rows on every iteration (quadratic in the number of customers).
predictions = []
for evidence in kunden_test[evidence_columns].to_dict('records'):
    # Most probable 'canceler' state given this customer's attribute values.
    r = inference.map_query(['canceler'], evidence=evidence, show_progress=False)
    predictions.append(r['canceler'])

# One row per test customer, in the original row order and with the original
# index: the customer ID and the predicted class.
df_predict = kunden_test[['ID']].copy()
df_predict['canceler'] = predictions

In [None]:
# Score the predictions against the true classes of the test customers.
from sklearn.metrics import precision_recall_fscore_support,accuracy_score,recall_score

# NOTE(review): the two columns are compared by position, not by customer ID;
# this assumes kunden_realclass and df_predict are in the same row order — confirm.
y_true = kunden_realclass['canceler']
y_pred = df_predict['canceler']

accu = accuracy_score(y_true, y_pred)
prec, recall, fscore, u = precision_recall_fscore_support(y_true, y_pred)

# NOTE(review): the per-class arrays follow the sorted label order, so index 0
# is presumably the 'no' class rather than 'yes' — confirm this is the class
# that should be reported.
print(f"Accuracy: {accu*100!s}%")
print(f"Precision: {prec[0]*100:.2f}%")
print(f"Recall: {recall[0]*100:.2f}%")
print(f"F1: {fscore[0]*100:.2f}%")
