In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,recall_score, f1_score,precision_score,accuracy_score
# import missingno as mns, pip install missingno
# skilearn is moduel in changer of mancgi

In [2]:
df  = pd.read_csv('credit_customers.csv')
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad


In [3]:
missing_column_values = df.isnull().sum()
missing_colun_per= (df.isnull().sum()/len(df)) * 100
total_missing_values = pd.concat([missing_column_values,missing_colun_per], axis = 1,keys=['Missing values','Percentage'])
total_missing_values =total_missing_values.sort_values('Percentage', ascending= False)
total_missing_values.head(20)

Unnamed: 0,Missing values,Percentage
checking_status,0,0.0
property_magnitude,0,0.0
foreign_worker,0,0.0
own_telephone,0,0.0
num_dependents,0,0.0
job,0,0.0
existing_credits,0,0.0
housing,0,0.0
other_payment_plans,0,0.0
age,0,0.0


In [4]:
# checking the shape of the data
df.shape

(1000, 21)

In [5]:
df['class'].value_counts()

class
good    700
bad     300
Name: count, dtype: int64

In [6]:
from sklearn.utils import resample
df_good = df[df['class'] == 'good']
df_bad = df[df['class']== 'bad']
df_sam = resample(df_bad,n_samples= 700)
df = pd.concat([df_good, df_sam], ignore_index= True)
df = df.sample(frac= 1)
df['class'].value_counts()

class
bad     700
good    700
Name: count, dtype: int64

In [7]:
encoder = LabelEncoder()
categorical_data = df.select_dtypes(include=['object','category'])
for x in categorical_data:
    df[x] = encoder.fit_transform(df[x])
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
1259,1,24.0,1,4,1199.0,2,3,4.0,3,2,...,0,60.0,1,1,2.0,3,1.0,0,1,0
47,3,24.0,3,6,3181.0,2,2,4.0,0,2,...,1,26.0,1,1,1.0,1,1.0,1,1,1
605,3,36.0,3,0,7409.0,4,3,3.0,3,2,...,1,37.0,1,1,2.0,1,1.0,0,1,1
638,0,36.0,3,2,3711.0,4,0,2.0,2,2,...,0,27.0,1,1,1.0,1,1.0,0,1,1
394,3,24.0,3,6,1552.0,2,1,3.0,3,2,...,0,32.0,0,1,1.0,1,2.0,0,1,1


In [8]:
x = df.drop('class', axis=1)
y = df['class']
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size= 0.2, random_state= 1)

In [9]:
model1 = LogisticRegression(solver= 'newton-cholesky')
model1.fit(xtrain,ytrain)

In [10]:
pred1 = model1.predict(xtest)
print('\nClassificaton Report\n', classification_report(ytest,pred1))
print('\nAccuracy Report\n', accuracy_score(ytest,pred1))
print('\nPrecision Report\n', precision_score(ytest,pred1))
print('\nRecall Report\n', recall_score(ytest,pred1))
print('\nF1 score Report\n', f1_score(ytest,pred1))
print('\nonfunsion Matix  Report\n', confusion_matrix(ytest,pred1))


Classificaton Report
               precision    recall  f1-score   support

           0       0.69      0.72      0.70       137
           1       0.72      0.69      0.70       143

    accuracy                           0.70       280
   macro avg       0.70      0.70      0.70       280
weighted avg       0.70      0.70      0.70       280


Accuracy Report
 0.7

Precision Report
 0.7153284671532847

Recall Report
 0.6853146853146853

F1 score Report
 0.7

onfunsion Matix  Report
 [[98 39]
 [45 98]]


In [11]:
model2 = RandomForestClassifier()
model2.fit(xtrain,ytrain)

In [12]:
pred2 = model2.predict(xtest)
print('\nClassificaton Report\n', classification_report(ytest,pred2))
print('\nAccuracy Report\n', accuracy_score(ytest,pred2))
print('\nPrecision Report\n', precision_score(ytest,pred2))
print('\nRecall Report\n', recall_score(ytest,pred2))
print('\nF1 score Report\n', f1_score(ytest,pred2))
print('\nconfunsion Matix  Report\n', confusion_matrix(ytest,pred2))


Classificaton Report
               precision    recall  f1-score   support

           0       0.89      0.93      0.91       137
           1       0.93      0.89      0.91       143

    accuracy                           0.91       280
   macro avg       0.91      0.91      0.91       280
weighted avg       0.91      0.91      0.91       280


Accuracy Report
 0.9107142857142857

Precision Report
 0.9338235294117647

Recall Report
 0.8881118881118881

F1 score Report
 0.910394265232975

confunsion Matix  Report
 [[128   9]
 [ 16 127]]


In [13]:
df[df['class']== 1].transpose()

Unnamed: 0,47,605,638,394,498,232,543,186,356,329,...,126,294,3,230,386,415,125,192,139,688
checking_status,3.0,3.0,0.0,3.0,0.0,1.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,0.0,3.0,3.0,0.0,3.0,0.0,3.0
duration,24.0,36.0,36.0,24.0,9.0,15.0,36.0,15.0,6.0,24.0,...,36.0,14.0,36.0,24.0,24.0,9.0,10.0,10.0,18.0,13.0
credit_history,3.0,3.0,3.0,3.0,3.0,2.0,1.0,3.0,3.0,3.0,...,3.0,2.0,3.0,3.0,3.0,3.0,3.0,1.0,4.0,3.0
purpose,6.0,0.0,2.0,6.0,6.0,3.0,4.0,9.0,4.0,9.0,...,4.0,4.0,2.0,3.0,6.0,3.0,4.0,4.0,3.0,6.0
credit_amount,3181.0,7409.0,3711.0,1552.0,2118.0,3643.0,3535.0,3812.0,3518.0,2670.0,...,909.0,802.0,9055.0,4351.0,1311.0,1388.0,7308.0,1231.0,3244.0,1409.0
savings_status,2.0,4.0,4.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0,...,1.0,2.0,4.0,4.0,0.0,2.0,2.0,2.0,2.0,0.0
employment,2.0,3.0,0.0,1.0,0.0,3.0,1.0,2.0,0.0,3.0,...,3.0,0.0,0.0,0.0,1.0,0.0,4.0,3.0,0.0,4.0
installment_commitment,4.0,3.0,2.0,3.0,2.0,1.0,4.0,1.0,2.0,4.0,...,4.0,4.0,2.0,1.0,4.0,4.0,2.0,3.0,1.0,2.0
personal_status,0.0,3.0,2.0,3.0,3.0,0.0,3.0,0.0,3.0,3.0,...,3.0,3.0,3.0,0.0,2.0,0.0,3.0,3.0,0.0,0.0
other_parties,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


### horse classification

In [61]:
dg = pd.read_csv('horse.csv')
dg.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101.0,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300.0,0.0,0,no
1,yes,adult,534817.0,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208.0,0.0,0,no
2,no,adult,530334.0,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0.0,0.0,0,yes
3,yes,young,5290409.0,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208.0,0.0,0,yes
4,no,adult,530255.0,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300.0,0.0,0,no


In [62]:
missing_column_values = dg.isnull().sum()
missing_colun_per= (dg.isnull().sum()/len(df)) * 100
total_missing_values = pd.concat([missing_column_values,missing_colun_per], axis = 1,keys=['Missing values','Percentage'])
total_missing_values =total_missing_values.sort_values('Percentage', ascending= False)
total_missing_values.head(20)

Unnamed: 0,Missing values,Percentage
nasogastric_reflux_ph,246,17.571429
abdomo_protein,201,14.357143
abdomo_appearance,168,12.0
abdomen,124,8.857143
nasogastric_reflux,109,7.785714
nasogastric_tube,107,7.642857
rectal_exam_feces,107,7.642857
peripheral_pulse,73,5.214286
rectal_temp,65,4.642857
respiratory_rate,64,4.571429


In [63]:
numerical_data = dg.select_dtypes(include=['int','float'])
categorical_data = dg.select_dtypes(include=['object','category'])
for x in numerical_data:
    dg[x].fillna(np.mean(dg[x]), inplace= True)

for x in categorical_data:
    dg[x].fillna(dg[x].mode()[0], inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dg[x].fillna(np.mean(dg[x]), inplace= True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dg[x].fillna(dg[x].mode()[0], inplace = True)


In [64]:
dg.shape

(299, 28)

In [65]:
dg['outcome'].value_counts()

outcome
lived         182
died           76
euthanized     41
Name: count, dtype: int64

In [66]:
dg['outcome'].unique()

array(['died', 'euthanized', 'lived'], dtype=object)

In [67]:
from sklearn.utils import resample
dg_lived = dg[dg['outcome'] == 'lived']
dg_died = dg[dg['outcome'] == 'died']
dg_euthanized = dg[dg['outcome'] == 'euthanized']
dg_sam1 = resample(dg_died, n_samples= 182)
dg_sam2 = resample(dg_euthanized, n_samples= 182)
dg = pd.concat([dg_lived, dg_sam1,dg_sam2], ignore_index=True)
dg= dg.sample(frac= 1)
dg['outcome'].value_counts()

outcome
euthanized    182
lived         182
died          182
Name: count, dtype: int64

In [68]:
encoder = LabelEncoder()
categorical_data = dg.select_dtypes(include=['object','category'])
for x in categorical_data:
    dg[x] = encoder.fit_transform(dg[x])
dg.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
383,0,0,529493.0,38.3,112.0,16.0,1,3,1,2,...,51.0,6.0,1,1.0,1,0,5205.0,0.0,0,1
73,1,0,528461.0,38.0,42.0,68.0,0,2,3,1,...,41.0,7.6,1,3.054082,2,1,2205.0,0.0,0,0
337,0,0,529461.0,40.3,114.0,36.0,1,3,3,2,...,57.0,8.1,2,4.5,0,1,3205.0,0.0,0,1
65,1,0,534963.0,38.18547,40.0,30.455319,3,2,3,1,...,39.0,56.0,1,3.054082,2,1,3111.0,0.0,0,0
495,0,0,528006.0,38.18547,40.0,16.0,1,2,3,1,...,50.0,7.0,1,3.9,1,1,2208.0,0.0,0,1


In [86]:
x = dg.drop('outcome', axis = 1)
y = dg['outcome']
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size= 0.15, random_state= 1)

In [100]:
model1 = RandomForestClassifier()
model1.fit(xtrain, ytrain)

In [101]:
pred1 = model1.predict(xtest)
print('\nClassificaton Report\n', classification_report(ytest,pred1))
print('\nAccuracy Report\n', accuracy_score(ytest,pred1))
print('\nPrecision Report\n',precision_score(ytest,pred1, average='macro'))
print('\nRecall Report\n', recall_score(ytest,pred1, average= 'macro'))
print('\nF1 score Report\n', f1_score(ytest,pred1, average= 'macro'))
print('\nconfunsion Matix  Report\n', confusion_matrix(ytest,pred1))


Classificaton Report
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        24
           1       0.97      1.00      0.99        38
           2       1.00      0.90      0.95        20

    accuracy                           0.98        82
   macro avg       0.98      0.97      0.97        82
weighted avg       0.98      0.98      0.98        82


Accuracy Report
 0.975609756097561

Precision Report
 0.9781196581196582

Recall Report
 0.9666666666666667

F1 score Report
 0.9713244149334375

confunsion Matix  Report
 [[24  0  0]
 [ 0 38  0]
 [ 1  1 18]]
