In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA

In [2]:
# UCI "credit-screening" data file; it is fetched over the network each time this cell runs.
link = 'http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data'
# header=None: the first line of the file is data, so the columns get the integer labels 0..15
df = pd.read_csv(link, header = None)
# Preview the first rows
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [3]:
# Missing entries are encoded as the string '?'; turn them into real NaNs.
# (np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.)
df = df.replace('?', np.nan)
# Column 1 was read as strings because of the '?' markers; cast it to float.
# Assigning by column label replaces the whole column and therefore its dtype.
# `df.iloc[:, 1] = ...` writes into the existing object column on pandas >= 2.0,
# which would leave the dtype as object.
df[1] = df[1].astype(float)
# Dtypes / non-null counts, followed by the number of NaNs per column
df.info()
print('\nNans in each column:')
print(df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
0     678 non-null object
1     678 non-null float64
2     690 non-null float64
3     684 non-null object
4     684 non-null object
5     681 non-null object
6     681 non-null object
7     690 non-null float64
8     690 non-null object
9     690 non-null object
10    690 non-null int64
11    690 non-null object
12    690 non-null object
13    677 non-null object
14    690 non-null int64
15    690 non-null object
dtypes: float64(3), int64(2), object(11)
memory usage: 56.6+ KB

Nans in each column:
0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64


In [4]:
# Summary statistics for every column, numeric and non-numeric alike
df.describe(include= 'all')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
count,678,678.0,690.0,684,684,681,681,690.0,690,690,690.0,690,690,677.0,690.0,690
unique,2,,,3,3,14,9,,2,2,,2,3,170.0,,2
top,b,,,u,g,c,v,,t,f,,f,g,0.0,,-
freq,468,,,519,519,137,399,,361,395,,374,625,132.0,,383
mean,,31.568171,4.758725,,,,,2.223406,,,2.4,,,,1017.385507,
std,,11.957862,4.978163,,,,,3.346513,,,4.86294,,,,5210.102598,
min,,13.75,0.0,,,,,0.0,,,0.0,,,,0.0,
25%,,22.6025,1.0,,,,,0.165,,,0.0,,,,0.0,
50%,,28.46,2.75,,,,,1.0,,,0.0,,,,5.0,
75%,,38.23,7.2075,,,,,2.625,,,3.0,,,,395.5,


In [5]:
import ml_nechai as ml

#### Dropping data

In [6]:
# Baseline: run ml.dropna over the five numeric feature columns
# (presumably this removes the rows that have a NaN in any of them - TODO confirm)
df1 = ml.dropna(df, subset=[1, 2, 7, 10, 14])
# Column 15 is the class label; keep it as a categorical target
targets = df1[15].astype('category')
# One colour per sample for plotting -> '-': red, anything else ('+'): green
label_color = ['green' if label != '-' else 'red' for label in targets]
print(label_color[:3], label_color[-3:])

['green', 'green', 'green'] ['red', 'red', 'red']


In [7]:
# Hold out 20% of the rows as a test set (fixed seed, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    df1[[1, 2, 7, 10, 14]],
    targets,
    test_size=0.2,
    random_state=0,
)
# Build the logistic-regression classifier and fit it on the training part
lr = LogisticRegression().fit(X_train, y_train)
# Label the held-out rows
y_pred = lr.predict(X_test)
# Share of held-out rows that were labelled correctly
accuracy_score(y_test, y_pred)



0.7205882352941176

#### Preprocessing

In [8]:
# NOTE(review): `df` is reassigned here, so re-running this cell applies the
# scaling helpers to the already-scaled frame again.
# Judging by the describe() output of the result, ml.normalize appears to
# z-score its columns (2 and 7 end up with mean ~0 and std 1) - TODO confirm
# against ml_nechai.
df = ml.normalize(df, columns = [2, 7, 10])
# ml.standardize appears to min-max scale its columns (1, 10 and 14 end up
# with min 0) - TODO confirm. Column 10 is passed to both helpers.
# NOTE(review): both calls see the full data set, i.e. scaling happens before
# any train/test split.
df = ml.standardize(df, columns = [1, 10, 14])
# Preview the scaled frame
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,0.256842,-0.95592,u,g,w,v,-0.290872,t,t,0.014925,f,g,202,0.0,+
1,a,0.675489,-0.060007,u,g,q,h,0.244013,t,t,0.089552,f,g,43,0.0056,+
2,a,0.161654,-0.855481,u,g,q,h,-0.216167,t,f,0.0,f,g,280,0.00824,+
3,b,0.211729,-0.646569,u,g,w,v,0.456175,t,t,0.074627,t,g,100,3e-05,+
4,b,0.096541,0.174015,u,g,w,v,-0.153415,t,f,0.0,f,s,120,0.0,+


In [9]:
# Re-check the summary statistics after scaling
df.describe(include= 'all')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
count,678,678.0,690.0,684,684,681,681,690.0,690,690,690.0,690,690,677.0,690.0,690
unique,2,,,3,3,14,9,,2,2,,2,3,170.0,,2
top,b,,,u,g,c,v,,t,f,,f,g,0.0,,-
freq,468,,,519,519,137,399,,361,395,,374,625,132.0,,383
mean,,0.267942,1.0297720000000001e-17,,,,,1.029772e-16,,,0.035821,,,,0.010174,
std,,0.179817,1.0,,,,,1.0,,,0.072581,,,,0.052101,
min,,0.0,-0.9559198,,,,,-0.6643947,,,0.0,,,,0.0,
25%,,0.13312,-0.7550425,,,,,-0.6150897,,,0.0,,,,0.0,
50%,,0.221203,-0.4035072,,,,,-0.3655762,,,0.0,,,,5e-05,
75%,,0.36812,0.4919034,,,,,0.1200038,,,0.044776,,,,0.003955,


_Displaying the rows with NaNs in column 1_

In [10]:
# Rows whose value in column 1 is missing
df.loc[df[1].isna()]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
83,a,,-0.252849,u,g,d,v,0.232061,t,f,0.0,t,g,300.0,0.0,-
86,b,,-0.880591,u,g,d,v,-0.402929,t,f,0.0,t,s,928.0,0.0,-
92,b,,0.048467,y,p,aa,v,1.875562,t,f,0.0,f,g,0.0,0.0,-
97,b,,-0.855481,u,g,c,bb,-0.414881,t,f,0.0,t,s,320.0,0.0,-
254,b,,-0.830371,u,g,k,v,-0.58969,f,f,0.0,f,g,380.0,0.0201,-
286,a,,-0.654604,u,g,ff,ff,-0.664395,f,t,0.029851,t,g,200.0,0.00105,-
329,b,,-0.152411,y,p,i,v,-0.638995,f,f,0.0,t,g,411.0,0.0,-
445,a,,1.30395,u,g,ff,ff,-0.664395,f,f,0.0,f,g,,0.052,-
450,b,,-0.353288,y,p,i,bb,1.427335,f,f,0.0,f,g,0.0,1e-05,-
500,b,,-0.152411,u,g,x,v,0.829698,t,t,0.044776,t,g,290.0,0.02279,+


#### First approach: KNN imputation

In [11]:
# Fill the missing values of column 1 with ml.replacenaKNN, passing columns 2, 7
# and 10 as `train` (presumably the columns used to find the neighbours - TODO
# confirm against ml_nechai). The result goes to df1; `df` is reused by later cells.
df1 = ml.replacenaKNN(df, 1, train = [2,7,10])

In [12]:
# Show the rows whose column 1 was missing before imputation. The mask is taken
# from `df` (expected to still hold the NaNs, since later cells impute from it
# again) instead of a hand-copied list of row positions, so it cannot go stale
# when the data or the preprocessing changes.
# Assumes df1 keeps df's row index - TODO confirm for ml.replacenaKNN.
df1.loc[df[1].isna()]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
83,a,0.173143,-0.252849,u,g,d,v,0.232061,t,f,0.0,t,g,300.0,0.0,-
86,b,0.150105,-0.880591,u,g,d,v,-0.402929,t,f,0.0,t,s,928.0,0.0,-
92,b,0.169654,0.048467,y,p,aa,v,1.875562,t,f,0.0,f,g,0.0,0.0,-
97,b,0.249143,-0.855481,u,g,c,bb,-0.414881,t,f,0.0,t,s,320.0,0.0,-
254,b,0.257895,-0.830371,u,g,k,v,-0.58969,f,f,0.0,f,g,380.0,0.0201,-
286,a,0.230316,-0.654604,u,g,ff,ff,-0.664395,f,t,0.029851,t,g,200.0,0.00105,-
329,b,0.181925,-0.152411,y,p,i,v,-0.638995,f,f,0.0,t,g,411.0,0.0,-
445,a,0.179188,1.30395,u,g,ff,ff,-0.664395,f,f,0.0,f,g,,0.052,-
450,b,0.263398,-0.353288,y,p,i,bb,1.427335,f,f,0.0,f,g,0.0,1e-05,-
500,b,0.228301,-0.152411,u,g,x,v,0.829698,t,t,0.044776,t,g,290.0,0.02279,+


In [13]:
# Fill the remaining gaps (columns 0, 3, 4, 5, 6 and 13) through ml.replacena with
# mode='mode' - presumably each column's most frequent value; TODO confirm.
df1 = ml.replacena(df1, subset= [0,3,4,5,6,13], mode = 'mode')

In [14]:
# Dtypes / non-null counts, then the number of NaNs per column of the filled frame
df1.info()
print('\nNans in each column:', df1.isna().sum(), sep='\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
0     690 non-null object
1     690 non-null float64
2     690 non-null float64
3     690 non-null object
4     690 non-null object
5     690 non-null object
6     690 non-null object
7     690 non-null float64
8     690 non-null object
9     690 non-null object
10    690 non-null float64
11    690 non-null object
12    690 non-null object
13    690 non-null object
14    690 non-null float64
15    690 non-null object
dtypes: float64(5), object(11)
memory usage: 56.6+ KB

Nans in each column:
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64


In [15]:
# Class label (column 15) as a categorical target
targets = df1[15].astype('category')
# One colour per sample for plotting -> '-': red, anything else ('+'): green
label_color = ['green' if label != '-' else 'red' for label in targets]
# Hold out 20% of the rows as a test set (fixed seed, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    df1[[1, 2, 7, 10, 14]],
    targets,
    test_size=0.2,
    random_state=0,
)
# Build the logistic-regression classifier and fit it on the training part
lr = LogisticRegression().fit(X_train, y_train)
# Label the held-out rows
y_pred = lr.predict(X_test)
# Share of held-out rows that were labelled correctly
accuracy_score(y_test, y_pred)



0.6739130434782609

Accuracy dropped compared to simply dropping the rows (0.72 → 0.67) |x_x|    
Trying something else

#### Mean replace

In [16]:
# Start again from the scaled `df` and fill column 1 with ml.replacena's default
# mode (presumably the column mean, going by this section's heading - TODO confirm).
df1 = ml.replacena(df, subset=[1])

In [17]:
# Fill the remaining gaps (columns 0, 3, 4, 5, 6 and 13) through ml.replacena with
# mode='mode' - presumably each column's most frequent value; TODO confirm.
df1 = ml.replacena(df1, subset= [0,3,4,5,6,13], mode = 'mode')

In [18]:
# Class label (column 15) as a categorical target
targets = df1[15].astype('category')
# One colour per sample for plotting -> '-': red, anything else ('+'): green
label_color = ['green' if label != '-' else 'red' for label in targets]
# Hold out 20% of the rows as a test set (fixed seed, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    df1[[1, 2, 7, 10, 14]],
    targets,
    test_size=0.2,
    random_state=0,
)
# Build the logistic-regression classifier and fit it on the training part
lr = LogisticRegression().fit(X_train, y_train)
# Label the held-out rows
y_pred = lr.predict(X_test)
# Share of held-out rows that were labelled correctly
accuracy_score(y_test, y_pred)



0.6884057971014492

Just a little better than KNN imputation (0.688 vs. 0.674)

#### Median replace

In [19]:
# Start again from the scaled `df` and fill column 1 through ml.replacena with mode='median'
df1 = ml.replacena(df, subset=[1], mode = 'median')

In [20]:
# Fill the remaining gaps (columns 0, 3, 4, 5, 6 and 13) through ml.replacena with
# mode='mode' - presumably each column's most frequent value; TODO confirm.
df1 = ml.replacena(df1, subset= [0,3,4,5,6,13], mode = 'mode')

In [21]:
# Class label (column 15) as a categorical target
targets = df1[15].astype('category')
# One colour per sample for plotting -> '-': red, anything else ('+'): green
label_color = ['green' if label != '-' else 'red' for label in targets]
# Hold out 20% of the rows as a test set (fixed seed, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    df1[[1, 2, 7, 10, 14]],
    targets,
    test_size=0.2,
    random_state=0,
)
# Build the logistic-regression classifier and fit it on the training part
lr = LogisticRegression().fit(X_train, y_train)
# Label the held-out rows
y_pred = lr.predict(X_test)
# Share of held-out rows that were labelled correctly
accuracy_score(y_test, y_pred)



0.6739130434782609

Worse than mean replacement — back down to the KNN score (0.674) =(

#### Linear regression

In [22]:
# Start again from the scaled `df` and fill column 1 through ml.replacena_lr
# (presumably a regression-based fill, going by this section's heading - TODO
# confirm which columns it uses as predictors).
df1 = ml.replacena_lr(df, 1)

In [23]:
# Fill the remaining gaps (columns 0, 3, 4, 5, 6 and 13) through ml.replacena with
# mode='mode' - presumably each column's most frequent value; TODO confirm.
df1 = ml.replacena(df1, subset= [0,3,4,5,6,13], mode = 'mode')

In [24]:
# Class label (column 15) as a categorical target
targets = df1[15].astype('category')
# One colour per sample for plotting -> '-': red, anything else ('+'): green
label_color = ['green' if label != '-' else 'red' for label in targets]
# Hold out 20% of the rows as a test set (fixed seed, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    df1[[1, 2, 7, 10, 14]],
    targets,
    test_size=0.2,
    random_state=0,
)
# Build the logistic-regression classifier and fit it on the training part
lr = LogisticRegression().fit(X_train, y_train)
# Label the held-out rows
y_pred = lr.predict(X_test)
# Share of held-out rows that were labelled correctly
accuracy_score(y_test, y_pred)



0.6884057971014492

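Accuracy is slightly higher than with KNN (0.688 vs. 0.674, the same as mean replacement), but still lower than simply dropping the rows (0.721). Note that the dropping baseline was fit on the unscaled features and evaluated on a different test split (the rows were removed before splitting), so the comparison is only approximate.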