# Анализ данных

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib as plt

In [3]:
df = pd.read_csv("Bank_Personal_Loan_Modelling.csv", sep=";")
df.sample(10)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
3179,3180,43,17,53,90245,2,70,1,130,0,1,0,1,0
2922,2923,52,26,49,90011,1,140,3,0,0,0,0,1,0
3917,3918,41,15,89,94608,3,10,1,292,0,0,0,1,0
3293,3294,44,20,62,94939,2,250,1,0,0,1,0,1,0
2686,2687,50,24,81,95053,2,40,3,0,0,0,0,0,0
1904,1905,38,14,91,95060,2,0,1,0,0,0,0,1,0
703,704,41,17,141,94022,2,760,1,92,0,0,0,0,0
4005,4006,56,32,32,95827,2,80,1,79,0,1,0,1,0
565,566,55,29,79,90210,3,80,1,0,0,0,0,1,0
3321,3322,41,15,120,94521,1,520,1,0,0,0,0,1,0


# Пояснение
* ID - Идентификационный номер
* age - Возраст
* Experience - Опытность
* income - Доход
* ZIP code - почтовый индекс
* Family - Фамилия
* CCAvg - Средние траты с кредитки за месяц
* Education -  Образование
* Mortgage - ипотека
* Personal Loan - Личный займ
* Securities Account - Защищенный аккаунт
* CD account - Учетная запись компакт-диска
* Online - Подключен ли к онлайн банку
* Credit card - Кредитная карта

In [5]:
df.shape

(5000, 14)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  5000 non-null   int64 
 1   Age                 5000 non-null   int64 
 2   Experience          5000 non-null   int64 
 3   Income              5000 non-null   int64 
 4   ZIP Code            5000 non-null   int64 
 5   Family              5000 non-null   int64 
 6   CCAvg               5000 non-null   object
 7   Education           5000 non-null   int64 
 8   Mortgage            5000 non-null   int64 
 9   Personal Loan       5000 non-null   int64 
 10  Securities Account  5000 non-null   int64 
 11  CD Account          5000 non-null   int64 
 12  Online              5000 non-null   int64 
 13  CreditCard          5000 non-null   int64 
dtypes: int64(13), object(1)
memory usage: 547.0+ KB


In [7]:
df.isna().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [8]:
df.describe()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2500.5,45.3384,20.1046,73.7742,93152.503,2.3964,1.881,56.4988,0.096,0.1044,0.0604,0.5968,0.294
std,1443.520003,11.463166,11.467954,46.033729,2121.852197,1.147663,0.839869,101.713802,0.294621,0.305809,0.23825,0.490589,0.455637
min,1.0,23.0,-3.0,8.0,9307.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1250.75,35.0,10.0,39.0,91911.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.5,45.0,20.0,64.0,93437.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,3.0,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,5000.0,67.0,43.0,224.0,96651.0,4.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


In [9]:
df.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

### Мы получили полную информацию о дата сете, и видим что пропуском в нем нет, так же аномальных значений не выявлено

In [11]:
forheat = df
forheat.drop(['CCAvg'], axis=1, inplace = True)
forheat

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,3,0,0,0,0,1,0
4996,4997,30,4,15,92037,4,1,85,0,0,0,1,0
4997,4998,63,39,24,93023,2,3,0,0,0,0,0,0
4998,4999,65,40,49,90034,3,2,0,0,0,0,1,0


In [12]:
df

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,3,0,0,0,0,1,0
4996,4997,30,4,15,92037,4,1,85,0,0,0,1,0
4997,4998,63,39,24,93023,2,3,0,0,0,0,0,0
4998,4999,65,40,49,90034,3,2,0,0,0,0,1,0


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X = df.iloc[:, 0:12]
y = df['CreditCard']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
X.head(1)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online
0,1,25,1,49,91107,4,1,0,0,1,0,0


In [16]:
y.head(10)

0    0
1    0
2    0
3    0
4    1
5    0
6    0
7    1
8    0
9    0
Name: CreditCard, dtype: int64

In [17]:
X_train.shape

(4000, 12)

In [18]:
X_test.shape

(1000, 12)

### Мы видим что 20% дата сета выделилось на тестовую часть


In [20]:
y_train.value_counts()

CreditCard
0    2822
1    1178
Name: count, dtype: int64

In [21]:
y_test.value_counts()

CreditCard
0    708
1    292
Name: count, dtype: int64

In [22]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [23]:
y_pred =  knn.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [24]:
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(y_test,y_pred)
cm

array([[583, 125],
       [247,  45]], dtype=int64)

In [25]:
from sklearn.metrics import f1_score
print(f1_score(y_test, y_pred))

0.19480519480519481


In [26]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

0.628


### Мы видим что значения у F1_score меньше нормы, но мы примем эти данные за неимением выбора ಥ_ಥ

In [28]:
from sklearn.metrics import recall_score
print(recall_score(y_test,y_pred))

0.1541095890410959


### Очень "хороший" показатель, даже слишком, возьмем другие модели ༼ つ ◕_◕ ༽つ ---> (Другая модель)

In [30]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=21)
tree.fit(X_train, y_train)

In [31]:
tree_prediction = tree.predict(X_test)
tree_prediction

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,

In [32]:
from sklearn.metrics import *

print(accuracy_score(tree_prediction, y_test))
print(confusion_matrix(tree_prediction, y_test))
print(classification_report(tree_prediction, y_test))

0.626
[[515 181]
 [193 111]]
              precision    recall  f1-score   support

           0       0.73      0.74      0.73       696
           1       0.38      0.37      0.37       304

    accuracy                           0.63      1000
   macro avg       0.55      0.55      0.55      1000
weighted avg       0.62      0.63      0.62      1000



In [33]:
from sklearn.naive_bayes import GaussianNB

In [34]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [35]:
gnb_pred_test = gnb.predict(X_test)

In [36]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, gnb_pred_test)

0.752

In [37]:
gnb_accuracy = accuracy_score(y_test, gnb_pred_test)

#### Итог: Мы увидели что используя KNN наши значения были приближены к норме, используя DTC почти тоже самое.
#### Однако GNB показал наивысший результат, что может нас порадовать