In [88]:
import pandas as pd
import matplotlib as plt  
from sklearn.model_selection import train_test_split

##### Read data file from folder

In [89]:
# Load the credit data; the CSV path is relative to the notebook's working directory.
data_set = pd.read_csv(filepath_or_buffer="Credit.csv")

In [90]:
# Preview the first five rows of the freshly loaded frame.
data_set.head(n=5)

Unnamed: 0.1,Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,3,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,4,148.924,9504,681,3,36,11,Female,No,No,Asian,964
4,5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331


##### Delete the first column (`Unnamed: 0`, a row counter stored in the CSV that duplicates the index)

In [91]:
# Drop the CSV's leftover row-counter column ("Unnamed: 0"); in the preview it
# simply repeats the row index shifted by one.
# errors="ignore" keeps this cell idempotent: because `data_set` is reassigned,
# re-running the cell would otherwise raise KeyError once the column is gone.
data_set = data_set.drop(columns="Unnamed: 0", errors="ignore")

##### Some insights about the data

In [92]:
# (rows, columns) of the frame at this point in the notebook.
data_set.shape

(400, 11)

In [93]:
# Summary statistics; with default arguments only the numeric columns are described.
data_set.describe()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Balance
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,45.218885,4735.6,354.94,2.9575,55.6675,13.45,520.015
std,35.244273,2308.198848,154.724143,1.371275,17.249807,3.125207,459.758877
min,10.354,855.0,93.0,1.0,23.0,5.0,0.0
25%,21.00725,3088.0,247.25,2.0,41.75,11.0,68.75
50%,33.1155,4622.5,344.0,3.0,56.0,14.0,459.5
75%,57.47075,5872.75,437.25,4.0,70.0,16.0,863.0
max,186.634,13913.0,982.0,9.0,98.0,20.0,1999.0


In [94]:
# Count missing values per column (isna is the same check as isnull).
data_set.isna().sum()

Income       0
Limit        0
Rating       0
Cards        0
Age          0
Education    0
Gender       0
Student      0
Married      0
Ethnicity    0
Balance      0
dtype: int64

90 rows have a value of zero in the target column (`Balance`), as the value counts below show.

In [95]:
# The five most frequent Balance values; value_counts() sorts by frequency,
# most common first, so slicing the top five rows is enough.
count_balance_values = (
    data_set["Balance"]
    .value_counts()
    .iloc[:5]
)
count_balance_values

0       90
133      3
1048     3
531      3
333      2
Name: Balance, dtype: int64

#### Subtask a

`pd.get_dummies`, the function recommended in the task, is used to one-hot encode the categorical columns `Gender` and `Ethnicity` into binary indicator columns.

In [96]:
# One-hot encode the nominal columns: every category becomes its own indicator
# column named "<column>_<category>" and the original column is removed.
# dtype=int pins the indicators to 0/1 integers. Without it the dtype depends on
# the pandas version (bool on pandas >= 2.0, uint8 before), so the 0/1 output
# shown below would not be reproduced everywhere.
data_set = pd.get_dummies(
    data_set,
    columns=['Gender', 'Ethnicity'],
    dtype=int,
)

In [97]:
# Inspect the first five rows after the one-hot encoding step.
data_set.head(n=5)

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Student,Married,Balance,Gender_Female,Gender_Male,Ethnicity_African American,Ethnicity_Asian,Ethnicity_Caucasian
0,14.891,3606,283,2,34,11,No,Yes,333,0,1,0,0,1
1,106.025,6645,483,3,82,15,Yes,Yes,903,1,0,0,1,0
2,104.593,7075,514,4,71,11,No,No,580,0,1,0,1,0
3,148.924,9504,681,3,36,11,No,No,964,1,0,0,1,0
4,55.882,4897,357,2,68,16,No,Yes,331,0,1,0,0,1


To avoid adding extra columns, the two Yes/No columns `Student` and `Married` were mapped in place to the integers 0 and 1 using the pandas `replace` method.

In [98]:
# Encode the Yes/No columns as 0/1 in place, so no extra columns are created.
# The explicit astype(int) makes the integer dtype deliberate: relying on
# replace() to downcast the object columns is deprecated in pandas 2.2
# (FutureWarning). It also fails loudly if a value other than Yes/No
# (or an already-encoded 0/1) is present.
# Re-running the cell is safe: replace() leaves existing 0/1 values untouched.
data_set[['Student', 'Married']] = (
    data_set[['Student', 'Married']]
    .replace({'No': 0, 'Yes': 1})
    .astype(int)
)

In [99]:
# Inspect the first five rows after the Student/Married columns were encoded.
data_set.head(n=5)

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Student,Married,Balance,Gender_Female,Gender_Male,Ethnicity_African American,Ethnicity_Asian,Ethnicity_Caucasian
0,14.891,3606,283,2,34,11,0,1,333,0,1,0,0,1
1,106.025,6645,483,3,82,15,1,1,903,1,0,0,1,0
2,104.593,7075,514,4,71,11,0,0,580,0,1,0,1,0
3,148.924,9504,681,3,36,11,0,0,964,1,0,0,1,0
4,55.882,4897,357,2,68,16,0,1,331,0,1,0,0,1


#### Subtask b

In [100]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
train, test = train_test_split(
    data_set,
    random_state=1,
    test_size=0.20,
)

In [101]:
# (rows, columns) of the training split.
train.shape

(320, 14)

In [102]:
# (rows, columns) of the held-out test split.
test.shape

(80, 14)