**Downloading Necessary Data**

In [2]:
import pandas as pd
df = pd.read_csv('bank-full.csv', sep=';')

**Inspecting Data**

In [3]:
print(f"the fields and their types are as follows:")
for column in df.columns:
  print(df[column].dtype, column, sep='\t')

#let's see if any categorical field contains unknown values
for column in df.columns:
  if df[column].dtype == 'object':
    print(df[column].unique())

#it seems the fields job, education, contact and poutcome fields contain unknown data
nrows_with_unknown_val = len(df.loc[(df['job'] == 'unknown') 
                            | (df['education'] == 'unknown') 
                            | (df['contact'] == 'unknown') 
                            | (df['poutcome'] == 'unknown')])
nrows_total = len(df)

print(f"{nrows_with_unknown_val} out of {nrows_total} rows contain unknown data for some of the fields")

for column in df.columns:
  if df[column].dtype == 'int64':
    print(df[column].unique())

#it seems the numerical data is alright

the fields and their types are as follows:
int64	age
object	job
object	marital
object	education
object	default
int64	balance
object	housing
object	loan
object	contact
int64	day
object	month
int64	duration
int64	campaign
int64	pdays
int64	previous
object	poutcome
object	y
['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']
['married' 'single' 'divorced']
['tertiary' 'secondary' 'unknown' 'primary']
['no' 'yes']
['yes' 'no']
['no' 'yes']
['unknown' 'cellular' 'telephone']
['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'jan' 'feb' 'mar' 'apr' 'sep']
['unknown' 'failure' 'other' 'success']
['no' 'yes']
37369 out of 45211 rows contain unknown data for some of the fields
[58 44 33 47 35 28 42 43 41 29 53 57 51 45 60 56 32 25 40 39 52 46 36 49
 59 37 50 54 55 48 24 38 31 30 27 34 23 26 61 22 21 20 66 62 83 75 67 70
 65 68 64 69 72 71 19 76 85 63 90 82 73 74 78 80 94 79 77 86 95 81 18 89
 84 87 92 93 

**Necessary Processing For Calculation**

In [4]:
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

#unknown values don't provide any useful information
#rows containing unknown values are dropped for further ananlysis
df = df.loc[(df['job'] != 'unknown') 
    & (df['education'] != 'unknown') 
    & (df['contact'] != 'unknown') 
    & (df['poutcome'] != 'unknown')]


#assign numerical labels to all values
for column in df.columns:
  if df[column].dtype == 'object':
    unq, num_label = np.unique(df[column], return_inverse=True)
    df[column] = num_label

df_no_sub = df[df['y'] == 0]
df_sub = df[df['y'] == 1]

#downsampling to make equal samples of each y label
df_no_sub = resample(df_no_sub, replace=False, n_samples=1786, random_state=16)
df = pd.concat([df_sub, df_no_sub])

#devide feature and labels
X = df.drop('y', axis=1).copy()
y = df['y'].copy()

#one-hot-encoding categorical data (the attributes with just yes-no values don't have to be encoded)
X_enc = pd.get_dummies(X, columns=['job', 'marital', 'education', 'contact', 'poutcome'])

#split train and test data
X_train, X_test, y_train, y_test = train_test_split(X_enc, y, random_state=7)

X_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = num_label


Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,...,marital_1,marital_2,education_0,education_1,education_2,contact_0,contact_1,poutcome_0,poutcome_1,poutcome_2
26194,34,0,296,0,0,20,9,68,1,136,...,0,1,0,1,0,1,0,1,0,0
44815,60,0,975,0,0,16,11,303,1,792,...,1,0,0,1,0,1,0,1,0,0
38481,28,0,372,1,1,15,8,201,3,294,...,0,1,0,0,1,1,0,0,0,1
40505,36,0,4,1,0,7,5,529,1,61,...,0,1,0,0,1,1,0,0,0,1
24662,56,0,237,1,0,17,9,117,3,131,...,0,1,0,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44583,24,0,431,0,0,19,1,418,11,185,...,0,1,0,1,0,1,0,0,0,1
37932,38,0,1407,1,0,14,8,169,3,174,...,1,0,0,1,0,1,0,1,0,0
40869,36,0,3850,1,0,12,1,239,2,100,...,0,1,0,0,1,1,0,1,0,0
43532,73,0,796,0,0,23,0,888,1,182,...,1,0,1,0,0,1,0,0,0,1


**Model Train**

In [5]:
from sklearn.svm import SVC

clf_svm = SVC(random_state=42)
clf_svm.fit(X_train, y_train)

print(clf_svm.score(X_test, y_test))

0.7077267637178052
