# Import the relevant Libraries

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
from bs4 import BeautifulSoup
import ast
from urllib.request import Request,urlopen
from itertools import combinations 

# Get the column headers from the .names file and add them to the dataset

In [None]:
# Load the raw records. header=None makes pandas treat the first line as data
# and assign integer column labels until the real names are attached.
dataset = pd.read_csv('house-votes-84.data', header=None)

# Read the companion .names description inside a context manager so the file
# handle is closed as soon as parsing is done (the bare open() leaked it).
with open('house-votes-84.names') as names_file:
    # NOTE(review): the parser is pinned to lxml instead of being auto-detected.
    # The find('p') below presumably relies on lxml wrapping bare text in a
    # <p> element; other parsers do not, so auto-detection made the result
    # depend on which parsers happen to be installed - confirm.
    soup_link2 = BeautifulSoup(names_file, 'lxml')
table_header = soup_link2.find('p')

def before(value, a):
    """Return the part of ``value`` that precedes the first occurrence of ``a``.

    An empty string is returned when ``a`` does not occur in ``value``.
    """
    cut = value.find(a)
    return value[:cut] if cut != -1 else ""

def after(value, a):
    """Return the part of ``value`` that follows the last occurrence of ``a``.

    An empty string is returned when ``a`` does not occur in ``value`` or when
    nothing follows its last occurrence.
    """
    start = value.rfind(a)
    if start < 0:
        return ""
    # Step over the marker itself so the result begins right after it.
    start += len(a)
    return value[start:] if start < len(value) else ""

# Isolate the attribute listing: everything between the section 7 heading and
# the start of section 8 in the parsed .names text.
header = before(
    after(table_header.text, "7. Attribute Information:\n"),
    "\n8. Missing Attribute Values: Denoted by",
)

# One attribute name per newline counted in the listing; each name sits
# between its "<number>. " prefix and the following ":".
list_header = []
for number in range(1, header.count('\n') + 1):
    # Numbers up to 10 are searched with a leading space - presumably so that
    # e.g. "1. " is not found inside "11. ".
    prefix = " " if number <= 10 else ""
    entry = after(header, prefix + str(number) + ". ")
    list_header.append(str(before(entry, ":")))

dataset.columns = list_header
dataset.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
dataset.shape

##### There are 434 rows and 17 columns 

In [None]:
# Summarise the object-typed columns of the frame.
dataset.describe(include = 'object')

#### From the table above, we can see the basic structure of the dataset:
#### All columns contain categorical data
#### The first column, "Class Name", has 2 unique values (republican and democrat)
#### The 16 other columns have 3 unique values: y, n and ? (where ? marks a missing value)

# Checking for missing values

#### Missing values are filled with "?" in the dataset, so we have to take care of them


In [None]:
## Replace the "?" placeholder with the most frequent value of each column
from sklearn.impute import SimpleImputer

# `verbose` is deliberately not passed: the parameter was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises a TypeError.
# Omitting it keeps the previously requested behaviour (its default was 0).
imputer = SimpleImputer(missing_values="?", strategy='most_frequent')

# Column 0 is left untouched; only columns 1 onwards are imputed.
vote_columns = dataset.iloc[:, 1:]
# fit_transform learns the per-column mode and fills in one step; `imputer`
# stays fitted afterwards, as before.
dataset.iloc[:, 1:] = imputer.fit_transform(vote_columns)
display(dataset)

#### Now all the missing values are filled with the most frequent value of their column
#### Because only 2 unique values remain in every column, we don't need dummy variables

# Calculate Correlation Matrix of dataset to check correlations among columns 

#### Because all columns of the dataset hold categorical values, we cannot simply use the corr() function of the Pandas dataframe; we need another measure of association between two categorical features.
#### Cramér’s V, which is based on a nominal variation of Pearson’s Chi-Square Test, handles this scenario

In [None]:
def cramers_v(confusion_matrix):
    """Return the bias-corrected Cramér's V for a two-way contingency table.

    Parameters
    ----------
    confusion_matrix : 2-D table of counts
        Must expose ``.sum().sum()`` and a two-element ``.shape`` (for example
        the result of ``pd.crosstab``).

    Returns
    -------
    float
        The association strength. ``0.0`` is returned for degenerate tables
        (a single row or column, or a single observation), where the statistic
        is undefined and the plain formula would divide by zero and yield NaN.
    """
    # Element 0 of the result is the chi-square statistic.
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape

    # With a single observation the (n - 1) corrections below divide by zero.
    if n <= 1:
        return 0.0

    phi2 = chi2 / n
    # Bias correction of phi^2 and of the table dimensions.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    # A single row or column makes the corrected dimension term zero.
    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)

# Pairwise Cramér's V between every two columns, stored in a symmetric matrix.
# The diagonal is never written and therefore stays at 0.
cols = list(dataset.columns)
corrM = np.zeros((len(cols), len(cols)))
for (row, left), (column, right) in combinations(enumerate(cols), 2):
    association = cramers_v(pd.crosstab(dataset[left], dataset[right]))
    # Mirror the value so both triangles of the matrix are filled.
    corrM[row, column] = corrM[column, row] = association

corr = pd.DataFrame(corrM, index=cols, columns=cols)
display(corr.abs())


# Visualize Correlation Matrix of dataset by heat map

In [None]:
# Large fonts and a large canvas so the annotated cells stay legible.
sns.set(font_scale=2)
fig, ax = plt.subplots(figsize=(30, 30))
ax = sns.heatmap(corr, annot=True, ax=ax)
# Trailing semicolon kept from the original cell - presumably there to stop
# the notebook from echoing the object returned by set_title.
ax.set_title("Cramer V Correlation between Variables");

# Selecting features based on correlation

#### We compare the correlation between features and remove one of any two features whose correlation is 0.9 or higher

In [None]:
# Keep-mask over the columns of `corr`: True means the column is retained.
n_features = corr.shape[0]
columns = np.full((n_features,), True, dtype=bool)
for i in range(n_features):
    for j in range(i + 1, n_features):
        # For every pair at or above the threshold, drop the later column.
        if corr.iloc[i, j] >= 0.9:
            columns[j] = False

# Build the selection once, after the scan. Previously these names were only
# assigned inside the threshold branch, so a dataset with no highly correlated
# pair left `new_dataset` undefined and the lines below raised a NameError.
selected_columns = dataset.columns[columns]
new_dataset = dataset[selected_columns]

# List the dropped columns in their original order (a set difference would
# report them in arbitrary order).
remove_columns = [name for name in dataset.columns if name not in selected_columns]
print(str(len(remove_columns))+" column removed which is: "+str(remove_columns))

#### Now dataset has only those columns with correlation less than 0.9

In [None]:
# Preview the first rows of the reduced frame.
display(new_dataset.head())

# Calculate distribution of selected features

#### We create a new distribution dataframe for the calculation

In [None]:
# Count, for every remaining feature, how often each (class, value) pair
# occurs. Every feature is handled by the same loop: the table no longer has
# to be seeded from a hard-coded "handicapped-infants" column, which raised a
# KeyError whenever the correlation filter had removed that column.
class_labels = new_dataset["Class Name"]
stacked_counts = {}
for col in new_dataset.columns:
    if col == "Class Name":
        continue
    buffer = pd.crosstab(index=class_labels, columns=new_dataset[col]).stack()
    # Flatten the (class, value) MultiIndex into labels such as "<class>_<value>".
    buffer.index = ['_'.join(idx) for idx in buffer.index]
    stacked_counts[col] = buffer

# Building from a dict of Series aligns the rows by label rather than by
# position, so a feature whose crosstab lacks some (class, value) pair gets a
# 0 there instead of breaking the assignment or landing in the wrong row.
distribution = pd.DataFrame(stacked_counts).fillna(0).astype("int64")
display(distribution)

# Plotting the data to visualize their distribution

#### Using a stacked bar chart to plot the distribution

In [None]:
# Smaller font scale than the heatmap cell uses.
sns.set(font_scale=1.5)
# Transpose so that each feature becomes one bar along the x axis.
per_feature = distribution.T
per_feature.plot(kind='bar', stacked=True, figsize=(20, 8), label='big')

In [None]:
#### 1 ################################################################################################################
#### END 1

# Make classifier with SVM
#### Import libraries

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

#### Encode and split the training set

In [None]:
# Work on a copy so the encoders never touch the selected-feature frame.
training_dataset = new_dataset.copy()

# Target: the 'Class Name' column, mapped to integer codes.
dataset_classname = training_dataset['Class Name']
le = LabelEncoder()
Y = le.fit_transform(dataset_classname)

# Predictors: every other column, one-hot encoded.
dataset_features = training_dataset.drop(columns='Class Name')
oe = OneHotEncoder()
X = oe.fit_transform(dataset_features)

#### Tune the model parameters

In [None]:
# Search space: regularisation strength, kernel coefficient mode and kernel.
param_grid = {
    'C': np.arange(1, 10, 0.3),
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}

# Exhaustive search with 10-fold cross-validation on all available cores;
# refit=True retrains the best configuration on the full data afterwards.
grid = GridSearchCV(
    SVC(),
    param_grid,
    refit=True,
    verbose=0,
    cv=10,
    n_jobs=-1,
)
grid.fit(X, Y)

print("Best 10-fold accuracy:", grid.best_score_)
print(grid.best_estimator_)

In [None]:
#### 7
#### END 7

In [None]:
#### 8
#### END 8

In [None]:
#### 9
#### END 9

In [None]:
#### 10
#### END 10

In [None]:
#### 11
#### END 11

In [None]:
#### 12
#### END 12

In [None]:
#### 13
#### END 13

In [None]:
#### 14
#### END 14

In [None]:
#### 15
#### END 15

In [None]:
#### 16 ##############################################################################################################
#### END 16

In [None]:
#### 17
#### END 17

In [None]:
#### 18
#### END 18

In [None]:
#### 19
#### END 19

In [None]:
#### 20
#### END 20

In [None]:
#### 21
#### END 21

In [None]:
#### 22
#### END 22

In [None]:
#### 23
#### END 23

In [None]:
#### 24
#### END 24

In [None]:
#### 25
#### END 25

In [None]:
#### 26
#### END 26

In [None]:
#### 26
#### END 26

In [None]:
#### 27
#### END 27

In [None]:
#### 28
#### END 28

In [None]:
#### 29
#### END 29

In [None]:
#### 30
#### END 30

In [None]:
#### 31 ###############################################################################################################
#### END 31

In [None]:
#### 32
#### END 32

In [None]:
#### 33
#### END 33

In [None]:
#### 34
#### END 34

In [None]:
#### 35
#### END 35

In [None]:
#### 36
#### END 36

In [None]:
#### 37
#### END 37

In [None]:
#### 38
#### END 38

In [None]:
#### 39
#### END 39

In [None]:
#### 40
#### END 40

In [None]:
#### 41
#### END 41

In [None]:
#### 42
#### END 42

In [None]:
#### 43
#### END 43

In [None]:
#### 44
#### END 44

In [None]:
#### 45
#### END 45