# Imports

In [11]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
data_path = '/content/drive/MyDrive/ZIWM projekt/data/hepatitis.data'

In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', None)

# Load data

In [14]:
columns = ['class', 'age', 'sex', 'seroid', 'antviral',
           'fatigue', 'malaise', 'anorexia', 'liver_big', 'liver_firm',
           'spleen_palpable',  'spiders', 'ascites', 'varices', 'bilirubin', 
           'alk_phosphate', 'sgot', 'albumin', 'protime', 'histology']

columns_to_drop = ['alk_phosphate', 'protime']

df = pd.read_csv(data_path, names=columns)
df = df.drop(columns_to_drop, axis=1)

df = df[df != '?']
df = df.dropna()
df = df.reset_index(drop=True)
df[['bilirubin', 'sgot', 'albumin']] = df[['bilirubin', 'sgot', 'albumin']].astype('float')
df[['class', 'sex', 'seroid', 'antviral', 'fatigue', 'malaise', 'anorexia', 'liver_big', 'liver_firm', 'spleen_palpable', 'spiders', 'ascites', 'varices', 'histology']] = df[['class', 'sex', 'seroid', 'antviral', 'fatigue', 'malaise', 'anorexia', 'liver_big', 'liver_firm', 'spleen_palpable', 'spiders', 'ascites', 'varices', 'histology']].astype(int)
df.replace({1: 0, 2: 1}, inplace=True)

X = df.drop(['class'], axis=1)
y = df['class']

X.head()

Unnamed: 0,age,sex,seroid,antviral,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,sgot,albumin,histology
0,30,1,0,1,1,1,1,0,1,1,1,1,1,0.0,18.0,4.0,0
1,50,0,0,1,0,1,1,0,1,1,1,1,1,0.9,42.0,3.5,0
2,78,0,1,1,0,1,1,1,1,1,1,1,1,0.7,32.0,4.0,0
3,34,0,1,1,1,1,1,1,1,1,1,1,1,0.0,200.0,4.0,0
4,34,0,1,1,1,1,1,1,1,1,1,1,1,0.9,28.0,4.0,0


# Features ranking

In [15]:
chi2_selector = SelectKBest(chi2, k=X.shape[1])
chi2_selector.fit(X, y)

chi2_scores = pd.DataFrame(list(zip(X.columns, chi2_selector.scores_)), columns=['feature', 'score'])
chi2_scores = chi2_scores.round(2)
chi2_scores.sort_values('score', ascending=False)



Unnamed: 0,feature,score
13,bilirubin,48.12
14,sgot,20.14
0,age,12.71
16,histology,10.23
10,spiders,7.29
4,fatigue,5.96
5,malaise,5.13
11,ascites,4.6
15,albumin,3.28
1,sex,2.97


In [16]:
chi2_support = chi2_selector.get_support()
chi2_feature = X.loc[:,chi2_support].columns.tolist()
print(chi2_feature)

['age', 'sex', 'seroid', 'antviral', 'fatigue', 'malaise', 'anorexia', 'liver_big', 'liver_firm', 'spleen_palpable', 'spiders', 'ascites', 'varices', 'bilirubin', 'sgot', 'albumin', 'histology']


In [17]:
# X_norm_striped = MinMaxScaler().fit_transform(X[chi2_feature])
df = pd.DataFrame(data=X[chi2_feature], columns=chi2_feature)
df.head()

Unnamed: 0,age,sex,seroid,antviral,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,sgot,albumin,histology
0,30,1,0,1,1,1,1,0,1,1,1,1,1,0.0,18.0,4.0,0
1,50,0,0,1,0,1,1,0,1,1,1,1,1,0.9,42.0,3.5,0
2,78,0,1,1,0,1,1,1,1,1,1,1,1,0.7,32.0,4.0,0
3,34,0,1,1,1,1,1,1,1,1,1,1,1,0.0,200.0,4.0,0
4,34,0,1,1,1,1,1,1,1,1,1,1,1,0.9,28.0,4.0,0
