## Mengimport data menggunakan pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the train and test splits. Forward slashes are used so the literals
# contain no invalid backslash escapes (such as "\D" or "\C") and resolve on
# Windows as well as POSIX systems.
data_train = pd.read_csv('./DatasetEskperimen/CensusIncome/CencusIncome.data.txt')
data_test = pd.read_csv('./DatasetEskperimen/CensusIncome/CencusIncome.test.txt')

In [3]:
# Stack both splits into one frame; the outer index level ('train' / 'test')
# records which split each row came from.
splits = [data_train, data_test]
split_names = ['train', 'test']
data = pd.concat(splits, keys=split_names)

## Mengubah kolom class menjadi numerik

In [4]:
def changeClassToNumber(x):
    """Encode an income-class label as an integer.

    Parameters
    ----------
    x : object
        Raw label. Strings are compared after removing surrounding whitespace
        and a trailing period, so ``'<=50K'``, ``' <=50K'`` and ``'<=50K.'``
        are all treated as the same label.

    Returns
    -------
    int
        ``0`` for ``'<=50K'``, ``1`` for ``'>50K'`` and ``-1`` for anything
        else, including non-string values such as NaN or None.
    """
    if not isinstance(x, str):
        return -1
    # NOTE(review): the UCI test split is known to write labels with a
    # trailing period; presumably relevant for this file too - verify.
    label = x.strip().rstrip('.').strip()
    if label == '<=50K':
        return 0
    if label == '>50K':
        return 1
    return -1

In [5]:
# Replace the textual income labels by the integer codes from changeClassToNumber.
data['class'] = data['class'].map(changeClassToNumber)

## Menghandle missing value

In [6]:
print("Banyaknya data:", len(data))
print("--------------------------------------------------------------------")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
data.info()

Banyaknya data: 48842
--------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 48842 entries, (train, 0) to (test, 16280)
Data columns (total 15 columns):
age               48842 non-null int64
workclass         46043 non-null object
fnlwgt            48842 non-null int64
education         48842 non-null object
education-num     48842 non-null int64
marital-status    48842 non-null object
occupation        46033 non-null object
relationship      48842 non-null object
race              48842 non-null object
sex               48842 non-null object
capital-gain      48842 non-null int64
capital-loss      48842 non-null int64
hours-per-week    48842 non-null int64
native-country    47985 non-null object
class             48842 non-null int64
dtypes: int64(7), object(8)
memory usage: 6.0+ MB
None


Karena semua kolom yang memiliki missing value bertipe object (kategorikal), missing value akan digantikan dengan nilai modus dari kolom tersebut.

In [7]:
# Fill the missing entries of each categorical column with that column's most
# frequent value. The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selected from the frame is a chained
# in-place operation, which newer pandas versions warn about and which does
# not update the frame under Copy-on-Write.
for column in ("workclass", "occupation", "native-country"):
    data[column] = data[column].fillna(data[column].mode()[0])

In [8]:

# Print the column summary again to check the non-null counts after the fill step.
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 48842 entries, (train, 0) to (test, 16280)
Data columns (total 15 columns):
age               48842 non-null int64
workclass         48842 non-null object
fnlwgt            48842 non-null int64
education         48842 non-null object
education-num     48842 non-null int64
marital-status    48842 non-null object
occupation        48842 non-null object
relationship      48842 non-null object
race              48842 non-null object
sex               48842 non-null object
capital-gain      48842 non-null int64
capital-loss      48842 non-null int64
hours-per-week    48842 non-null int64
native-country    48842 non-null object
class             48842 non-null int64
dtypes: int64(7), object(8)
memory usage: 6.0+ MB


In [9]:
def one_hot_encode(df, label):
    """Replace one categorical column by its one-hot (dummy) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``label``. It is not modified.
    label : column label
        Column to encode; also used as the prefix of the generated columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``label`` and with the indicator columns joined
        on the right.
    """
    onehot = pd.get_dummies(df[label], prefix=label)
    # Non-inplace drop: the caller's frame must not lose the column as a side effect.
    return df.drop(label, axis=1).join(onehot)

In [10]:
def changeSexToNumber(x):
    """Encode a sex label as a number.

    Returns 0 for 'Female', 1 for 'Male' and NaN for any other value.
    """
    if x == 'Male':
        return 1
    if x == 'Female':
        return 0
    return np.nan

#data.drop('race',axis=1, inplace=True)
#data.drop('native-country',axis=1, inplace=True)

In [11]:
# One-hot encode each categorical column in turn. "class" is taken off before
# encoding and joined back afterwards, which keeps it as the last column.
for column in ("workclass", "education", "marital-status", "occupation",
               "relationship", "race", "native-country"):
    data = one_hot_encode(data.drop("class", axis=1), column).join(data["class"])
# "sex" is converted to numeric codes by changeSexToNumber instead of dummies.
data['sex'] = data['sex'].map(changeSexToNumber)

In [13]:
# Preview the encoded frame (head() shows five rows by default).
data.head()

Unnamed: 0,Unnamed: 1,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,class
train,0,39,77516,13,1,2174,0,40,0,0,0,...,0,0,0,0,0,0,1,0,0,0
train,1,50,83311,13,1,0,0,13,0,0,0,...,0,0,0,0,0,0,1,0,0,0
train,2,38,215646,9,1,0,0,40,0,0,0,...,0,0,0,0,0,0,1,0,0,0
train,3,53,234721,7,1,0,0,40,0,0,0,...,0,0,0,0,0,0,1,0,0,0
train,4,28,338409,13,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# Every column except the target "class" is used as a model input.
feature = data.columns[data.columns != "class"]

In [None]:
# Display the feature column labels.
feature

In [None]:
# Scaler used to standardise the feature columns.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training rows only, so that the test rows do not
# influence the mean and variance used for standardisation, then apply that
# same transform to every row.
scaler.fit(data.loc['train'][feature])
data[feature] = scaler.transform(data[feature])

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Two hidden layers of 52 units with logistic activation. random_state fixes
# the weight initialisation and shuffling so that re-running the notebook
# reproduces the same model and accuracy.
dt = MLPClassifier(activation='logistic', hidden_layer_sizes=(52,52), random_state=42)

In [None]:
# Train on the 'train' split: all columns but "class" as inputs, "class" as target.
train_rows = data.loc['train']
dt.fit(train_rows.drop('class', axis=1).values, train_rows['class'])

In [None]:
# Predict on the 'test' split. The inputs are passed as a plain array, the
# same way the model was fitted, to avoid a feature-name mismatch between
# fit and predict.
y_hat = dt.predict(data.loc['test'].drop('class',axis=1).values)

In [None]:
# Fraction of test rows whose predicted class equals the true class.
accuracy_score(y_true=data.loc["test"]['class'].values, y_pred=y_hat)

In [None]:
# Write the processed training split to disk without the index column.
data.loc['train'].to_csv('train.csv', index=False)

In [None]:
# Write the processed test split to disk without the index column.
data.loc['test'].to_csv('test.csv', index=False)

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
%matplotlib inline

In [None]:
# Feature matrix: every column of `data` except the target "class".
X = data.loc[:, data.columns != "class"].values

In [None]:
# Standardise the features and fit a PCA that keeps every component.
X = scale(X)
pca = PCA()
pca.fit(X)
# Per-component share of the variance, and its running total in percent
# (each share is rounded to 4 decimals before being accumulated).
var = pca.explained_variance_ratio_
rounded_percent = np.round(var, decimals=4) * 100
var1 = np.cumsum(rounded_percent)

In [None]:
# Plot the cumulative explained variance against the component index and save it.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_xlabel('Principal Component')
ax.set_ylabel('Cumulative Proportion of Variance Explained')
ax.plot(var1, "o--")
fig.savefig('PCA.png')

In [None]:
# var1 is zero-indexed: entry k is the cumulative variance of the first k + 1
# components, so the total explained by the 95 components kept below is at
# index 94 (index 95 would already include a 96th component).
var1[94]

Dari gambar di atas, hanya 95 Principal Component pertama yang akan diambil

In [None]:
# Number of principal components to keep.
n = 95
# Rebuild the feature matrix from `data` (every column except "class").
X = data.loc[:, data.columns != "class"].values

In [None]:
# NOTE(review): X is built from all rows of `data` (train and test), so the
# scaling and the PCA basis are fitted on both splits together - confirm
# this is intended.
pca = PCA(n_components=n)
X = scale(X)
pca.fit(X)

In [None]:
# Project the scaled features onto the fitted components. X is overwritten,
# so re-running this cell alone would feed already-projected data back into
# the PCA.
X = pca.transform(X)

In [None]:
# Show the dimensions of the projected matrix.
X.shape

In [None]:
# Wrap the projected array in a DataFrame (default integer row and column labels).
data_dummy = pd.DataFrame(X)

In [None]:
# Display the target column of the combined frame.
data["class"]

In [None]:
# Rebuild the train/test frames from the PCA output. The split point is taken
# from the number of rows in the 'train' split instead of a hard-coded count.
n_train = len(data.loc['train'])
# The first n_train rows are already labelled 0..n_train-1, which matches the
# index of the 'train' target column used in the join.
data_train = data_dummy[:n_train].join(data.loc['train']['class'])
# The remaining rows are renumbered from 0 so they align with the 'test'
# target column; drop=True discards the old labels instead of adding an
# 'index' column that would have to be removed afterwards.
data_test = data_dummy[n_train:].reset_index(drop=True).join(data.loc['test']['class'])

In [None]:
# Number of rows in the rebuilt test frame.
len(data_test)

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Two hidden layers of 48 units with logistic activation. random_state fixes
# the weight initialisation and shuffling so that re-running the notebook
# reproduces the same model and accuracy.
nb = MLPClassifier(hidden_layer_sizes=(48,48),activation='logistic', random_state=42)

In [None]:
# Train on the PCA-reduced training frame: all columns but "class" as inputs.
train_inputs = data_train.drop("class", axis=1).values
train_labels = data_train["class"].values
nb.fit(train_inputs, train_labels)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Predict the class of every row in the PCA-reduced test frame.
test_inputs = data_test.drop("class", axis=1).values
y_hat = nb.predict(test_inputs)


In [None]:
# Accuracy of the PCA-based model against the true test labels.
accuracy_score(y_true=data.loc["test"]['class'].values, y_pred=y_hat)