In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column labels f0..f10000 are generated with a list comprehension and
# str.format, then handed to read_csv as the header names.
data1 = pd.read_csv(
    "../data/arcene_train.data",
    sep=' ',
    names=['f{}'.format(x) for x in range(10001)],
)
data1.shape

(100, 10001)

In [3]:
# Preview the first five rows; note that the last column (f10000) shows no
# values in this preview.
data1.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f9991,f9992,f9993,f9994,f9995,f9996,f9997,f9998,f9999,f10000
0,0,71,0,95,0,538,404,20,0,0,...,570,86,0,36,0,80,0,0,524,
1,0,41,82,165,60,554,379,0,71,0,...,605,69,7,473,0,57,0,284,423,
2,0,0,1,40,0,451,402,0,0,0,...,593,28,0,24,0,90,0,34,508,
3,0,56,44,275,14,511,470,0,0,0,...,600,0,26,86,0,102,0,0,469,
4,105,0,141,348,0,268,329,0,0,1,...,0,0,0,0,190,301,0,0,354,


In [18]:
# Read all four files: define helper functions so that the DataFrames from the
# .data files can be merged together and those from the .labels files likewise.

def read_data_file(filepath, n_features=10000):
    """Read a space-separated feature file into a DataFrame.

    The file is parsed with ``n_features + 1`` column names (``f0`` ..
    ``f<n_features>``) and the last of those columns is then dropped.
    NOTE(review): the extra column presumably comes from a trailing
    separator at the end of each row -- confirm against the raw files.

    Parameters
    ----------
    filepath : str or path-like
        Location of the ``.data`` file.
    n_features : int, default 10000
        Number of real feature columns per row. The default reproduces the
        original hard-coded layout (f0..f9999 kept, f10000 dropped).

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns ``f0`` .. ``f<n_features - 1>``.
    """
    column_names = ['f{}'.format(i) for i in range(n_features + 1)]
    data = pd.read_csv(filepath, sep=' ', names=column_names)
    # Drop the surplus last column; its name is derived from n_features so
    # the two can never drift apart.
    return data.drop(columns=column_names[-1])

# Load the training and validation feature tables with the shared reader.
data1, data2 = (
    read_data_file(path)
    for path in ('../data/arcene_train.data', '../data/arcene_valid.data')
)

# Stack both tables row-wise and renumber the index from 0.
merged_data = pd.concat([data1, data2], ignore_index=True)
merged_data.shape

(200, 10000)

In [19]:
def read_label_file(filepath):
    """Read a labels file into a DataFrame with a single column named 'class'.

    Every line of the file becomes one row; no header row is expected because
    the column name is supplied explicitly.
    """
    return pd.read_csv(filepath, names=['class'])

# Load the training and validation label tables with the shared reader.
label1, label2 = (
    read_label_file(path)
    for path in ('../data/arcene_train.labels', '../data/arcene_valid.labels')
)

# Stack both tables row-wise and renumber the index from 0.
merged_label = pd.concat([label1, label2], ignore_index=True)
merged_label.shape

(200, 1)

In [20]:
# Place the label column to the right of the feature columns; rows are
# matched on the index of the two frames.
final_data = pd.concat([merged_data, merged_label], axis='columns')

print(final_data.shape)
final_data.head()

(200, 10001)


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f9991,f9992,f9993,f9994,f9995,f9996,f9997,f9998,f9999,class
0,0,71,0,95,0,538,404,20,0,0,...,570,86,0,36,0,80,0,0,524,1
1,0,41,82,165,60,554,379,0,71,0,...,605,69,7,473,0,57,0,284,423,-1
2,0,0,1,40,0,451,402,0,0,0,...,593,28,0,24,0,90,0,34,508,1
3,0,56,44,275,14,511,470,0,0,0,...,600,0,26,86,0,102,0,0,469,1
4,105,0,141,348,0,268,329,0,0,1,...,0,0,0,0,190,301,0,0,354,-1


In [21]:
# Re-encode the class labels as 0 and 1. LabelEncoder assigns codes in sorted
# label order, so -1 becomes 0 and 1 stays 1.

from sklearn import preprocessing

le_class = preprocessing.LabelEncoder()
le_class.fit(final_data['class'])
final_data['class'] = le_class.transform(final_data['class'])

final_data.tail()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f9991,f9992,f9993,f9994,f9995,f9996,f9997,f9998,f9999,class
195,24,73,0,436,92,400,0,0,139,261,...,540,0,86,130,365,58,17,3,37,0
196,11,58,50,332,109,393,122,0,75,134,...,355,156,77,26,277,265,0,36,261,0
197,93,32,137,319,0,264,231,21,0,0,...,9,0,0,0,244,309,0,276,312,1
198,119,12,198,339,0,289,410,0,0,4,...,0,37,0,0,256,402,0,0,350,1
199,112,19,171,334,0,282,208,0,0,0,...,0,118,0,0,226,379,0,0,367,0


In [23]:
# Goal: predict the last column, `class` (the label),
# using every other column as a feature.

# Baseline model: logistic regression on the raw features.

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

features = ['f{}'.format(x) for x in range(0,10000)]

# 10-fold cross-validation with shuffling. The fixed random_state pins the
# shuffled fold assignment, so the accuracy printed below is reproducible
# across kernel restarts instead of changing on every run.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# One held-out accuracy per fold.
accrs = []

# fold_idx is the 1-based fold number, printed as a progress indicator.
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(final_data), start=1):
    print(f"{fold_idx}")
    train_d, test_d = final_data.iloc[train_idx], final_data.iloc[test_idx]

    train_y = train_d['class']
    train_x = train_d[features]

    test_y = test_d['class']
    test_x = test_d[features]

    # max_iter is raised to 300 (the default is 100).
    model = LogisticRegression(solver='lbfgs', max_iter=300)
    model.fit(train_x, train_y)

    # score() gives the accuracy on the held-out fold.
    mean_accr = model.score(test_x, test_y)
    accrs.append(mean_accr)

# Average accuracy over all folds.
print(np.average(accrs))

1
2
3
4
5
6
7
8
9
10
0.8999999999999998


In [24]:
# There are far too many features, so reduce the dimensionality with PCA
# (principal component analysis) before fitting the classifier. Note that PCA
# builds new components from the features rather than selecting a subset.

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

features = ['f{}'.format(x) for x in range(0,10000)]

# 10-fold cross-validation with shuffling. The fixed random_state pins the
# shuffled fold assignment, so the accuracy printed below is reproducible
# across kernel restarts instead of changing on every run.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# One held-out accuracy per fold.
accrs = []

# fold_idx is the 1-based fold number, printed as a progress indicator.
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(final_data), start=1):
    print(f"{fold_idx}")
    train_d, test_d = final_data.iloc[train_idx], final_data.iloc[test_idx]

    # random_state keeps the projection reproducible when PCA's default
    # 'auto' solver falls back to a randomized SVD.
    pca = PCA(n_components=100, random_state=42)

    train_y = train_d['class']
    # Fit PCA on the training fold's features and project them.
    train_x = pca.fit_transform(train_d[features])

    test_y = test_d['class']
    # Important: the test fold is only transformed with the PCA fitted on the
    # training fold -- never re-fitted -- so nothing from the test fold
    # influences the projection.
    test_x = pca.transform(test_d[features])

    # max_iter is raised to 300 (the default is 100).
    model = LogisticRegression(solver='lbfgs', max_iter=300)
    model.fit(train_x, train_y)

    # score() gives the accuracy on the held-out fold.
    mean_accr = model.score(test_x, test_y)
    accrs.append(mean_accr)

# Average accuracy over all folds.
print(np.average(accrs))

1
2
3
4
5
6
7
8
9
10
0.885


# The mean accuracy barely changes with PCA applied (0.885) compared with the raw features (about 0.90).