# 빅데이터처리
## Week10_Quiz
### 2016003709 윤가영

In [1]:
# To find out where the pyspark
import findspark
findspark.init()

In [2]:
# Creating Spark Context
from pyspark import SparkContext
sc = SparkContext("local", "first app")

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import math
import time
import copy

#### KFold 모듈 호출

In [4]:
from sklearn.model_selection import KFold

In [5]:
import csv

#### cardiovascular.txt를 불러, 불필요한 데이터 정제 및 데이터 정규화를 하는 함수

In [6]:
def create_data():
    data = sc.textFile('./W10/cardiovascular.txt')\
                  .map(lambda line: str(line).split(';'))\
                  .map(lambda line: line[1:]).collect()[1:] 
    data = np.array(sc.parallelize(data).map(lambda line: convertor(line)).collect())
    
    sample = data[:,:-1]
    label = data[:,-1].reshape(data.shape[0],1)

    for i in range(len(sample)):
        sample[i] = sample[i] / float(sample[i].sum())
    labeled_data = np.hstack((sample, label))
    
    return labeled_data

#### present를 1, absent를 0으로 바꾸고, 각 string을 float로 변환하는 함수

In [7]:
def convertor(line):
    if line[4] == 'Present':
        line[4] = 1
    else:
        line[4] = 0
    
    size = len(line)
    for i in range(size):
        line[i] = float(line[i])
        
    return line

#### LogisticRegression 클래스 생성

In [19]:
class LogisticRegression:
    """
    self.data    = 입력데이터
    self.B       = 최적화된 가중치값
    self.predict = 예측값
    self.score   = 정확도
    """
    
    def set_data(self, data):
        """
        맵함수의 입력데이터를 정의하는 함수
        """
        input_data = sc.parallelize(data).map(lambda line: (np.array(line[:-1]), line[-1])).collect()
        self.data = input_data
    
    
    def calculate_likelihood(self, B):
        """
        오차 계산을 위한 likelihood 식을 계산해주는 함수
        """
        sum_ = 0
        for node in self.data:
            sum_ += (node[1] * np.log(math.exp(np.dot(B, node[0])))) + \
                    ((1 - node[1]) * np.log(math.exp(1-np.dot(B, node[0]))))
        return sum_
    
    
    def fit(self, data, B, maxIter=120, convergenceTol=0.001, learning_rate=0.01):
        """
        최적화된 가중치값 찾는 함수
        """
        self.set_data(data)
        
        prev = self.calculate_likelihood(B)
        cnt = 0
        flag = 0
        
        for _ in range(maxIter):
            gradient = sc.parallelize(self.data)\
                         .map(lambda p: (p[1] - 1/(1 + math.exp(-np.dot(B, p[0])))) * p[0]).sum()
            B = B + (learning_rate * gradient)

            err = self.calculate_likelihood(B)        
            if(abs(prev - err) < convergenceTol):
                self.B = B
                break
            elif(abs(prev - err) < 0):
                cnt += 1
            
            # 일정수준 오차가 좁혀졌는데, 그 이상으로 좁혀지지 않을때 조금씩 이동하도록 수정
            if(cnt>10 and flag==0): 
                flag = 1
                learning_rate = 0.001
            
            prev = err
        
        self.B = B
        
    
    def predict(self, X):
        """
        예측값 구하는 함수
        """
        result = []
        
        for line in X:
            predict = round(math.exp(np.dot(self.B, line)))
            result.append(predict)
        
        self.predict = result
        
        return self.predict
    
    
    def score(self, answer):
        """
        정답률 계산하는 함수
        """
        cnt = 0
        for i in range(len(answer)):
            if(answer[i] == self.predict[i]):
                cnt += 1
        
        self.score = cnt/len(answer) * 100
        
        return self.score

#### Quiz10 답안

In [17]:
def solution():
    X = create_data()
    kf = KFold(n_splits=10)
    
    score_sum = 0
    i = 1

    for train_index, test_index in kf.split(X):
        train, test = X[train_index], X[test_index]
        B = np.random.permutation(9)

        X_test = test[:,:-1]
        answer = test[:,-1]

        lr = LogisticRegression()
        lr.fit(train, B)
            
        predict = lr.predict(X_test)
        print(lr.predict)
        print(answer)
        score = lr.score(answer)
        print("round {}: {}".format(i,score))

        score_sum += score
        i += 1

    print("====================================================")
    
    score_average = score_sum / 10
    print(score_average)
    
    return score_average

In [21]:
score_average = solution()

[1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1]
[1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0.
 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1.]
round 1: 65.95744680851064
[1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1]
[1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1.]
round 2: 68.08510638297872
[0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0.
 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0.]
round 3: 47.82608695652174
[1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 

In [22]:
score_average

59.7086031452359