# A Jupyter Notebook Zika Dataset Analysis
<img src="zika-logo.png" alt="Drawing" style="width: 450px;"/>

In [54]:
from pyspark.mllib.clustering import KMeans
import os, shutil, csv, io
import pandas as pd
import numpy as np
from math import sqrt

In [55]:
def list2csv(arr_input):
    '''Serialize a Python list as one CSV-formatted line.

    The values are written through ``csv.writer`` so that fields containing
    commas or quotes are quoted/escaped according to the CSV dialect.

    Args:
        arr_input: iterable of field values forming a single row.

    Returns:
        The CSV text of the row with surrounding whitespace stripped
        (this removes the line terminator the writer appends).
    '''
    output = io.StringIO()
    csv.writer(output).writerow(arr_input)
    # The StringIO accessor is ``getvalue`` (lowercase); ``getValue`` does
    # not exist and raised AttributeError on every call.
    return output.getvalue().strip()

def csv2rdd(csv):
    '''Turn a local collection of CSV records into an RDD.

    The argument is handed unchanged to ``sc.parallelize``. ``sc`` is a
    global that is not defined in this section (presumably the notebook's
    SparkContext - confirm in the environment that runs this notebook).
    '''
    distributed = sc.parallelize(csv)
    return distributed

def rdd2numpy_arr(rdd_input):
    '''Collect an RDD and return its elements as a numpy array.

    Args:
        rdd_input: object exposing ``collect()`` (e.g. a Spark RDD); all of
            its elements are pulled back to the caller before conversion.

    Returns:
        ``numpy.ndarray`` built from the collected elements.
    '''
    # Use the parameter itself: the previous body read an undefined name
    # ``rdd`` and failed with NameError.
    return np.asarray(rdd_input.collect())

def save_result(dir_path, rdd_input):
    '''Write an RDD to ``dir_path`` as text, replacing any previous output.

    If ``dir_path`` already exists it is removed with ``shutil.rmtree``
    first, then ``rdd_input.saveAsTextFile(dir_path)`` is called.
    '''
    stale_output_present = os.path.exists(dir_path)
    if stale_output_present:
        # Clear the earlier result so the save starts from a clean target.
        shutil.rmtree(dir_path)

    rdd_input.saveAsTextFile(dir_path)
    
def kmeans_error(point, clusters_rdd):
    '''Return the Euclidean distance from ``point`` to its assigned center.

    ``clusters_rdd`` must expose ``centers`` (indexable) and
    ``predict(point)`` (index of the chosen center), as the object
    produced by ``KMeans.train`` in this notebook is used here.
    '''
    all_centers = clusters_rdd.centers
    assigned_center = all_centers[clusters_rdd.predict(point)]
    offset = point - assigned_center
    squared_terms = [component**2 for component in offset]
    return sqrt(sum(squared_terms))

def show_rdd(x):
    '''Print ``x`` to standard output.

    Whatever is passed in is forwarded to ``print`` unchanged.
    '''
    print(x)

### Read and Show Zika Virus Dataset in Pandas

In [61]:
# Location of the Zika dataset used by the rest of the notebook.
input_path = 'dataset/lista_dados_sage.csv'
# The first line of this file is already a data record (all-numeric fields,
# as the raw lines read with Spark in this notebook show), so header=None
# stops pandas from consuming that record as the column names.
file_input_pd = pd.read_csv(input_path, delimiter=',', header=None)

### Read Zika Virus Dataset in Spark

In [62]:
# Load the dataset as an RDD of raw text lines. `sc` is not defined in this
# section (presumably the notebook's SparkContext - confirm).
file_input_rdd = sc.textFile(input_path)
# Peek at the first five lines; as the last expression of the cell, the
# notebook displays the returned list.
file_input_rdd.take(5)

['520010,6,2016,1,0,1,0,0,0,0,0',
 '110001,41,2016,1,1,0,0,0,0,0,0',
 '110001,42,2016,1,1,0,0,0,0,0,0',
 '110001,43,2016,1,1,0,0,0,0,0,0',
 '110001,44,2016,2,2,0,0,0,0,0,0']

In [63]:
# Parse each comma-separated text line into a numpy array of ints and cache
# the resulting RDD so later cells can reuse it.
parsed_data_rdd = (
    file_input_rdd
    .map(lambda row: np.array(list(map(int, row.split(',')))))
    .cache()
)

## Applying KMeans Algorithm

In [64]:
# Fit KMeans on the parsed records: 100 clusters, at most 100 iterations,
# initial centers chosen at random.
clusters_rdd = KMeans.train(
    parsed_data_rdd,
    100,
    maxIterations=100,
    initializationMode='random',
)

In [67]:
#WSSSE = (parsed_data_rdd.map(lambda point: kmeans_error(point, clusters_rdd)).reduce(lambda x, y: x + y))
#print('Within Set Sum of Squared Error = ' + str(WSSSE))