# MIDS W261 Mid Term
Student: Kuan Lin<br/>
Email: kuanlin@ischool.berkeley.edu<br/>
Section: Wed 4pm

# KL Divergence

In [1]:
%%writefile kltext.txt
1.Data Science is an interdisciplinary field about processes and systems to extract knowledge or insights from large volumes of data in various forms (data in various forms, data in various forms, data in various forms), either structured or unstructured,[1][2] which is a continuation of some of the data analysis fields such as statistics, data mining and predictive analytics, as well as Knowledge Discovery in Databases.
2.Machine learning is a subfield of computer science[1] that evolved from the study of pattern recognition and computational learning theory in artificial intelligence.[1] Machine learning explores the study and construction of algorithms that can learn from and make predictions on data.[2] Such algorithms operate by building a model from example inputs in order to make data-driven predictions or decisions,[3]:2 rather than following strictly static program instructions.

Writing kltext.txt


In [2]:
import numpy as np
np.log(3)

1.0986122886681098

In [38]:
%%writefile kldivergence.py
from mrjob.job import MRJob
import re
import numpy as np
class kldivergence(MRJob):
    def mapper1(self, _, line):
        index = int(line.split('.',1)[0])
        letter_list = re.sub(r"[^A-Za-z]+", '', line).lower()
        count = {}
        for l in letter_list:
            if count.has_key(l):
                count[l] += 1
            else:
                count[l] = 1
        for key in count:
            yield key, [index, count[key]*1.0/len(letter_list)]


    def reducer1(self, key, values):
        #Fill in your code
        # let python dictionary to help sort document
        # first key will be document1
        term_freqs = {}
        term_freqs[1] = 0.0
        term_freqs[2] = 0.0
        for docId, term_freq in values:
            term_freqs[docId] = term_freq
        # use None as key to force all term frequencies to be sent to one reducer
        # we don't need the actual termID (key) in the next calculation
        #print key + str(term_freqs)
        yield None, [term_freqs[docId] for docId in (1,2)]
    
    def reducer2(self, key, values):
        kl_sum = 0
        for value in values:
            #kl_sum = kl_sum + value
            p_i = value[0]
            q_i = value[1]
            kl_sum += p_i*np.log(p_i/q_i)
        yield None, kl_sum
            
    def steps(self):
        return [self.mr(mapper=self.mapper1,
                        reducer=self.reducer1),
                self.mr(reducer=self.reducer2)]

if __name__ == '__main__':
    kldivergence.run()

Overwriting kldivergence.py


In [39]:
%load_ext autoreload
%autoreload 2
from kldivergence import kldivergence
mr_job = kldivergence(args=['kltext.txt'])
with mr_job.make_runner() as runner: 
    runner.run()
    # stream_output: get access of the output 
    for line in runner.stream_output():
        print mr_job.parse_output_line(line)



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
(None, 0.08088278445318146)


### with smoothing

In [26]:
%%writefile kldivergence_smoothing.py
from mrjob.job import MRJob
import re
import numpy as np
class kldivergence(MRJob):
    def mapper1(self, _, line):
        index = int(line.split('.',1)[0])
        letter_list = re.sub(r"[^A-Za-z]+", '', line).lower()
        count = {}
        for l in letter_list:
            if count.has_key(l):
                count[l] += 1
            else:
                count[l] = 1
        for key in count:
            # send total words to reducer
            yield key, [index, count[key]*1.0, len(letter_list)*1.0]


    def reducer1(self, key, values):
        #Fill in your code
        # let python dictionary to help sort document
        # first key will be document1
        term_freqs = {}
        term_freqs[1] = 0.0
        term_freqs[2] = 0.0
        doc_length = {}
        doc_length[1] = 0.0
        doc_length[2] = 0.0
        for docId, term_freq, doc_len in values:
            term_freqs[docId] = term_freq
            doc_length[docId] = doc_len
        # use None as key to force all term frequencies to be sent to one reducer
        # we don't need the actual termID (key) in the next calculation
        yield None, [(term_freqs[docId]+1)/(24+doc_length[docId]) for docId in term_freqs] # apply smoothing
    
    def reducer2(self, key, values):
        kl_sum = 0
        for value in values:
            #kl_sum = kl_sum + value
            p_i = value[0]
            q_i = value[1]
            kl_sum += p_i*np.log(p_i/q_i)
        yield None, kl_sum
            
    def steps(self):
        return [self.mr(mapper=self.mapper1,
                        reducer=self.reducer1),
                self.mr(reducer=self.reducer2)]

if __name__ == '__main__':
    kldivergence.run()

Writing kldivergence_smoothing.py


In [27]:
%load_ext autoreload
%autoreload 2
from kldivergence_smoothing import kldivergence
mr_job = kldivergence(args=['kltext.txt'])
with mr_job.make_runner() as runner: 
    runner.run()
    # stream_output: get access of the output 
    for line in runner.stream_output():
        print mr_job.parse_output_line(line)



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
(None, 0.06726997279170038)


# Weighted K-Mean

In [40]:
%%writefile Kmeans.py
from numpy import argmin, array, random
from mrjob.job import MRJob
from mrjob.step import MRJobStep
from itertools import chain

#Calculate find the nearest centroid for data point 
def MinDist(datapoint, centroid_points):
    datapoint = array(datapoint)
    centroid_points = array(centroid_points)
    diff = datapoint - centroid_points 
    diffsq = diff**2
    
    distances = (diffsq.sum(axis = 1))**0.5
    # Get the nearest centroid for each instance
    min_idx = argmin(distances)
    return min_idx

#Check whether centroids converge
def stop_criterion(centroid_points_old, centroid_points_new,T):
    oldvalue = list(chain(*centroid_points_old))
    newvalue = list(chain(*centroid_points_new))
    Diff = [abs(x-y) for x, y in zip(oldvalue, newvalue)]
    Flag = True
    for i in Diff:
        if(i>T):
            Flag = False
            break
    return Flag


class MRKmeans(MRJob):
    centroid_points=[]
    k=3    
    def steps(self):
        return [
            MRJobStep(mapper_init = self.mapper_init, mapper=self.mapper,combiner = self.combiner,reducer=self.reducer)
               ]
    #load centroids info from file
    def mapper_init(self):
        self.centroid_points = [map(float,s.split('\n')[0].split(',')) for s in open("Centroids.txt").readlines()]
        open('Centroids.txt', 'w').close()
    #load data and output the nearest centroid index and data point 
    def mapper(self, _, line):
        D = (map(float,line.split(',')))
        idx = MinDist(D,self.centroid_points)
        # setup for weighted average:
        # weight(X)= 1/||X||
        weight = (D[0]**2 + D[1]**2)**0.5
        #yield int(idx), (D[0],D[1],1) # original unweighted
        yield int(idx), (weight*D[0],weight*D[1],weight) # with weight
    #Combine sum of data points locally
    def combiner(self, idx, inputdata):
        sumx = sumy = num = 0
        for x,y,n in inputdata:
            num = num + n
            sumx = sumx + x
            sumy = sumy + y
        yield int(idx),(sumx,sumy,num)
    #Aggregate sum for each cluster and then calculate the new centroids
    def reducer(self, idx, inputdata): 
        centroids = []
        num = [0]*self.k 
        distances = 0
        for i in range(self.k):
            centroids.append([0,0])
        for x, y, n in inputdata:
            num[idx] = num[idx] + n
            centroids[idx][0] = centroids[idx][0] + x
            centroids[idx][1] = centroids[idx][1] + y
        centroids[idx][0] = centroids[idx][0]/num[idx]
        centroids[idx][1] = centroids[idx][1]/num[idx]
        with open('Centroids.txt', 'a') as f:
            f.writelines(str(centroids[idx][0]) + ',' + str(centroids[idx][1]) + '\n')
        yield idx,(centroids[idx][0],centroids[idx][1])
        
if __name__ == '__main__':
    MRKmeans.run()

Writing Kmeans.py


## Driver code for weighted K-mean

In [41]:
%load_ext autoreload
%autoreload 2

from numpy import random, array
from Kmeans import MRKmeans, stop_criterion
mr_job = MRKmeans(args=['Kmeandata.csv', '--file', 'Centroids.txt']) # need to send Centroids.txt to each job

#Geneate initial centroids
centroid_points = [[0,0],[6,3],[3,6]]
k = 3
with open('Centroids.txt', 'w+') as f:
    f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points)

# Update centroids iteratively
for i in range(10):
    # save previous centoids to check convergency
    centroid_points_old = centroid_points[:]
    print "iteration"+str(i+1)+":"
    with mr_job.make_runner() as runner: 
        runner.run()
        # stream_output: get access of the output 
        for line in runner.stream_output():
            key,value =  mr_job.parse_output_line(line)
            print key, value
            centroid_points[key] = value
    with open('Centroids.txt', 'w+') as f:
        f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points) # record the new centroid value
    print "\n"
    i = i + 1
print "Centroids\n"
print centroid_points



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
iteration1:
0 [-3.9707251767434597, 0.24753995934048853]
1 [5.559358757604786, 0.13140683641026107]
2 [0.21319986473145544, 5.559691555704146]






iteration2:
0



 [-5.273661830097599, 0.01778068820189739]
1 [5.315666040265944, -0.0191245246454466]
2 [0.07760590556533625, 5.322298286870532]


iteration3:
0



 [-5.29872166540091, -0.006290282704146047]
1 [5.315666040265944, -0.0191245246454466]
2 [0.05740025819123362, 5.3015009631419545]


iteration4:
0



 [-5.29872166540091, -0.006290282704146047]
1 [5.315666040265944, -0.0191245246454466]
2 [0.05740025819123362, 5.3015009631419545]


iteration5:
0



 [-5.29872166540091, -0.006290282704146047]
1 [5.315666040265944, -0.0191245246454466]
2 [0.05740025819123362, 5.3015009631419545]


iteration6:
0



 [-5.29872166540091, -0.006290282704146047]
1 [5.315666040265944, -0.0191245246454466]
2 [0.05740025819123362, 5.3015009631419545]


iteration7:
0



 [-5.29872166540091, -0.006290282704146047]
1 [5.315666040265944, -0.0191245246454466]
2 [0.05740025819123362, 5.3015009631419545]


iteration8:
0



 [-5.29872166540091, -0.006290282704146047]
1 [5.315666040265944, -0.0191245246454466]
2 [0.05740025819123362, 5.3015009631419545]


iteration9:
0



 [-5.29872166540091, -0.006290282704146047]
1 [5.315666040265944, -0.0191245246454466]
2 [0.05740025819123362, 5.3015009631419545]


iteration10:
0 [-5.29872166540091, -0.006290282704146047]
1 [5.315666040265944, -0.0191245246454466]
2 [0.05740025819123362, 5.3015009631419545]


Centroids

[[-5.29872166540091, -0.006290282704146047], [5.315666040265944, -0.0191245246454466], [0.05740025819123362, 5.3015009631419545]]


## Calculate average weighted distance

In [43]:
from numpy import argmin, array, random

#Calculate find the nearest centroid for data point 
def MinDist(datapoint, centroid_points):
    datapoint = array(datapoint)
    centroid_points = array(centroid_points)
    diff = datapoint - centroid_points 
    diffsq = diff**2
    
    distances = (diffsq.sum(axis = 1))**0.5
    # Get the nearest centroid for each instance
    min_idx = argmin(distances)
    return min_idx

# load centroid points
centroid_points = [map(float,s.split('\n')[0].split(',')) for s in open("Centroids.txt").readlines()]

total_weights = 0.0
weighted_dist_sum = 0.0
for line in open('Kmeandata.csv'):
    line = line.strip()
    if line == '': continue
    x,y = map(float,line.split(','))
    assigned_x,assigned_y = centroid_points[MinDist([x,y], centroid_points)] # assigned centroid point
    weight = (x**2 + y**2)**0.5
    distance = ((x-assigned_x)**2 + (y-assigned_y)**2)**0.5
    weighted_dist_sum += distance*weight
    total_weights += weight
    
print "Average weighted distance: %s"% (weighted_dist_sum/total_weights)

Average weighted distance: 1.59505207921
