In [None]:
import clustering_code
from collections import defaultdict
from pprint import pprint

input_file = "survey_responses.txt"

The survey has the following fields:

* Timestamp	
* What's your family or last name?	
* Distance from where you were born to UMT.	
* Distance from where you lived at 15 to UMT.	
* How many years have you been in post-secondary school?	
* Were you an undergraduate marketing major? 	
* Were you an undergraduate business major?	
* How many people live in your house/apartment (including you)? 

We'll work with this data to do a little bit of clustering. The code is based on Joel Grus's code from _Data Science from Scratch_, with a few modifications.

In [None]:
# Read the tab-separated survey export into a dict that maps each student's
# last name to the list of their remaining answers (still strings here).
student_data = defaultdict(list)
with open(input_file,'r') as ifile :
    next(ifile, None)  # skip the header row; an empty file is tolerated
    for line in ifile :  # stream the file rather than loading every line
        if not line.strip() :
            continue  # a blank line (e.g. at the end of the file) has no fields
        row = line.strip().split("\t")
        this_student = row[1]  # per the field list, column 1 is the last name
        # Keep columns 2 onward; column 0 (the timestamp) is dropped.
        # NOTE: two students with the same last name overwrite each other.
        student_data[this_student] = row[2:]


In [None]:
# Show the parsed records: last name -> list of raw answer strings.
student_data

We need numerical data for clustering, so we'll convert the Yes/No responses to 1/0.

In [None]:
# Recode "No" as 0 and "Yes" as 1, then turn every answer into a float so
# each student's record is purely numerical.
yes_no_codes = {"No": 0, "Yes": 1}
for student in student_data :
    # Answers that are not Yes/No pass through unchanged to float().
    student_data[student] = [
        float(yes_no_codes.get(answer, answer))
        for answer in student_data[student]
    ]
            

In [None]:
# Let's take a look at the data. Once the conversion cell above has run,
# every student's list holds floats instead of the raw strings.
pprint(student_data)

In [None]:
# Let's make a function that prints the means in a nice way.

def pprint_means(the_means) :
    """Print each cluster's mean vector, one labelled variable per line.

    the_means is an iterable of cluster means. Every mean is itself an
    iterable of numbers, listed in the same order as the labels below.
    Values are rounded to two decimals for display. Nothing is returned.
    """
    labels = ("Birth Dist", "Age 15 Dist", "Post-Secondary",
              "Mkt Major", "Biz Major", "HH Size")
    for cluster_number, center in enumerate(the_means) :
        print("--- Printing Cluster %s ---" % cluster_number)
        for position, value in enumerate(center) :
            # Index the labels (rather than zip) so a mean with more values
            # than labels still raises IndexError.
            print("%s: %s" % (labels[position], round(value, 2)))
        print("----------------------\n")


We'll pause here for a second to talk about the data. 

In [None]:
# Now, let's explore some clusters. Try different values of
# k and see what emerges.

k = 2

assignments, means = clustering_code.train_dict(student_data, k)

# Order the students by the cluster they were assigned to, lowest first.
by_cluster = sorted(assignments, key=assignments.get)
s_assign = ((name, assignments[name]) for name in by_cluster)
print("{}-means:".format(k))
for student, cluster in s_assign :
    print("{} : {}".format(cluster, student))


The clustering algorithm also returns the means of the clusters. How do we interpret these? 

In [None]:
# Print the mean of each survey variable, cluster by cluster.
pprint_means(means)

In [None]:
# Peek at a single record (the first one in the dict) to recall its layout.
for record in student_data.values() :
    print(record)
    break

In [None]:
# Here's a place where we'll do some work rescaling the data.

# Maybe start by getting the largest miles.
# Gather both distance columns (the first two fields of every record) into
# one flat list. The loop variable is deliberately not called `k`, so the
# cluster count `k` chosen in the clustering cell is not overwritten.
miles = []
for record in student_data.values() :
    miles.extend(record[:2])


In [None]:
# Inspect the records again, e.g. to check the values after rescaling.
student_data

Once you've rescaled the data, play around with some other clustering results. What emerges?