In [49]:
import csv
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import preprocessing

In [50]:
df = pd.read_csv("cereals.csv")

In [51]:
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups
0,100%_Bran,N,C,70,4,1,130,10,5,6,280,25,3,1,0.33
1,100%_Natural_Bran,Q,C,120,3,5,15,2,8,8,135,0,3,1,1.0
2,All-Bran,K,C,70,4,1,260,9,7,5,320,25,3,1,0.33
3,All-Bran_with_Extra_Fiber,K,C,50,4,0,140,14,8,0,330,25,3,1,0.5
4,Almond_Delight,R,C,110,2,2,200,1,14,8,-1,25,3,1,0.75


Looking at this, it's clear that we will have to do something about "mfr" and "type" colummns because they are not numbers. To get some ennumeration, I'm looking at the value counts of them to see if 1) they are important and 2) to get a use-weighted integer to assign to the contents.

## Preprocessing

In [53]:
df["mfr"].value_counts()

K    23
G    22
P     9
R     8
Q     8
N     6
A     1
dtype: int64

In [54]:
df["type"].value_counts()

C    74
H     3
dtype: int64

The hot/cold dichotomy seems to be insignificant enough to just drop. The "mfr" values are renumbered by prevelance, which might help to preserve some "distance" sense.

In [56]:
mfr_numbs = {
    "K":1,
    "G":2,
    "P":3,
    "R":4,
    "Q":5,
    "N":6,
    "A":7
}

In [57]:
df3 = df
df3["mfr"] = df["mfr"].apply(lambda val: mfr_numbs[val]) # applying the ennumeration scheme
df3 = df3.drop("type", axis=1)
df3 = df3.drop("name", axis=1)

In [58]:
df3.head() # these are all numbers now

Unnamed: 0,mfr,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups
0,6,70,4,1,130,10,5,6,280,25,3,1,0.33
1,5,120,3,5,15,2,8,8,135,0,3,1,1.0
2,1,70,4,1,260,9,7,5,320,25,3,1,0.33
3,1,50,4,0,140,14,8,0,330,25,3,1,0.5
4,4,110,2,2,200,1,14,8,-1,25,3,1,0.75


In [60]:
v4 = preprocessing.scale(df3) # converts dataframe to numpy array and does scaling

## Cluster Analysis

In [73]:
groups = pd.DataFrame(index=df.index)
groups["name"] = df["name"]
for i in range(2,10):
    kmeans = KMeans(i)
    kmeans.fit(v4)
    clusters = kmeans.predict(v4)
    col_name = "g"+(str(i).rjust(2)).replace(" ", "0")
    groups[col_name] = clusters

The above code actually applies the k-mean method to the data. But we have used a wide range of number of groups, from 2 to 10. Those were all stored into a dataframe so that we can later try to visualize and understand.

In [74]:
groups.head()

Unnamed: 0,name,g02,g03,g04,g05,g06,g07,g08,g09
0,100%_Bran,1,1,2,3,3,2,3,6
1,100%_Natural_Bran,1,1,3,2,1,4,2,0
2,All-Bran,1,1,2,3,3,2,3,6
3,All-Bran_with_Extra_Fiber,1,1,2,3,3,2,3,6
4,Almond_Delight,0,0,1,4,0,0,6,1


In [101]:
big_list = []
for i in range(2,10):
    col_val = groups.columns[i-1]
    lex_list = [[] for k in range(i)]
    for j in range(len(groups)):
        idx = groups[col_val][j]
#        print(idx)
        lex_list[idx].append(groups["name"][j])
    big_list.append(lex_list)

In [137]:
def print_big_list(lex_list):
    '''Organizes the groups from one trial (set number of groups) into a column-based printout'''
    Ncols = min(4, len(lex_list))
    sections = len(lex_list) // Ncols
    for k in range(sections):
        upper_lim = min((k+1)*4, len(lex_list))
        red_list = lex_list[k*4:upper_lim]
        MaxI = max([len(val) for val in lex_list])
        for j in range(Ncols):
            print((" Category "+str(j+k*4)).ljust(28), end="")
        print("")
        for i in range(MaxI):
            for j in range(Ncols):
                if i < len(red_list[j]):
                    print(red_list[j][i][:25].ljust(25)+"   ", end="")
                else:
                    print(" "*28, end="")
            print("")
        print("")
    

In [141]:
print_big_list(big_list[3])

 Category 0                  Category 1                  Category 2                  Category 3                 
Cheerios                    Bran_Chex                   100%_Natural_Bran           100%_Bran                   
Corn_Chex                   Cream_of_Wheat_(Quick)      Basic_4                     All-Bran                    
Corn_Flakes                 Double_Chex                 Clusters                    All-Bran_with_Extra_Fiber   
Crispix                     Frosted_Mini-Wheats         Cracklin'_Oat_Bran          Bran_Flakes                 
Just_Right_Crunchy__Nugge   Grape_Nuts_Flakes           Fruit_&_Fibre_Dates,_Waln                               
Kix                         Grape-Nuts                  Fruitful_Bran                                           
Nutri-grain_Wheat           Life                        Great_Grains_Pecan                                      
Product_19                  Maypo                       Just_Right_Fruit_&_Nut                  

We have 4 columns here.
 - Category 0 seems to be generic flakes and Os
 - Category 1 is clearly sugar cereals
 - Category 2 is the "naturals" category
 - Category 3 is exclusively fiber stuff

In [142]:
print_big_list(big_list[6])

 Category 0                  Category 1                  Category 2                  Category 3                 
Cheerios                    Cream_of_Wheat_(Quick)      100%_Natural_Bran           100%_Bran                   
Corn_Chex                   Maypo                       Bran_Chex                   All-Bran                    
Corn_Flakes                 Shredded_Wheat              Bran_Flakes                 All-Bran_with_Extra_Fiber   
Crispix                     Shredded_Wheat_'n'Bran      Clusters                                                
Double_Chex                 Shredded_Wheat_spoon_size   Cracklin'_Oat_Bran                                      
Kix                         Strawberry_Fruit_Wheats     Crispy_Wheat_&_Raisins                                  
Multi-Grain_Cheerios                                    Frosted_Mini-Wheats                                     
Nutri-grain_Wheat                                       Grape_Nuts_Flakes                       

Breaking it up into 8 categories, we see the emergence of new classes of cereals.
 - "Puff" cereals
 - Weight-loss cereals
 - Rains and dates
 - bran