Author: Sridhar Nerur
This notebook demonstrates K-Mean clustering on the company dataset that
was used to show MDS and hierarchical clustering.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd #to read Excel files
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string #to remove punctuation and digits
from glob import glob
from sklearn.cluster import KMeans

stopwords = list(ENGLISH_STOP_WORDS)
my_stopwords = ["include","finally", "significant"]
stopwords.extend(my_stopwords)
p = string.punctuation
d = string.digits
combined = p + d

docs = [] #will contain text for all the companies
company_names = [] #will contain our labels

#simple function to parse our text
def parse(txt):
    txt = txt.lower()
    #remove punctuation and digits
    table = str.maketrans(combined, len(combined) * " ")
    txt = txt.translate(table)
    #remove stopwords
    words = txt.split()
    cleaned_words = [w for w in words if w not in stopwords]
    cleaned_text = " ".join(cleaned_words)
    return cleaned_text

files = glob("/Users/snerur/Dropbox/patentsResearch/Patents_xls/*xlsx")


In [18]:
len(ENGLISH_STOP_WORDS)

318

In [3]:
len(files)

14

In [4]:
temp = ["apple","peaches","sugar"]
" ".join(temp)

'apple peaches sugar'

In [11]:
for f in files:
    df = pd.read_excel(f)
    df['AB'].dropna(inplace = True)
    abstracts = list(df['AB'])
    combined_abstracts = " ".join(abstracts)
    cleaned_abstracts = parse(combined_abstracts)
    #add this to the docs list
    docs.append(cleaned_abstracts)
    #get company name and add it to company_names list
    name = f.split("/")[-1][:-5]
    company_names.append(name)


In [14]:
company_names

['Huawei',
 'Google',
 'Pantech',
 'Nokia',
 'InterDigital',
 'Sony Ericsson',
 'ZTE',
 'HTC',
 'Oracle',
 'LG',
 'Kodak',
 'Samsung',
 'Apple']

In [15]:
vectorizer = TfidfVectorizer()
sparse_matrix = vectorizer.fit_transform(docs)
#sparse_matrix.todense().shape

In [16]:
sparse_matrix.shape

(13, 18935)

In [12]:
km = KMeans(n_clusters=4, random_state=999)
km.fit(sparse_matrix) #computes k-means clustering
cluster_membership = km.predict(sparse_matrix) #predicts closest cluster
company_distance_to_center = km.transform(sparse_clmatrix) #cluster distance

In [13]:
cluster_membership #assignment of companies to clusters

array([3, 3, 2, 1, 1, 0, 0, 1, 3, 1, 0, 1, 1], dtype=int32)

In [14]:
company_distance_to_center #how far is each company from the centroids

array([[0.78752632, 0.63044353, 0.97648638, 0.33855078],
       [0.82223364, 0.54719083, 0.99148941, 0.32440141],
       [0.9908749 , 0.92601728, 0.        , 0.95704288],
       [0.97458571, 0.56376899, 1.13266974, 0.77125959],
       [1.03481205, 0.6872545 , 1.16760799, 0.8884092 ],
       [0.55770449, 0.98848745, 1.14515587, 0.95817478],
       [0.38677668, 0.82089792, 1.07087921, 0.8578182 ],
       [0.85499133, 0.38924021, 0.99460062, 0.53952426],
       [0.94960969, 0.77395901, 1.09794839, 0.41763793],
       [0.97515844, 0.6095006 , 1.12863469, 0.90970352],
       [0.32870605, 0.83301597, 1.02762498, 0.84763039],
       [0.86202969, 0.58170157, 0.90402829, 0.68879928],
       [1.0928388 , 0.66032027, 1.2272073 , 0.95581338]])

In [16]:
clusters = zip(cluster_membership, company_names)
print("{0:<15s}{1:<9s}".format("Company_Name","Cluster#"))
for cluster_number, company_name in clusters:
    print("{0:<15s}{1:2d}".format(company_name,cluster_number))

Company_Name   Cluster# 
Apple           3
Google          3
HTC             2
Huawei          1
InterDigital    1
Kodak           0
LG              0
Nokia           1
Oracle          3
Pantech         1
Samsung         0
Sony Ericsson   1
ZTE             1


In [17]:
companies = {'Company': company_names, 'Cluster#': cluster_membership,\
            'Centroid_Dist0':company_distance_to_center[0:,0],\
            'Centroid_Dist1':company_distance_to_center[0:,1],\
            'Centroid_Dist2':company_distance_to_center[0:,2],\
            'Centroid_Dist3':company_distance_to_center[0:,3]
            }
#let us put this in a dataframe
import pandas as pd
df = pd.DataFrame(companies)
df

Unnamed: 0,Company,Cluster#,Centroid_Dist0,Centroid_Dist1,Centroid_Dist2,Centroid_Dist3
0,Apple,3,0.787526,0.630444,0.976486,0.338551
1,Google,3,0.822234,0.547191,0.991489,0.324401
2,HTC,2,0.990875,0.926017,0.0,0.957043
3,Huawei,1,0.974586,0.563769,1.13267,0.77126
4,InterDigital,1,1.034812,0.687254,1.167608,0.888409
5,Kodak,0,0.557704,0.988487,1.145156,0.958175
6,LG,0,0.386777,0.820898,1.070879,0.857818
7,Nokia,1,0.854991,0.38924,0.994601,0.539524
8,Oracle,3,0.94961,0.773959,1.097948,0.417638
9,Pantech,1,0.975158,0.609501,1.128635,0.909704
