In [1]:
import collections
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def tokenizer(text):
    """Split ``text`` into stemmed tokens with English stop words removed.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Porter-stemmed tokens, in their original order, excluding every
        token that is an NLTK English stop word.
    """
    # Build the stop-word lookup once per call; the original re-read the
    # stop-word list for every single token and searched it linearly.
    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Compare case-insensitively: the stop-word list is lowercase, so a
    # capitalised stop word such as "The" would otherwise slip through.
    return [
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token.lower() not in english_stop_words
    ]

In [3]:
def cluster_sentences(sentences, nb_of_clusters=2, random_state=0):
    """Group sentences into clusters with k-means on their TF-IDF vectors.

    Parameters
    ----------
    sentences : iterable of str
        The documents to cluster; each one becomes a row of the TF-IDF matrix.
    nb_of_clusters : int, default 2
        Number of clusters requested from k-means.
    random_state : int or None, default 0
        Seed forwarded to ``KMeans`` so repeated runs give the same
        clustering. Pass ``None`` for a non-deterministic initialisation.

    Returns
    -------
    tuple
        ``(clusters, tfidf_matrix)`` where ``clusters`` is a dict mapping each
        cluster label (plain ``int``) to the list of indices into
        ``sentences`` assigned to it, and ``tfidf_matrix`` is the matrix
        produced by ``TfidfVectorizer.fit_transform``.
    """
    # Build the TF-IDF vectorizer; stop words filter out common words
    # (I, my, the, and...). `token_pattern=None` because the custom
    # tokenizer replaces the default regex pattern, which would otherwise
    # trigger a warning about the unused pattern.
    tfidf_vectorizer = TfidfVectorizer(
        tokenizer=tokenizer,
        token_pattern=None,
        stop_words=stopwords.words('english'),
        lowercase=True,
    )

    # Builds a tf-idf matrix for the sentences.
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

    # Fix the seed and the number of initialisations explicitly so the
    # clustering is reproducible and does not depend on library defaults.
    kmeans = KMeans(
        n_clusters=nb_of_clusters,
        n_init=10,
        random_state=random_state,
    )
    kmeans.fit(tfidf_matrix)

    clusters = collections.defaultdict(list)
    for sentence_index, label in enumerate(kmeans.labels_):
        # Cast the NumPy integer label to a plain int so the returned dict
        # has ordinary Python keys.
        clusters[int(label)].append(sentence_index)

    return (dict(clusters), tfidf_matrix)

In [4]:
# Example corpus for the clustering demo: one short document per entry.
sentences = [
    "Quantuum physics is quite important in science nowadays.",
    "Software engineering is hotter and hotter topic in the silicon valley",
    "Investing in stocks and trading with them are not that easy",
    "FOREX is the stock market for trading currencies",
    "Warren Buffet is famous for making good investments. He knows stock markets",
    "Supercomputers play an important role in the field of computational science",
    "Chemistry is the scientific discipline involved with compounds composed of atoms",
    "Organic solar cells set remarkable energy record",
    "countries around the world decided how they would protect ozone",
]

In [5]:
# Number of clusters to split the example sentences into.
nclusters = 3

# `clusters` maps each cluster label to sentence indices; `tfidf` is the
# TF-IDF matrix the clustering was computed on.
clusters, tfidf = cluster_sentences(sentences, nb_of_clusters=nclusters)

In [6]:
# Show the mapping from cluster label to the indices of its sentences.
clusters

{0: [1, 6, 7, 8], 1: [2, 3, 4], 2: [0, 5]}

In [7]:
# Print each cluster label followed by the sentences assigned to it.
for cluster in range(nclusters):
    print("CLUSTER ",cluster,":")
    # `clusters` is a plain dict that only has keys for labels that received
    # at least one sentence, so fall back to an empty list instead of
    # raising KeyError for a label with no members.
    for sentence_index in clusters.get(cluster, []):
        print("\t",sentences[sentence_index])

CLUSTER  0 :
	 Software engineering is hotter and hotter topic in the silicon valley
	 Chemistry is the scientific discipline involved with compounds composed of atoms
	 Organic solar cells set remarkable energy record
	 countries around the world decided how they would protect ozone
CLUSTER  1 :
	 Investing in stocks and trading with them are not that easy
	 FOREX is the stock market for trading currencies
	 Warren Buffet is famous for making good investments. He knows stock markets
CLUSTER  2 :
	 Quantuum physics is quite important in science nowadays.
	 Supercomputers play an important role in the field of computational science


In [8]:
# Show the TF-IDF matrix returned by cluster_sentences.
tfidf

<9x52 sparse matrix of type '<class 'numpy.float64'>'
	with 60 stored elements in Compressed Sparse Row format>