In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import collections

In [None]:
# Fetch the NLTK resources this notebook reads: the English stop-word list,
# WordNet (used for lemmatisation through `wn.morphy`) and the Brown corpus.
# NOTE(review): "punkt" is not referenced by any cell visible here — confirm
# it is still needed.
# The value displayed under the cell is the return value of the last call.
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("brown")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from nltk.corpus import brown as corpus

In [None]:
# Preview the first 300 tokens of the first corpus file, 25 tokens per row.
first_fileid = corpus.fileids()[0]
preview_tokens = corpus.words(first_fileid)[:300]
for position, token in enumerate(preview_tokens, start=1):
    print(token, end=" ")
    if position % 25 == 0:
        # Every 25th token closes the current row.
        print(" ")

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .  
The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise  
and thanks of the City of Atlanta '' for the manner in which the election was conducted . The September-October term jury had been charged  
by Fulton Superior Court Judge Durwood Pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by Mayor-nominate Ivan  
Allen Jr. . `` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in  
the election , the number of voters and the size of this city '' . The jury said it did find that many of Georgia's  
registration and election laws `` are outmoded or inadequate and often ambiguous '' . It recommended that Fulton legislators act `

In [None]:
# Number of files in the corpus; each file is loaded as one document further down.
len(corpus.fileids())

500

In [None]:
# One token sequence per corpus file, in the order reported by `fileids()`.
docs = list(map(corpus.words, corpus.fileids()))

print(docs[:5])
print("num of docs:", len(docs))

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...], ['Austin', ',', 'Texas', '--', 'Committee', 'approval', ...], ['Several', 'defendants', 'in', 'the', 'Summerdale', ...], ['Oslo', 'The', 'most', 'positive', 'element', 'to', ...], ['East', 'Providence', 'should', 'organize', 'its', ...]]
num of docs: 500


In [None]:
# English stop words from NLTK; handed to `preprocess_word` as the stop-word collection.
en_stop = nltk.corpus.stopwords.words('english')

In [None]:
from nltk.corpus import wordnet as wn

def preprocess_word(word, stopwordset):
    """Normalise a single token, or return ``None`` if it should be discarded.

    The token is lower-cased, then dropped when it contains no letter or
    digit (pure punctuation) or when it is a stop word. Surviving tokens are
    passed to ``wn.morphy``; if that yields a base form which is not a stop
    word, the base form is returned, and if it yields ``None`` the lower-cased
    token itself is returned.

    Parameters
    ----------
    word : str
        Raw token.
    stopwordset : container of str
        Stop words to filter out. Only membership tests (``in``) are
        performed, so a list, set or frozenset all work.

    Returns
    -------
    str or None
        The normalised token, or ``None`` when the token is filtered out.
    """
    word = word.lower()

    # Discard tokens that carry no letter or digit. A fixed list of
    # (",", ".", "''", "``") would let other punctuation-only tokens such as
    # "--", ";" or "?" slip through into the cleaned documents.
    if not any(ch.isalnum() for ch in word):
        return None

    if word in stopwordset:
        return None

    lemma = wn.morphy(word)
    if lemma is None:
        # No base form available: keep the lower-cased surface form.
        return word

    # The base form can be a stop word even when the inflected form is not.
    if lemma in stopwordset:
        return None
    return lemma
    

def preprocess_document(document):
    """Return the tokens of ``document`` that survive ``preprocess_word``.

    Each token is normalised with ``preprocess_word`` using the notebook-level
    ``en_stop`` stop-word collection; tokens for which it returns ``None``
    are left out. The result is always a new list.
    """
    kept_tokens = []
    for raw_token in document:
        normalised = preprocess_word(raw_token, en_stop)
        if normalised is not None:
            kept_tokens.append(normalised)
    return kept_tokens

def preprocess_documents(documents):
    """Apply ``preprocess_document`` to every document and return the results as a list."""
    processed = []
    for single_document in documents:
        processed.append(preprocess_document(single_document))
    return processed

In [None]:
# First 25 raw tokens of the first document, before any preprocessing.
print(docs[0][:25])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [None]:
# Show the first 500 cleaned tokens of the first document. Only that one
# document is preprocessed: pushing the whole corpus through the pipeline
# just to display a single document is wasted work.
print(preprocess_document(docs[0])[:500])

['fulton', 'county', 'grand', 'jury', 'say', 'friday', 'investigation', "atlanta's", 'recent', 'primary', 'election', 'produce', 'evidence', 'irregularity', 'take', 'place', 'jury', 'say', 'term-end', 'presentment', 'city', 'executive', 'committee', 'over-all', 'charge', 'election', 'deserve', 'praise', 'thanks', 'city', 'atlanta', 'manner', 'election', 'conduct', 'september-october', 'term', 'jury', 'charge', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'investigate', 'report', 'possible', 'irregularity', 'hard-fought', 'primary', 'mayor-nominate', 'ivan', 'allen', 'jr.', 'relative', 'handful', 'report', 'receive', 'jury', 'say', 'consider', 'widespread', 'interest', 'election', 'number', 'voter', 'size', 'city', 'jury', 'say', 'find', 'many', "georgia's", 'registration', 'election', 'laws', 'outmode', 'inadequate', 'often', 'ambiguous', 'recommend', 'fulton', 'legislator', 'act', 'laws', 'study', 'revise', 'end', 'modernize', 'improve', 'grand', 'jury', 'comment', 'numbe

In [None]:
# Clean every document and flatten each token list into one space-separated
# string, the form that is fed to the vectorizer.
pre_docs = [" ".join(tokens) for tokens in preprocess_documents(docs)]
print(pre_docs[0])

# Vocabulary is capped at 200 features; the token pattern matches runs of one
# or more word characters between word boundaries.
vectorizer = TfidfVectorizer(token_pattern=u'(?u)\\b\\w+\\b', max_features=200)



In [None]:
# Fit the vectorizer on the cleaned documents and transform them in one step.
tf_idf = vectorizer.fit_transform(pre_docs)

In [None]:
# Cluster count for the first K-Means run; `random_state` is pinned so the
# run can be repeated with the same seed.
num_clusters = 15
km = KMeans(n_clusters=num_clusters, random_state = 0)

In [None]:
# Fit K-Means on the TF-IDF matrix and keep the predicted cluster label of each document.
clusters = km.fit_predict(tf_idf)

In [None]:
# Sanity check: the number of labels should equal the number of documents.
print(len(clusters))

500


In [None]:
# Print the cluster label next to the cleaned text of the first 300 documents.
# Slicing both sequences replaces the manual counter/break bookkeeping and
# leaves no stray `count` variable behind in the notebook namespace.
preview_limit = 300
for doc, cls in zip(pre_docs[:preview_limit], clusters[:preview_limit]):
    print(cls, doc)

4 austin texas -- committee approval gov. price daniel's abandon property act seem certain thursday despite adamant protest texas banker daniel personally led fight measure water considerably since rejection two previous legislature public hearing house committee revenue taxation committee rule go automatically subcommittee one week question committee member taunt banker appearing witness left little doubt recommend passage daniel term extremely conservative estimate would produce 17 million dollar help erase anticipate deficit 63 million dollar end current fiscal year next aug. 31 tell committee measure would merely provide means enforce escheat law book since texas republic permit state take bank account stocks personal property person miss seven years bill daniel say draft personally would force banks insurance firm pipeline company corporation report property state treasurer escheat law cannot enforce almost impossible locate property daniel declare dewey lawrence tyler lawyer repr

In [None]:
# Number of documents assigned to each cluster; list index = cluster label.
# `minlength` keeps an entry (0) for any cluster that received no document.
cluster_count = np.bincount(clusters, minlength=num_clusters).tolist()

print(cluster_count)

[32, 43, 19, 7, 11, 13, 20, 40, 16, 215, 3, 11, 22, 14, 34]


In [None]:
# Re-run the clustering with 5 clusters and report how many documents land in each.
num_clusters = 5
km = KMeans(n_clusters=num_clusters, random_state = 0)
clusters = km.fit_predict(tf_idf)

# Documents per cluster; list index = cluster label.
cluster_count = np.bincount(clusters, minlength=num_clusters).tolist()

print(cluster_count)

[282, 30, 56, 73, 59]


In [None]:
# Re-run the clustering with 10 clusters and report how many documents land in each.
num_clusters = 10
km = KMeans(n_clusters=num_clusters, random_state = 0)
clusters = km.fit_predict(tf_idf)

# Documents per cluster; list index = cluster label.
cluster_count = np.bincount(clusters, minlength=num_clusters).tolist()

print(cluster_count)

[38, 52, 22, 8, 12, 15, 32, 45, 15, 261]


In [None]:
# Re-run the clustering with 20 clusters and report how many documents land in each.
num_clusters = 20
km = KMeans(n_clusters=num_clusters, random_state = 0)
clusters = km.fit_predict(tf_idf)

# Documents per cluster; list index = cluster label.
cluster_count = np.bincount(clusters, minlength=num_clusters).tolist()

print(cluster_count)

[13, 15, 14, 32, 22, 207, 13, 11, 3, 28, 18, 39, 9, 15, 8, 8, 9, 2, 18, 16]


In [None]:
# Re-run the clustering with 25 clusters and report how many documents land in each.
num_clusters = 25
km = KMeans(n_clusters=num_clusters, random_state = 0)
clusters = km.fit_predict(tf_idf)

# Documents per cluster; list index = cluster label.
cluster_count = np.bincount(clusters, minlength=num_clusters).tolist()

print(cluster_count)

[11, 12, 14, 13, 2, 12, 19, 171, 37, 4, 11, 8, 21, 8, 13, 16, 35, 7, 9, 1, 18, 18, 9, 18, 13]


In [None]:
# Re-run the clustering with 30 clusters and report how many documents land in each.
num_clusters = 30
km = KMeans(n_clusters=num_clusters, random_state = 0)
clusters = km.fit_predict(tf_idf)

# Documents per cluster; list index = cluster label.
cluster_count = np.bincount(clusters, minlength=num_clusters).tolist()

print(cluster_count)

[6, 12, 14, 12, 2, 12, 18, 157, 36, 4, 11, 8, 13, 8, 13, 16, 35, 7, 2, 1, 16, 18, 9, 17, 14, 7, 2, 10, 11, 9]
