# Machine Learning on Text: Clustering Assignment

In [1]:
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer



### Ingest the company_profiles data set into a corpus.

In [None]:
# Location of the company profile text files and the regex selecting them.
path = 'company_profiles/'
DOC_PATTERN = r'.*\.txt'

# Build a plaintext corpus reader over every file matching the pattern.
corpus = PlaintextCorpusReader(root=path, fileids=DOC_PATTERN)

# Show the file ids the reader picked up.
corpus.fileids()

### Create a list of documents by extracting the raw text for each fileid in the corpus.

In [None]:
# Raw text of every file in the corpus, in the order of corpus.fileids().
docs = list(map(corpus.raw, corpus.fileids()))

# Peek at the first document.
docs[0]

### Preprocess the documents, including the steps below.

- Word tokenize the document.
- Lowercase all tokens.
- Lemmatize and stem the tokens.
- Remove stop words and punctuation.

In [None]:
def preprocess(docs):
    """Normalize raw documents into space-joined strings of stemmed tokens.

    Each document is word-tokenized; tokens that are purely alphabetic and
    whose lowercase form is not an English stop word are lowercased,
    lemmatized, and then stemmed. The surviving tokens are re-joined with
    single spaces.

    Args:
        docs: Iterable of raw document strings.

    Returns:
        A list with one preprocessed string per input document, in order.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer('english')
    # Build the stop-word set once: calling stopwords.words() per token
    # re-reads the word list and scans it linearly for every membership test.
    stop_words = set(stopwords.words('english'))
    preprocessed = []

    for doc in docs:
        cleaned = []
        for token in word_tokenize(doc):
            lowered = token.lower()
            # isalpha() drops punctuation and numeric tokens.
            if lowered in stop_words or not token.isalpha():
                continue
            cleaned.append(stemmer.stem(lemmatizer.lemmatize(lowered)))
        preprocessed.append(" ".join(cleaned))

    return preprocessed

In [None]:
# Run the cleaning pipeline over every raw document.
preprocessed = preprocess(docs=docs)

# Inspect the first cleaned document.
preprocessed[0]

### TF-IDF vectorize the preprocessed documents.

In [None]:
# Learn the vocabulary and IDF weights, and produce the document-term matrix.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(raw_documents=preprocessed)

# Display the resulting matrix summary.
vectors

### Determine the optimal number of clusters using the Yellowbrick library's KElbow Visualizer and a KMeans clustering algorithm.

In [None]:
from pylab import rcParams

# Enlarge the default figure so the elbow plot is readable.
rcParams['figure.figsize'] = (12, 10)

# Score K-Means for the candidate k values given by the range (2, 20)
# using the silhouette metric.
model = KMeans()
visualizer = KElbowVisualizer(model, k=(2, 20), metric='silhouette')

visualizer.fit(vectors)

### Perform K-Means Clustering using the optimal number of clusters determined in the previous step.

In [None]:
# Fit K-Means with 10 clusters and get one cluster label per document.
kmeans = KMeans(n_clusters=10)
clusters = kmeans.fit_predict(X=vectors)

# Display the label array.
clusters

In [None]:
# Tabulate the documents together with their K-Means cluster label and
# source file, mirroring the layout used for the agglomerative results.
df = pd.DataFrame(docs, columns=['text'])
df['cluster'] = clusters
df['file'] = corpus.fileids()
df.head()

### Perform Agglomerative Clustering using the same number of clusters.

In [None]:
# Agglomerative clustering with 10 clusters; the sparse TF-IDF matrix is
# converted to a dense array before fitting.
agl = AgglomerativeClustering(n_clusters=10)
clusters = agl.fit_predict(vectors.toarray())

# One row per document: raw text, assigned cluster, and source file id.
df = pd.DataFrame(docs, columns=['text']).assign(
    cluster=clusters,
    file=corpus.fileids(),
)
df.head()

### Choose one of the three topic modeling approaches covered. Cluster into the optimal number of clusters and extract 5 keywords that represent the documents in each cluster.

In [None]:
from sklearn.decomposition import NMF

# Number of topics to extract and keywords to report per topic.
# NOTE(review): 8 topics differs from the 10 clusters used above - confirm
# this is intentional.
num_topics = 8
num_keywords = 5

# Factorize the TF-IDF matrix; the cell output is the document-topic matrix.
model = NMF(n_components=num_topics)
model.fit_transform(X=vectors)

In [None]:
# Resolve the vocabulary once rather than once per printed keyword.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# For each topic, print its num_keywords highest-weighted terms with weights.
for index, topic in enumerate(model.components_):
    print('topic', index)
    # argsort is ascending, so the reversed tail slice yields the largest
    # weights first.
    for i in topic.argsort()[:-num_keywords - 1:-1]:
        print(feature_names[i], topic[i])

    print('__________')