# Natural Language Processing and Topic Modeling on Interview Response Dataset

# Part 1: Load Data & Exploration

In [None]:
import numpy as np
import pandas as pd
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# Fetch the NLTK resources used below: the Punkt sentence-tokenizer models
# (needed by nltk.word_tokenize) and the English stop-word list.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from the 'punkt_tab' package instead
# of the pickled 'punkt' models, so download both to work across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
# Load the tab-separated data into a dataframe.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` keeps the same behaviour (malformed rows are reported
# and skipped instead of aborting the load).
df = pd.read_csv('data.tsv', sep='\t', on_bad_lines='warn')

In [None]:
# Preview the first rows of the loaded dataframe
df.head()

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Remove rows whose response text is missing. 'response_body' is the only
# column this notebook reads, and the vectorizer cannot process NaN entries.
# NOTE(review): the original subset was the placeholder name 'missing_column';
# confirm 'response_body' is the intended column.
df.dropna(subset=['response_body'], inplace=True)

In [None]:
# Rebuild a 0..n-1 index after dropping rows; drop=True discards the old
# index instead of keeping it as a column.
df.reset_index(inplace=True, drop=True)

In [None]:
# Summarise column dtypes and non-null counts after cleaning
df.info()

In [None]:
# Training data: the response text of the rows labelled 0 through 1999
# (label-based slicing with .loc includes the end label).
data = df['response_body'].loc[:1999].tolist()

# Part 2: Tokenizing and Stemming

In [None]:
# Start from NLTK's English stop-word list and extend it with extra tokens
# to ignore: the clitics "'s" and "'m", plus "br" and "watch"
# (presumably corpus-specific noise -- verify against the data).
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(["'s", "'m", "br", "watch"])

print("We use " + str(len(stopwords)) + " stop-words from nltk library.")
print(stopwords[:10])

In [None]:
# Create the English Snowball stemmer used by tokenization_and_stemming below.
from nltk.stem.snowball import SnowballStemmer
# from nltk.stem import WordNetLemmatizer 
stemmer = SnowballStemmer("english")

# tokenization and stemming
def tokenization_and_stemming(text):
    """Tokenize ``text`` and return the stems of its meaningful words.

    Each token from ``nltk.word_tokenize`` is lower-cased, dropped if it is
    in the module-level ``stopwords`` list, dropped if it is not purely
    alphabetic (``str.isalpha``), and finally reduced to its stem with the
    module-level ``stemmer``.

    Returns a list of stem strings in their original order.
    """
    lowered = (word.lower() for word in nltk.word_tokenize(text))
    return [
        stemmer.stem(token)
        for token in lowered
        # isalpha() rejects any token containing a digit or punctuation mark,
        # not only tokens that lack letters entirely.
        if token not in stopwords and token.isalpha()
    ]

# Part 3: TF-IDF

In this part, I will use the TfidfVectorizer() from the sklearn library to create the tf-idf matrix.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer settings:
#   tokenizer    -- callable that turns a document into a list of tokens
#   stop_words   -- built-in stop-word list to apply
#   ngram_range  -- (min_n, max_n); (1, 1) keeps unigrams only, while
#                   (1, 3) would also include 2-grams and 3-grams
#   min_df       -- ignore terms below this document frequency
#   max_df       -- ignore terms above this document frequency
#   max_features -- cap on the vocabulary size
#   use_idf      -- when False, only term frequency is computed
tfidf_model = TfidfVectorizer(
    tokenizer=tokenization_and_stemming,
    stop_words='english',
    ngram_range=(1, 1),
    min_df=0.01,
    max_df=0.99,
    max_features=1000,
    use_idf=True,
)

# Learn the vocabulary and IDF weights from the training responses and
# build the document-term matrix in one step.
tfidf_matrix = tfidf_model.fit_transform(data)

print("In total, there are " + str(tfidf_matrix.shape[0])
      + " responses and " + str(tfidf_matrix.shape[1]) + " terms.")

In [None]:
# Display the matrix returned by fit_transform
tfidf_matrix

In [None]:
# Save the vocabulary terms selected by TF-IDF, in column order.
# `get_feature_names()` was removed in scikit-learn 1.2; its replacement
# `get_feature_names_out()` returns an ndarray, so convert it back to a plain
# list of str to keep the original type.
tf_selected_words = tfidf_model.get_feature_names_out().tolist()

In [None]:
# Display the vocabulary terms kept by the vectorizer
tf_selected_words

# Part 4: K-means clustering

In this part, I will run the K-means algorithm to find possible clusters in our response dataset.



In [None]:
# Use Elbow Method to find the optimal number of clusters
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import matplotlib.style as style

# Fit K-means once per candidate k and record the model's inertia_
# (within-cluster sum of squared distances) for the elbow plot.
range_n_clusters = list(range(1, 11))
avg_distance = [
    KMeans(n_clusters=k, random_state=42).fit(tfidf_matrix).inertia_
    for k in range_n_clusters
]

style.use("fivethirtyeight")
plt.plot(range_n_clusters, avg_distance)
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Distance")
plt.show()

From the plot above, we can see that the curve drops sharply up to k = 5 and decreases much more slowly after that. So I decided to use 5 as the number of clusters for K-means.

In [None]:
# Analyze K-means Result

# k-means clustering
from sklearn.cluster import KMeans
num_clusters = 5
# Fix the seed (same value as the elbow search) so cluster ids are
# reproducible between runs; otherwise any discussion of "cluster 2" or
# "cluster 4" would refer to different groups each time the cell is executed.
km = KMeans(n_clusters=num_clusters, random_state=42)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

# Pair every training response with its assigned cluster label. `data` is
# the exact list the TF-IDF matrix was built from, so rows and labels align.
product = {'response': data, 'cluster': clusters}
frame = pd.DataFrame(product, columns=['response', 'cluster'])

In [None]:
# Preview the first 10 responses with their cluster labels
frame.head(10)

In [None]:
# Count how many responses were assigned to each cluster label
print ("Number of responses included in each cluster:")
frame['cluster'].value_counts().to_frame()

In [None]:
# Display the fitted cluster centroids
km.cluster_centers_

In [None]:
print("<Document clustering result by K-means>")

# Each centroid row holds one weight per vocabulary term. Ranking the column
# indices by descending weight gives the most important terms of a cluster.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

Cluster_keywords_summary = {}
for i in range(num_clusters):
    # Keep the 6 highest-weighted terms (change the slice for more or fewer).
    top_terms = [tf_selected_words[ind] for ind in order_centroids[i, :6]]
    Cluster_keywords_summary[i] = top_terms
    print("Cluster " + str(i) + " words:" + "".join(term + "," for term in top_terms))

    cluster_responses = frame[frame.cluster == i].response.tolist()
    print("Cluster " + str(i) + " responses (" + str(len(cluster_responses)) + " ) ")
   

In [None]:
# Plot the kmeans result
from sklearn.decomposition import KernelPCA
import seaborn as sns
# Project the TF-IDF vectors with KernelPCA and plot the first two
# components, coloured by K-means cluster label.
pca = KernelPCA(n_components=5)
# Dense copy of the matrix; kept for inspection, the projection below is
# fitted on the sparse matrix directly.
tfidf_matrix_np = tfidf_matrix.toarray()
X = pca.fit_transform(tfidf_matrix)
xs, ys = X[:, 0], X[:, 1]
pca_df = pd.DataFrame(dict(x=xs, y=ys, Cluster=clusters))
plt.subplots(figsize=(16, 9))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional argument is `data`).
sns.scatterplot(x='x', y='y', data=pca_df, hue='Cluster')

From the plot above we can see that cluster 4 contains more negative responses while cluster 2 contains more positive responses. The responses in cluster 0 are more neutral.

# Part 5: Topic Modeling - Latent Dirichlet Allocation

In [None]:
# Use LDA for clustering
from sklearn.decomposition import LatentDirichletAllocation
# Five topics, matching the number of K-means clusters. The seed is fixed so
# topic numbering and keywords are reproducible between runs.
lda = LatentDirichletAllocation(n_components=5, random_state=42)

In [None]:
# Fit LDA on the TF-IDF matrix and keep the document-topic matrix it returns,
# then print that matrix's shape and contents.
lda_output = lda.fit_transform(tfidf_matrix)
print(lda_output.shape)
print(lda_output)

In [None]:
# Topic-word matrix of the fitted LDA model; print its shape and contents
topic_word = lda.components_
print(topic_word.shape)
print(topic_word)

In [None]:
# Column labels: one per LDA topic
topic_names = ["Topic" + str(i) for i in range(lda.n_components)]

# Row labels: one per training document
doc_names = ["Doc" + str(i) for i in range(len(data))]

# Topic weights rounded to 2 decimals, for display only
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=doc_names)

# Dominant topic of each document. Take the argmax over the unrounded
# weights: after rounding, two close weights (e.g. 0.334 and 0.333) tie and
# argmax would return the lower-numbered topic rather than the true maximum.
topic = np.argmax(lda_output, axis=1)
df_document_topic['topic'] = topic

df_document_topic.head(10)

In [None]:
# Count how many documents have each dominant topic
df_document_topic['topic'].value_counts().to_frame()

In [None]:
# Topic-word weight table: one row per topic, one column per vocabulary term.
print(lda.components_)
df_topic_words = pd.DataFrame(lda.components_)
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement.
df_topic_words.columns = tfidf_model.get_feature_names_out()
df_topic_words.index = topic_names
df_topic_words.head()

In [None]:
# Top-n keywords for each topic
def print_topic_words(tfidf_model, lda_model, n_words):
    """Return the ``n_words`` highest-weighted vocabulary terms of every topic.

    Despite its name, this function prints nothing; it only returns.

    Args:
        tfidf_model: fitted vectorizer exposing ``get_feature_names_out()``.
        lda_model: fitted model whose ``components_`` has one row of term
            weights per topic, in the vectorizer's column order.
        n_words: number of terms to keep per topic.

    Returns:
        A list with one array of terms per topic, ordered from the highest
        weight to the lowest.
    """
    # `get_feature_names()` was removed in scikit-learn 1.2; the replacement
    # returns an array, and asarray also accepts a plain list.
    words = np.asarray(tfidf_model.get_feature_names_out())
    return [
        # argsort is ascending, so reverse it before taking the top n_words
        words.take(weights.argsort()[::-1][:n_words])
        for weights in lda_model.components_
    ]

# Collect the 15 strongest keywords of every topic into a table with one row
# per topic and one column per keyword rank.
topic_keywords = print_topic_words(tfidf_model, lda, 15)

df_topic_words = pd.DataFrame(topic_keywords)
df_topic_words.columns = [
    'Word ' + str(rank) for rank in range(len(df_topic_words.columns))
]
df_topic_words.index = [
    'Topic ' + str(row) for row in range(len(df_topic_words.index))
]
df_topic_words

# Discussion

K-means has some limitations. It is very sensitive to outliers. It can produce very small clusters corresponding to outliers. And K-means also has difficulties with clusters of different sizes and densities. 

Latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar. The LDA model is highly modular and can, therefore, be easily extended. The main field of interest is modeling relations between topics. In this task, LDA did a better job of clustering the responses.