# Modern Data Science
**(Module 07: Natural Language Processing)**

---
- Materials in this module include resources collected from various open-source online repositories.
- You are free to use, change and distribute this package.
- If you find any issue/bug in this document, please submit an issue at [tulip-lab/mds](https://github.com/tulip-lab/mds/issues)

Prepared by and for 
**Student Members** |
2006-2019 [TULIP Lab](http://www.tulip.org.au)

---

# Session F - Topic Model - LDA

<a id = "Computing"></a>

## <span style="color:#0b486b">Import the packages</span>

In [None]:
# !pip install textblob
# !pip install ftfy
!pip install pyLDAvis

In [None]:
from nltk.corpus import stopwords
from nltk import stem,pos_tag
import re
from collections import Counter
import numpy as np
%pylab inline
from textblob import TextBlob
from sklearn.metrics import silhouette_score,confusion_matrix,accuracy_score,roc_curve
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import label_binarize,Normalizer
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
import ftfy
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.grid_search import GridSearchCV
import statsmodels.api as sm
from seaborn import lmplot
import os  # for os.path.basename
import matplotlib as mpl
from sklearn.manifold import MDS,TSNE
%pylab inline
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD,PCA,NMF
from sklearn.pipeline import make_pipeline
from sklearn.decomposition.online_lda import LatentDirichletAllocation

<a id = "Function"></a>

## <span style="color:#0b486b">Function to get titles of talks</span>

In [None]:
def get_titles_from_talks(talks):
    """Collect the lower-cased title line of every non-empty talk.

    Each talk is split on a blank line into a header and a speech, and the
    header is split into exactly two lines: the title and the view count.
    Empty strings in ``talks`` are skipped.  A talk that does not split into
    exactly two parts at either step raises ``ValueError`` from the tuple
    unpacking.
    """
    collected = []
    for entry in talks:
        # Skip empty fragments
        if entry == "":
            continue
        # Header first, speech second; only the header is needed here
        header, _speech = entry.lower().split("\n\n")
        # The header is "<title>\n<views>"; keep the title
        heading, _views = header.split("\n")
        collected.append(heading)
    return collected

In [None]:
!pip install wget

In [None]:
import wget

# Fetch one transcript file per category. Each variable ends up holding the
# value returned by wget.download, which is later handed to open().
entertainment = wget.download('https://github.com/tulip-lab/mds/tree/master/Jupyter/data/entertainment.txt?raw=true')

technology = wget.download('https://github.com/tulip-lab/mds/tree/master/Jupyter/data/technology.txt?raw=true')

science = wget.download('https://github.com/tulip-lab/mds/tree/master/Jupyter/data/science.txt?raw=true')

business = wget.download('https://github.com/tulip-lab/mds/tree/master/Jupyter/data/business.txt?raw=true')

global_issues = wget.download('https://github.com/tulip-lab/mds/tree/master/Jupyter/data/global_issues.txt?raw=true')

In [None]:
def _load_category_titles(path):
    """Read one category file and return the titles of its distinct talks.

    The file is decoded as UTF-8, repaired with ``ftfy.fix_text`` and split
    into talks on three consecutive blank lines.  Talks repeated verbatim in
    the same file are collapsed with ``set`` before the titles are extracted,
    so the order of the returned titles is not defined.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        raw_talks = ftfy.fix_text(handle.read()).split("\n\n\n\n")
    return get_titles_from_talks(list(set(raw_talks)))


ent = _load_category_titles(entertainment)

tech = _load_category_titles(technology)

# NOTE: from here on `science` and `business` hold title lists rather than
# the downloaded file names they held before.
science = _load_category_titles(science)

business = _load_category_titles(business)

glob = _load_category_titles(global_issues)

<a id = "Getting"></a>

## <span style="color:#0b486b">Getting the talks that occur only once over the whole corpus.</span>

In [None]:
# Tally how often each title appears across all five category lists
topic_c = Counter()
for category_titles in (ent, tech, glob, business, science):
    topic_c.update(category_titles)
# Split the tally into parallel tuples: titles and their counts
topic, c = zip(*topic_c.items())
# Keep the titles whose overall count is exactly one
titles = np.array(topic)[np.array(c) == 1]



### These single letter labels make it easy to understand the output in some places

s -> science --- 0

t -> Technology --- 1

b ->business --- 2

g ->global --- 3

e -> entertainment ---4

<a id = "Creating"></a>

## <span style="color:#0b486b">Creating a dictionary of labels for all the talks</span>


### These are the actual labels: I'm getting them because I'm loading the talks category-wise.

In [None]:
# Build a title -> label lookup once, so each unique title is resolved with a
# single dictionary access instead of scanning the five category lists.
# Categories are applied in this order; a title present in several lists ends
# up with the label of the last one.
label_by_title = {}
for label, category_titles in (("t", tech),      # technology
                               ("e", ent),       # entertainment
                               ("b", business),  # business
                               ("g", glob),      # global issues
                               ("s", science)):  # science
    for category_title in category_titles:
        label_by_title[category_title] = label

# One entry per unique title, in the order the titles appear in `titles`
d = {title: label_by_title[title] for title in titles if title in label_by_title}

<a id = "Category"></a>

## <span style="color:#0b486b">Category sizes/splits :</span>

business = 123

entertainment = 152

global issues = 245

science = 209

technology = 249

In [None]:
Counter(d.values())

----

We do not have much class imbalance.

Base line accuracy is 25-30%

-----

<a id = "Loading"></a>

## <span style="color:#0b486b">Loading all the documents together, retaining all the punctuations to get a proper count of pauses and question marks for EDA</span>

In [None]:
# Download the combined transcript file; `allt` holds what wget.download
# returns and is later handed to open().
allt = wget.download('https://github.com/tulip-lab/mds/tree/master/Jupyter/data/all.txt?raw=true')

In [None]:
!pip install nltk

In [None]:
import nltk

In [None]:
# Called without arguments.
# NOTE(review): presumably this opens NLTK's interactive downloader. The cells
# below use stopwords, WordNetLemmatizer and pos_tag, so the matching NLTK
# resources have to be fetched here - confirm the exact package ids.
nltk.download()

In [None]:
# Load the file that contains all the talks (every category together)
with open(allt, 'r', encoding='utf-8') as talk_file:
    talks = ftfy.fix_text(talk_file.read()).split("\n\n\n\n")

# De-duplicate and drop empty fragments. Filtering (rather than list.remove)
# also works when the file produced no empty fragment at all.
talks = [talk for talk in set(talks) if talk != ""]
lemma = WordNetLemmatizer()
stop_w = stopwords.words("english")
# Set copy of the stop words for constant-time membership tests in the loop
stop_lookup = set(stop_w)
views = []
speeches = []
title = []
labels = []
senti = []

# Loop over talks
for talk in talks:
    # Split the header from the actual speech
    h, s = talk.lower().split("\n\n")
    # The header holds the title and the number of views
    t, v = h.split("\n")
    # Keep only talks whose title is a key of d (titles unique over the corpus)
    if t in d:
        # Strip periods, commas and double quotes; "--" and "?" are left in
        # place so the EDA counters can still see them
        s = s.replace(". "," ").replace(", "," ").replace(","," ").replace("."," ").replace("  "," ").replace('"'," ")

        # Remove stop words
        s = " ".join(i for i in s.split() if i not in stop_lookup)

        # Sentiment polarity of the cleaned speech
        senti.append(TextBlob(s).polarity)

        speeches.append(s)
        labels.append(d[t])
        views.append(v)
        title.append(t)

<a id = "Creating labels"></a>

## <span style="color:#0b486b">Creating labels in terms of numbers is useful, some functions need them </span>

In [None]:
# Numeric code for each one-letter category label:
# s -> 0, t -> 1, b -> 2, g -> 3, e -> 4
label_to_num = {"s": 0, "t": 1, "b": 2, "g": 3, "e": 4}

# One code per entry of `labels`, in the same order. An unknown label raises
# KeyError instead of being skipped, so num_label always stays aligned with
# `labels`.
num_label = [label_to_num[i] for i in labels]


<a id = "Counting"></a>

## <span style="color:#0b486b">Counting functions for EDA</span>

In [None]:
def count_pause(speeches):
    """Return, per speech, the summed occurrences of "--", " --" and "-- ".

    The three patterns are counted independently of each other, so a dash
    pair with a space on both sides adds three to that speech's total.
    """
    patterns = ("--", " --", "-- ")
    return [sum(text.count(p) for p in patterns) for text in speeches]
def count_questions(speeches):
    """Return, per speech, the summed occurrences of "?", " ?" and "? ".

    The three patterns are counted independently of each other, so one
    question mark can contribute more than once.
    """
    patterns = ("?", " ?", "? ")
    return [sum(text.count(p) for p in patterns) for text in speeches]
def count_laughters(speeches):
    """Return, per speech, the occurrences of "(laughter)" plus "laughter".

    Both patterns are counted independently, so each "(laughter)" marker
    adds two to that speech's total.
    """
    patterns = ("(laughter)", "laughter")
    return [sum(text.count(p) for p in patterns) for text in speeches]
def count_applause(speeches):
    """Return, per speech, the occurrences of "(applause)" plus "applause".

    Both patterns are counted independently, so each "(applause)" marker
    adds two to that speech's total.
    """
    patterns = ("(applause)", "applause")
    return [sum(text.count(p) for p in patterns) for text in speeches]


In [None]:
# Convert the collected view counts to an integer array
views = np.array(views).astype(int)
# One column per EDA feature, one row per retained talk
d_ = {
    "views": views,
    "labels": labels,
    'sentiment': senti,
    'pause': count_pause(speeches),
    'questions': count_questions(speeches),
    'laughter': count_laughters(speeches),
    "applause": count_applause(speeches),
}
df_eda = pd.DataFrame(data=d_)

<a id = "EDA"></a>

## <span style="color:#0b486b">EDA and summary stats</span>

In [None]:
df_ = df_eda.groupby("labels").sum()
df_

In [None]:
import seaborn as sns

# Display names in the alphabetical order of the one-letter labels
# (b, e, g, s, t), i.e. the row order of the grouped frame df_.
category_names = ["business","entertainment","global","science","tech"]

plt.figure(figsize=(15,5))
for position, column in enumerate(["views", "sentiment"], start=1):
    plt.subplot(1,2,position)
    # x and y are passed by keyword so the call does not depend on the
    # positional signature of sns.barplot
    sns.barplot(x=category_names, y=df_[column])
    plt.title(column)

In [None]:
# Display names in the alphabetical order of the one-letter labels
# (b, e, g, s, t), i.e. the row order of the grouped frame df_.
category_names = ["business","entertainment","global","science","tech"]

# (column of df_, panel title) for each of the four panels
panels = [("laughter", "laughter"),
          ("pause", "pauses"),
          ("questions", "questions"),
          ("applause", "applause")]

plt.figure(figsize=(12,9))
for position, (column, panel_title) in enumerate(panels, start=1):
    plt.subplot(2,2,position)
    # x and y are passed by keyword so the call does not depend on the
    # positional signature of sns.barplot
    sns.barplot(x=category_names, y=df_[column])
    plt.title(panel_title)
plt.tight_layout()

In [None]:
view = np.array(df_eda.views).astype(int)

In [None]:
plt.figure(figsize=(12,9))
sns.distplot(view);

<a id = "Pre-processing"></a>

## <span style="color:#0b486b">Pre-processing</span>


1)  encoding

2) punctuation and symbols 

3) stop words removal

4) Lemmatization 

5) POS tagging  retaining NN and NNP


In [None]:
# Start from NLTK's English stop words, then extend the list with extra
# entries: filler words, contractions, number tokens and laughter markers.
# The entries are used exactly as spelled here; a word listed twice is
# harmless because the list is only used for membership tests and as the
# vectorizers' stop_words argument.
stop_w = stopwords.words("english")
stop_w.extend(["say","we're","said","things","becae","jt","it's",'one','like','people','going','know',"that's",'think','see','really',"get","would","i'm","don't","us","actually","may","always","found","fact","lost","you've","end"])
stop_w.extend(["sided","something","thing","got","also","we've","there's","time","well","way","want","could","first","two","new","they're","you're","take","back","need","many","kind","ever","four","five","used","maybe","start"])
stop_w.extend(["go","right","make","look","much","even","little","good","work","lot","put","use","three","come","around","different","another","i'll","ask","took","came","tell","great","find","i've","give","went","called","didn't","talk"])
stop_w.extend(["every","thank","day","big","can't","made","started","still","might","let's","idea","000","what's","years","year","able","start","example","question","show","problem","next","part","let","ago","doesn't","he's","here's","help"])
stop_w.extend(["almost","living","none","we'd","people's","using","says","okay","yet","10","second","i'd","goes","try","point","20","without","getting","happen","anything","else","wheather","true","ok","30","isn't","per","given","others","we'll","wouldn't","size","who's"])
stop_w.extend(["yeah","simple","laughing","laughter","(laughter)"])

In [None]:
with open(allt, 'r', encoding='utf-8') as talk_file:
    talks = ftfy.fix_text(talk_file.read()).split("\n\n\n\n")

# De-duplicate and drop empty fragments. Filtering (rather than list.remove)
# also works when the file produced no empty fragment at all.
talks = [talk for talk in set(talks) if talk != ""]
lemma = WordNetLemmatizer()
# Set copy of the extended stop-word list for constant-time membership tests
stop_lookup = set(stop_w)
# Part-of-speech tags that are retained
kept_tags = ("NN", "NNP")
views = []
speeches = []
title = []
labels = []

for talk in talks:
    # Header (title + views) first, speech second
    h, s = talk.lower().split("\n\n")

    t, v = h.split("\n")

    # Keep only talks whose title is a key of d
    if t in d:

        # BASIC PUNCTUATION REMOVAL

        s = s.replace(". "," ").replace(", "," ").replace(","," ").replace("."," ").replace("  "," ").replace('"'," ")
        s = s.replace("-- "," ").replace(" --"," ").replace("? "," ").replace("?"," ").replace("  "," ")
        # Drop parenthesised single-word markers such as "(laughter)"
        s = re.sub(r"\((.\w+)\)","",s)

        # STOP WORDS removal and LEMMATIZING
        # (each word is tested against the stop list before it is lemmatized)

        s = " ".join(lemma.lemmatize(i) for i in s.split() if i not in stop_lookup)

        # POS tagging: keep the words tagged NN or NNP in their original
        # order. Filtering the (word, tag) pairs directly also copes with a
        # speech that is empty after stop-word removal.
        s = " ".join(word for word, tag in pos_tag(s.split()) if tag in kept_tags)
        speeches.append(s)

        labels.append(d[t])
        views.append(v)
        title.append(t)

<a id = "Checking"></a>

## <span style="color:#0b486b">Checking the word counts for better understanding</span>

In [None]:
# Corpus-wide word frequencies over the pre-processed speeches
c = Counter()
for s in speeches:
    c.update(s.split())
from operator import itemgetter
# The ten most frequent (word, count) pairs, highest count first
by_count = sorted(c.items(), key=itemgetter(1), reverse=True)
sorted_ = by_count[:10]
# Unzip into words `w` and counts `c`; `c` stops being the Counter here
w, c = zip(*sorted_)

<a id = "Plotting"></a>

## <span style="color:#0b486b">Plotting the word counts </span>

In [None]:
# One bar per word, positioned at 0, 1, 2, ...
indexes = np.arange(len(w))
width = 1

# The bars are explicitly centred on their x positions and the tick labels
# are placed at those same positions, so each word sits under the middle of
# its bar regardless of the default bar alignment.
plt.bar(indexes, c, width, align="center")
plt.xticks(indexes, w)
plt.title("word count")
plt.show()

<a id = "Count Vectorizer"></a>

## <span style="color:#0b486b">Count Vectorizer, Tfidf vectorizer</span>

In [None]:
# Pre-processed speeches and their category labels, side by side
df = pd.DataFrame({"speeches": speeches, "labels": labels})

# Both vectorizers use the same vocabulary filters
vectorizer_options = dict(max_df=0.95, min_df=2, stop_words=stop_w)

# TFIDF VECTORIZER
tfidf_model = TfidfVectorizer(**vectorizer_options)
vectorized_tfidf = tfidf_model.fit_transform(df.speeches)

# COUNT VECTORIZER
tf_model = CountVectorizer(**vectorizer_options)
vectorized_tf = tf_model.fit_transform(df.speeches)


<a id = "Visualizing"></a>

## <span style="color:#0b486b">Visualizing the TFIDF vectors using a manifold similarity between document vectors</span>

In [None]:
# Pairwise cosine distance between the TF-IDF document vectors
dist = 1 - cosine_similarity(vectorized_tfidf)

# Three components because the points are drawn in a 3-D scatter plot;
# "precomputed" because we provide a distance matrix;
# `random_state` is fixed so the plot is reproducible.
mds = MDS(n_components=3, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # one row per document, one column per component

xs, ys, zs = pos[:, 0], pos[:, 1], pos[:, 2]

fig = pylab.figure(figsize=(10,7))
ax = fig.add_subplot(111, projection ="3d")
# Colour each document by its numeric category label
sc = ax.scatter(xs,ys,zs,c=num_label,cmap=plt.cm.rainbow)


> Visualizing the similarities among the document vectors. There is no clear pattern, but we will see by the end of the project that the doc2vec document vectors make more sense.

<a id = "Running LDA "></a>

## <span style="color:#0b486b">Running LDA on count vectorized documents</span>
LDA on count vectorized vectors gave better results as compared to LDA on TFIDF.

In [None]:
# build LDA model
# (LatentDirichletAllocation with 5 topics, max_iter=5, the 'online'
# learning method and a fixed random_state)

lda = LatentDirichletAllocation(n_topics=5,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=42)

# Fitted on the count-vectorized documents
lda.fit(vectorized_tf)
# Per-topic word weights, and the vocabulary of the count vectorizer
topics_words = lda.components_
words = tf_model.get_feature_names()

# Print the words that represent the topics

def print_top_words(model,words,n_top_words=20):
    """Print, for each row of ``model.components_``, its highest-weighted words.

    ``words`` is indexed by column position of ``model.components_``.  Each
    topic is printed as a "Topic #<index>:" line followed by one line with up
    to ``n_top_words`` words joined by "|", strongest first.
    """
    for topic_no, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_no)
        # Column indices ordered from largest to smallest weight
        strongest_first = weights.argsort()[::-1]
        top_words = [words[col] for col in strongest_first[:n_top_words]]
        print("|".join(top_words))
print("Topics in LDA model:")
tf_feature_names = tf_model.get_feature_names()
print_top_words(lda, tf_feature_names)

# Spot check on one pre-processed speech: show its words tagged NN or NNP.
# Both tags are combined in one boolean mask so that NNP words are actually
# included in the selection.
a,b=zip(*nltk.pos_tag(speeches[2].split()))
tags = np.array(b)
index = np.where((tags == "NN") | (tags == "NNP"))
np.array(a)[index]

How LDA performs compared to the actual labels that we have:

In [None]:
# Topic weights of every document under the fitted LDA model
v = lda.transform(vectorized_tf)
# Number of documents whose strongest topic is each topic index
Counter(np.argmax(v, axis=1))

> It doesn't perform that well: we can see that the documents are unevenly distributed among the clusters, even though we do not have any class imbalance in our data.

In [None]:
filename = 'https://github.com/tulip-lab/mds/raw/master/Jupyter/image/LDA_clusters.png'

In [None]:
from IPython.display import Image
Image(filename, width=1000)

In [None]:
# The LDA model was fitted on the count matrix, so it is visualised with that
# same matrix and the count vectorizer that produced it.
pyLDAvis.sklearn.prepare(lda,vectorized_tf,tf_model)

<a id = "NMF"></a>

## <span style="color:#0b486b">NMF - Topic Modeling using TFIDF </span>

In [None]:
#Build the NMF Model
# (5 components, "nndsvd" initialisation, at most 200 iterations)

nmf = NMF(init="nndsvd",
            n_components=5,
            max_iter=200)
# Fitted on the TF-IDF matrix
nmf.fit(vectorized_tfidf)
# Per-topic word weights, and the vocabulary of the TF-IDF vectorizer
topics_words = nmf.components_
words = tfidf_model.get_feature_names()

print("Topics in NMF model:")
tf_feature_names = tfidf_model.get_feature_names()
print_top_words(nmf, tf_feature_names)

<a id = "Manual"></a>

## <span style="color:#0b486b">Manual labelling of the topics</span>

0 -->  global

1 -->  technology

2 -->  science

3 --> entertainment

4 ---> business


<a id = "Checking"></a>

## <span style="color:#0b486b">Checking performance of Topic Modeling</span>

In [None]:
# Topic weights of every document under the fitted NMF model
v = nmf.transform(vectorized_tfidf)

# Number of documents whose strongest topic is each topic index
Counter(np.argmax(v, axis=1))

In [None]:
highest_weighted_topics = [np.argmax(i) for i in v]

> We can see that the documents are almost equally distributed over all the categories, which is how our initial data was divided into categories

### Manually labelling the topics 

In [None]:
# Hand-made mapping from NMF topic index to one-letter category label.
# It has to be re-derived from the printed topic words whenever NMF is re-run.
topic_to_label = {0: "t", 1: "b", 2: "s", 3: "g", 4: "e"}

# Predicted label per document; topic indices outside the mapping are skipped
p = [topic_to_label[i] for i in highest_weighted_topics if i in topic_to_label]


** Note:

Make sure you label the topics manually every time you re-run NMF.  

NMF assigns random numbers to the topics that it prints. So read the words , label the topics using the previous cell. That is ->   p....    and then check for accuracy

Checking  for accuracy after doing the labeling 

In [None]:
print("accuracy of my topic modeling : ",np.round(accuracy_score(labels,p)*100),"%")

Getting an accuracy of 44% which is better than most of the classifiers that I tried on the tfidf matrix.

Visualizing the NMF model: There is a clear separation between the topics, and they all make perfect sense when we examine the words in each topic.

In [None]:
NMF_filename = 'https://github.com/tulip-lab/mds/raw/master/Jupyter/image/NMF_clusters.png'

In [None]:
from IPython.display import Image
Image(NMF_filename, width=1000)

In [None]:
pyLDAvis.sklearn.prepare(nmf,vectorized_tfidf,tfidf_model)

<a id = "Default LDA"></a>

## <span style="color:#0b486b">Default LDA</span>

In [None]:
# NOTE: "LDA" in this cell is LinearDiscriminantAnalysis, a classifier fitted
# on the category labels - not the LatentDirichletAllocation topic model
# fitted earlier.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Hold out half of the TF-IDF rows for testing; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(vectorized_tfidf, labels, test_size=.5,random_state=0)
lda_ =  LinearDiscriminantAnalysis()
# The training matrix is converted with .toarray() before fitting
lda_.fit(X_train.toarray(),y_train)
y_pred = lda_.predict(X_test)
# Accuracy on the held-out half
accuracy_score(y_test,y_pred)

In [None]:
# One-letter labels and their display names, in matching order. Passing the
# label order explicitly keeps the matrix 5x5 and aligned with the names even
# if a class is missing from the test split.
label_order = ["b", "e", "g", "s", "t"]
class_names = ["business","entertain","global","science","tech"]

# Rows are the true classes, columns are the predicted classes
conf = pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=y_pred, labels=label_order),
                    index=class_names, columns=class_names)
# Escaped backslash: the displayed name is still True\Predicted
conf.columns.name = "True\\Predicted"
conf

<a id = "ROC "></a>

## <span style="color:#0b486b">ROC </span>

In [None]:
# One indicator column per numeric class code
y = label_binarize(num_label, classes=[0, 1, 2, 3, 4])
n_classes = y.shape[1]
X_train, X_test, y_train, y_test = train_test_split(df.speeches, y, test_size=.5,random_state=0)

# The vocabulary is learnt from the training half only
cv = CountVectorizer()
cv.fit(X_train)
X_train = cv.transform(X_train)
X_test = cv.transform(X_test)

# One LinearDiscriminantAnalysis model per class
classifier = OneVsRestClassifier(LinearDiscriminantAnalysis())
classifier.fit(X_train.toarray(), y_train)
y_score = classifier.predict_proba(X_test)

# ROC curve and area under it for every class ...
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = metrics.roc_curve(y_test[:,i], y_score[:, i])
    roc_auc[i] = metrics.auc(fpr[i], tpr[i])
# ... and for all (document, class) pairs flattened together ("micro")
fpr["micro"], tpr["micro"], _ = metrics.roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = metrics.auc(fpr["micro"], tpr["micro"])

plt.figure()
micro_label = 'micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"])
plt.plot(fpr["micro"], tpr["micro"], label=micro_label)
for i in range(n_classes):
    class_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], label=class_label)

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("Linear Discriminant Analysis")
plt.legend(bbox_to_anchor=(0, 2), loc='upper left', ncol=1)
plt.show()

----------------------

Trying to predict number of views using number of pauses, laughters, applauses, sentiment :

In [None]:

# Predictors: the five per-talk EDA features; target: the view counts
X = df_eda[["pause","questions","laughter","applause","sentiment"]].values
y = np.array(df_eda.views)

# NOTE(review): X is passed to OLS as-is and no constant column is added in
# this cell - presumably the model is fitted without an intercept; confirm
# that this is intended.
model = sm.OLS(y.astype(int), X)
results = model.fit()
results.summary()