In [None]:
# Import Required Libraries
import pandas as pd
import string
import re
import nltk
import gensim
import itertools  
import collections
import seaborn as sns
# pd.set_option('display.max_colwidth', 100)

In [None]:
# Load NLTK's English stop-word list.
# On a fresh environment the corpus is not installed and the lookup raises
# LookupError, so fetch it once and retry instead of failing the whole run.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# Load the collected tweets from disk and preview the resulting frame.
tweets_csv_path = 'Collected_tweet.csv'
df_tweet = pd.read_csv(tweets_csv_path)
df_tweet

In [None]:
# Preview the frame without the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv; verify).
# The result is only displayed: df_tweet itself still holds the column.
df_tweet.drop(columns=['Unnamed: 0'])

# Data Cleaning

In [None]:
def clean_text(text):
    """Strip punctuation from ``text``, tokenize it, and drop stop words.

    Parameters
    ----------
    text : str
        Text to clean. Stop-word matching is an exact, case-sensitive
        comparison against the module-level ``stopwords`` list.

    Returns
    -------
    list of str
        The remaining tokens in their original order. Empty tokens are
        never returned.
    """
    # Remove every character listed in string.punctuation.
    no_punct = "".join(char for char in text if char not in string.punctuation)
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' when the text is empty or starts/ends with a
    # separator; skip those so they never reach the dictionary or the corpus.
    return [word for word in tokens if word and word not in stopwords]

# Lowercase each tweet, clean it, and store the token lists in a new column.
df_tweet['tweets_nostop'] = df_tweet['tweets'].map(lambda tweet: clean_text(tweet.lower()))

# Display-only view without the 'Unnamed: 0' column (nothing is reassigned).
df_tweet.drop(columns=['Unnamed: 0'])

# Stemming the Tweets

In [None]:
# Porter stemmer instance used by the stemming() helper.
ps = nltk.PorterStemmer()

In [None]:
def stemming(tokenized_text):
    """Return the stem of every token in ``tokenized_text``, in order.

    Each token is passed through the module-level stemmer ``ps``.
    """
    return list(map(ps.stem, tokenized_text))

# Stem every token list produced by the cleaning step.
df_tweet['tweets_stemmed'] = df_tweet['tweets_nostop'].apply(stemming)

# Display-only view without the 'Unnamed: 0' column (nothing is reassigned).
df_tweet.drop(columns=['Unnamed: 0'])

In [None]:
# df_tweet = pd.DataFrame({'text':df_tweet['tweets_lemmatized'], 'index':df_tweet.index})
# df_tweet

In [None]:
# Build the id <-> token mapping from the stemmed tweets.
dictionary = gensim.corpora.Dictionary(df_tweet['tweets_stemmed'])

# Print at most the first 251 (id, token) pairs as a quick sanity check.
for k, v in itertools.islice(dictionary.iteritems(), 251):
    print(k, v)

In [None]:
# Filter the words: drop tokens that appear in fewer than 15 documents or in
# more than 50% of documents, then keep at most the 100000 most frequent ones.
# Note: this modifies `dictionary` in place.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [None]:
# Convert each stemmed tweet into its bag-of-words vector.
bow_corpus = list(map(dictionary.doc2bow, df_tweet['tweets_stemmed']))
# Inspect the vector of sample document 500.
bow_corpus[500]

In [None]:
# Spell out the bag-of-words vector of sample document 500 in readable form.
bow_doc_500 = bow_corpus[500]
for token_id, token_count in bow_doc_500:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], token_count))

In [None]:
from gensim import corpora, models
from pprint import pprint

# Fit a TF-IDF model on the bag-of-words corpus and apply it to that corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Pretty-print only the first TF-IDF document as a sanity check.
for first_doc in itertools.islice(corpus_tfidf, 1):
    pprint(first_doc)

# LDA with Bag of Words

In [None]:
# Train LDA (100 topics, 2 passes) on the raw bag-of-words counts.
# random_state seeds the model so topics are repeatable across kernel
# restarts (NOTE(review): multi-worker training may still vary slightly).
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=100,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=42,
)

In [None]:
# List every topic of the bag-of-words model with its top words.
all_bow_topics = lda_model.print_topics(-1)
for topic_id, topic_words in all_bow_topics:
    print('Topic: {} \nWords: {}'.format(topic_id, topic_words))

# Evaluate the model

In [None]:
# Show the stemmed tokens of the tweet at index label 300 for inspection.
df_tweet['tweets_stemmed'][300]

# LDA with TF-IDF

In [None]:
# Train LDA (25 topics, 2 passes) on the TF-IDF weighted corpus.
# random_state seeds the model so topics are repeatable across kernel
# restarts (NOTE(review): multi-worker training may still vary slightly).
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=25,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=42,
)
# List every topic of the TF-IDF model with its top words.
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

# Evaluate the model

In [None]:
# Rank the TF-IDF model's topic assignments for sample document 300, best first.
# NOTE(review): the model was trained on corpus_tfidf but is queried here with
# a raw bag-of-words vector; confirm this is intended.
topic_scores = lda_model_tfidf[bow_corpus[300]]
for index, score in sorted(topic_scores, key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

# Building the word cloud

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [None]:
# Draw one word cloud per bag-of-words LDA topic, with words sized by their
# topic weight.
# The (word, weight) pairs are passed as frequencies instead of as
# str(list_of_pairs): stringifying throws the weights away (every word occurs
# exactly once in the text) and feeds the repr of the weights into the cloud;
# with NumPy 2 scalar reprs that adds tokens such as "np" and "float32".
# Per the wordcloud docs, `stopwords` is ignored by generate_from_frequencies;
# the tokens were already stop-word filtered during cleaning.
for i in range(lda_model.num_topics):
    topic_weights = {word: float(weight) for word, weight in lda_model.show_topic(i, 250)}
    wordcloud = WordCloud(width=400, stopwords=stopwords, height=200, max_font_size=20, max_words=200, collocations=False,
                          background_color='black').generate_from_frequencies(topic_weights)
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Masking the word cloud with image

In [None]:
import numpy as np
from PIL import Image

# Topic to render. Pinned explicitly to the last topic (the value a preceding
# `for i in range(lda_model.num_topics)` loop would leave behind) so this cell
# does not depend on a loop variable leaked from another cell.
masked_topic_id = lda_model.num_topics - 1

# Load the mask image; the context manager closes the file handle once the
# pixel data has been copied into the array.
with Image.open('tesla.jpg') as mask_image:
    mask = np.array(mask_image)

# Pass the (word, weight) pairs as frequencies so word size reflects topic
# weight; str(list_of_pairs) would discard the weights and feed their repr
# into the cloud. Per the wordcloud docs, width/height are ignored when a mask
# is given, and `stopwords` is ignored by generate_from_frequencies.
topic_weights = {word: float(weight) for word, weight in lda_model.show_topic(masked_topic_id, 250)}
wordcloud = WordCloud(width=1600, mask=mask, stopwords=stopwords, height=800, max_font_size=200, max_words=50,
                      collocations=False).generate_from_frequencies(topic_weights)

# Side-by-side figure: original mask image (left) and generated cloud (right).
f = plt.figure(figsize=(50,50))
f.add_subplot(1,2, 1)
plt.imshow(mask, cmap=plt.cm.gray, interpolation='bilinear')
plt.title('Original Image', size=40)
plt.axis("off")
f.add_subplot(1,2, 2)
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Generated Word Cloud', size=40)
plt.axis("off")
plt.show()

In [None]:
# Gather the top words of every bag-of-words topic into one flat list
# (a word appears once per topic that lists it).
word_cloud = [
    word
    for topic_id in range(lda_model.num_topics)
    for word in dict(lda_model.show_topic(topic_id))
]

In [None]:
# Wrap the collected topic words in a single-column frame and preview it.
word_cloud_df = pd.DataFrame(word_cloud, columns=['Words'])
word_cloud_df

In [None]:
# Persist the collected topic words to WC_Data.csv
# (with to_csv's defaults the DataFrame index is written as well).
word_cloud_df.to_csv('WC_Data.csv')

In [None]:
# Lowercase every collected word and keep the result as a plain list.
lower_case = word_cloud_df['Words'].str.lower().tolist()
lower_case

In [None]:
# Preview the unique words. The set is only displayed, not stored:
# `lower_case` itself still contains duplicates.
set(lower_case)

In [None]:
# Split every (lowercased) entry on whitespace, giving one word list per entry.
words_in_tweet = list(map(lambda entry: entry.lower().split(), lower_case))
# Peek at the first two word lists.
words_in_tweet[:2]

# A bar chart showing the top 5 topics for any Twitter handle of your choice

In [None]:
# Flatten the per-entry word lists into one list of words.
all_words = list(itertools.chain.from_iterable(words_in_tweet))

# Tally how often each word occurs.
counts = collections.Counter()
counts.update(all_words)

# Show the 15 most frequent words.
counts.most_common(15)

In [None]:
# Statistical view: the five most frequent words and their counts as a table.
top_five = counts.most_common(5)
clean_tweets = pd.DataFrame(top_five, columns=['words', 'count'])

clean_tweets.head()

In [None]:
# Graphical view: horizontal bar chart of the five most frequent words.
sns.set(rc={'figure.figsize': (12, 8), "axes.titlesize": 20, "axes.labelsize": 20})
plot = sns.barplot(data=clean_tweets, x="count", y="words", palette="bright")
plot.set(xlabel='Count', ylabel='Words', title='Top 5 Topics')
plt.show()