# Final Project Example

## Retrieve/Scrape the Data

In [None]:
import requests
import json
import pandas as pd

import os

# SECURITY: a bearer token committed to a notebook is readable by anyone who can see the file.
# Prefer setting the TWITTER_BEARER_TOKEN environment variable; the literal below is kept only as a
# fallback so the notebook keeps working, and it should be revoked/rotated and then removed.
bearer_token = os.environ.get(
  'TWITTER_BEARER_TOKEN',
  'AAAAAAAAAAAAAAAAAAAAAID7MAEAAAAA5p8qahzG1zAHFQYYM7ACWUO3%2FWU%3DrfOMy0JdqDPZ66fgHcl5Dfdfr0OanE3weJlMy2J6eu98P9Khf3')
headers = {'Authorization': f'Bearer {bearer_token}'}

# In this example, only those tweets with photos/images are stored

n = 5 #500                          # The total number of tweets we want
max_results = 100                 # The number of tweets to pull per request; must be between 10 and 100
next_token = ""                   # Must be empty on first iteration
search_term = "covid"  # To form an advanced query, see here: https://twitter.com/search-advanced?lang=en
since_id = "1504999000000000000"  # The id of the oldest tweet you want to retrieve

# Create the empty DataFrame with the columns you want; 'id' becomes the index, so each stored row
# supplies values for the remaining 13 columns in this order
df = pd.DataFrame(columns=['id', 'created_at', 'retweets', 'likes', 'replies', 'quotes', 'user_followers', 'user_following', 'user_listed', 'user_tweets', 'has_media', 'url', 'lang', 'text'], dtype=object)
df.set_index('id', inplace=True)



In [None]:
total_retrieved = 0               # To keep track of when to stop

# stop when we have n results
while total_retrieved < n:

  # Build the request URL; the next_token parameter is only needed after the first page
  url = f'https://api.twitter.com/2/tweets/search/recent?query={search_term}&max_results={max_results}&since_id={since_id}'
  if next_token != "":
    url += f'&next_token={next_token}'

  # These are the extra parameters we will add to the querystring; we won't store them all though; just want you to see what's possible
  url += '&user.fields=id,public_metrics'
  url += '&tweet.fields=attachments,public_metrics,text,created_at,author_id,lang'
  url += '&expansions=attachments.media_keys,author_id'
  url += '&media.fields=media_key,type,url'

  # make the request to the Twitter API Recent Search endpoint
  response = requests.request("GET", url, headers=headers)
  try:
    json_data = json.loads(response.text)
  except ValueError:
    # The body was not JSON (json.JSONDecodeError is a ValueError). Show it and stop: continuing
    # would either hit an undefined json_data or silently reuse the previous page.
    print(response.text)
    break

  # Error checking; print the response and stop if valid data is not retrieved.
  # Retrying the identical request (same next_token) would loop forever on the same error.
  if 'data' not in json_data:
    print(json.dumps(json_data, indent=2, sort_keys=True))
    break

  # Index the expanded users and media once per page so each tweet is a direct lookup
  includes = json_data.get('includes', {})
  authors_by_id = {author['id']: author for author in includes.get('users', [])}
  media_by_key = {media['media_key']: media for media in includes.get('media', [])}

  for tweet in json_data['data']:
    if total_retrieved >= n:      # do not collect more than the n tweets requested
      break

    # Store the data into variables
    tweet_id = tweet['id']
    created_at = tweet['created_at']
    retweet_count = tweet['public_metrics']['retweet_count']
    like_count = tweet['public_metrics']['like_count']
    reply_count = tweet['public_metrics']['reply_count']
    quote_count = tweet['public_metrics']['quote_count']
    lang = tweet['lang']
    text = tweet['text']

    # Author metrics stay empty when the author is unknown, rather than inheriting the previous tweet's author
    user_followers = ""
    user_following = ""
    user_listed = ""
    user_tweets = ""
    author_id = tweet.get('author_id')
    if author_id is None:
      print(tweet)
    author = authors_by_id.get(author_id)
    if author is not None:
      user_followers = author['public_metrics']['followers_count']
      user_following = author['public_metrics']['following_count']
      user_listed = author['public_metrics']['listed_count']
      user_tweets = author['public_metrics']['tweet_count']

    # Find out if there is media; only the first attachment is considered
    media_keys = tweet.get('attachments', {}).get('media_keys', [])
    if not media_keys:
      continue
    media = media_by_key.get(media_keys[0])
    if media is None or media['type'] != 'photo':   # Only photos are kept; ignore videos
      continue

    has_media = True
    image_url = media.get('url', '')

    # Only collect english tweets (to aid the natural language processing) that include a .jpg photo
    if (lang == 'en') and (image_url.split('.')[-1] == 'jpg'):
      total_retrieved += 1
      df.loc[tweet_id] = [created_at, retweet_count, like_count, reply_count, quote_count, user_followers, user_following, user_listed, user_tweets, has_media, image_url, lang, text]

  # keep track of where to start next time, but quit if there are no more results
  #  total_retrieved += json_data['meta']['result_count'] # Use this when you have no other criterion for which tweets to keep
  next_token = json_data.get('meta', {}).get('next_token', "")
  if next_token == "":
    break

  print(f'{total_retrieved}, ', end='') # This simply shows something in the output so that we know the loop is running

# Parse out the date into potentially useful features
df['created_at'] = pd.to_datetime(df['created_at'])
df['Weekday'] = df['created_at'].dt.day_name()
df['DayOfWeek'] = df['created_at'].dt.dayofweek
df['Hour'] = df['created_at'].dt.hour
df.to_csv('twitter.csv')
df.head()

## Exploratory Data Analysis (i.e. Data Understanding Phase)
### Begin with univariate analyses

In [None]:
import pandas as pd
# Reload the scraped tweets; the id is stored as object dtype so it is handled as an identifier, not a number
df = pd.read_csv('twitter.csv').astype({'id': 'object'})
print(df.shape)
df.head()

In [None]:
# Remove the timestamp column in place, then show summary statistics for the remaining columns
df.drop(columns=['created_at'], inplace=True)
df.describe()

In [None]:
# Skewness is only defined for numeric data; select those columns explicitly because recent pandas
# versions raise on text columns instead of silently dropping them
df.select_dtypes(include=['number', 'bool']).skew()


  
### Continue with bivariate analyses

In [None]:
# Create a heatmap over a correlation table

import seaborn as sns
sns.set_style("whitegrid")
import numpy as np
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 9))

# Correlate only numeric/boolean columns (recent pandas versions raise on text columns) and compute
# the table once so the mask and the heatmap are built from the same values
corr = df.select_dtypes(include=['number', 'bool']).corr()

# Mask the upper triangle (including the diagonal) so each pair of columns is shown once
matrix = np.triu(corr)
sns.heatmap(corr, annot=True, fmt='.2f',
            vmin=-1, vmax=1, center=0, cmap='coolwarm',
            mask=matrix, square=True);

In [None]:
# List the dtype pandas assigned to every column
df.dtypes

In [None]:
# Scatter plot of likes against quotes with each variable's distribution on the margins
sns.set(color_codes=True)
sns.set_style("whitegrid", {'axes.grid' : False})
sns.jointplot(x='likes', y='quotes', data=df);

In [None]:
# Compare retweets between tweets with and without media (one bar per has_media value)
sns.barplot(data=df, x="has_media", y="retweets");

# Text Processing

In [None]:
!pip install --upgrade pip
!pip install pyLDAvis
!pip install pyLDAvis.gensim
!pip install bokeh
!pip install gensim
!pip install spacy
!pip install logging
!pip install wordcloud
!pip install warnings
!pip install matplotlib
!pip install nltk
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm
!pip install -U seaborn
!pip install translators --upgrade
#!conda install -c conda-forge pyldavis

## Translate Non-English Tweets to English

In [None]:
import pandas as pd
# Reload the scraped tweets (id kept as object dtype) and count how many tweets there are per language
df = pd.read_csv('twitter.csv').astype({'id': 'object'})
print(df.shape)
df['lang'].value_counts()

In [None]:
def translate(text):
  """Translate `text` with the first translation service that returns a result.

  The services are tried in this order: Alibaba, Baidu, Google, Bing. A service that raises is
  skipped and the next one is tried, so a single outage does not abort the whole run.

  Args:
    text: The string to translate.

  Returns:
    The first non-empty translation, or "" when every service failed.
  """
  import translators as ts

  # Each entry defers the call so that a service is only contacted when the ones before it failed
  providers = (
    lambda: ts.alibaba(text, professional_field='general'),  # ("general","message","offer")
    lambda: ts.baidu(text, professional_field='common'),     # ('common','medicine','electronics','mechanics')
    lambda: ts.google(text, if_use_cn_host=True),
    lambda: ts.bing(text, if_use_cn_host=False),
  )

  for provider in providers:
    try:
      translated = provider()
    except Exception:
      # Best effort: network/service errors simply move on to the next provider
      continue
    if translated:
      return translated

  return ""

# List the tweets that are not in English (language code, then text). Columns are read by name so
# the cell keeps working if the column order of the CSV changes.
for row in df.itertuples():
  if row.lang != 'en':
    # To store the translation instead of just printing: df.at[row.Index, 'text'] = translate(row.text)
    print(row.lang, row.text)

In [None]:
# Overwrite twitter.csv with the current DataFrame; index=False leaves the row index out of the file
df.to_csv('twitter.csv', index=False)

### Generate Stop Words List

In [None]:
import sys
import re
import numpy as np
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings, en_core_web_sm
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus and start from its English list
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Extend the list with extra terms to exclude (includes the search term 'covid' itself)
stop_words.extend(['from', 'subject', 'co', 'https', 'http', 'twitter', 'amp', 'covid', 'gofundme']) # After reviewing the LDA, return to add words that you want to eliminate

### Tokenize and Clean
Remove line breaks, single quotes, email addresses.
Use Gensim's simple_preprocess to hash/tokenize each string

In [None]:
def sent_to_words(sentences):
    """Yield one list of cleaned tokens per input string.

    Each string has e-mail-like tokens removed, runs of whitespace collapsed to a single
    space and single quotes dropped; it is then tokenized with gensim's
    simple_preprocess (deacc=True).
    """
    # (pattern, replacement) pairs, applied in this order
    substitutions = (
        ('\\S*@\\S*\\s?', ''),  # remove emails
        ('\\s+', ' '),          # remove newline chars
        ("\\'", ""),            # remove single quotes
    )
    for raw in sentences:
        cleaned = raw
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        yield gensim.utils.simple_preprocess(str(cleaned), deacc=True)

# Tokenize every tweet's text and keep the token lists both in a master list and on the DataFrame
data = df['text'].tolist()
data_words = [words for words in sent_to_words(data)]

# print the first :n tweet word lists
for tweet in data_words[:5]:
  print(tweet)

df['words'] = data_words
df.head()

In [None]:
# Corpus size = total number of characters across all tokens. Joining once is linear in the corpus
# size, whereas appending token by token to a string can be quadratic.
length = ''.join(word for tweet_words in data_words for word in tweet_words)
print(f'Corpus size: {str(len(length))}')

### Add Bigrams, Trigrams, and Stem

In [None]:
# Build the bigram and trigram models
# The trigram model is trained on the bigram-transformed tweets, so it can join a detected bigram with a third word
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
# Phraser wraps each trained model for applying it to token lists (used by process_words below)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

Remove stop words, add bigrams and trigrams, and perform lemmatization/stemming.

In [None]:
def process_words(texts, stop_words=stop_words, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Remove stop words, form bigrams/trigrams and lemmatize.

    Args:
        texts: Iterable of documents, each a list of tokens.
        stop_words: Words to discard, both before and after lemmatization.
        allowed_postags: spaCy coarse part-of-speech tags to keep; tokens with any other tag are dropped.

    Returns:
        A list with one list of lemmas per input document.
    """
    # Sets make the per-token membership tests constant time
    stop_set = set(stop_words)
    allowed = set(allowed_postags)

    def _drop_stop_words(docs):
        """Re-tokenize each document and discard stop words."""
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in docs]

    texts = _drop_stop_words(texts)
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    # Load spacy, but we don't need the parser or NER (named entity extraction) modules
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    texts_out = [[token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed]
                 for sent in texts]

    # remove stopwords once more after lemmatization (a lemma can itself be a stop word)
    return _drop_stop_words(texts_out)

# Clean/lemmatize every tweet and preview the first five results
data_ready = process_words(data_words)
for tweet in data_ready[:5]:
  print(tweet)

# Replace the raw token lists stored earlier in 'words' with the processed ones
df['words'] = data_ready

## **LDA Topic Model: Tweet Text**

### Build an LDA 
Build Latent Dirichlet Allocation model for detecting the top n topics in the corpus

In [None]:
def lda(data_ready=None, id2word=None, corpus=None, start=2, iterations=10, every=2):
  """Train LDA models for several topic counts and return the one with the best coherence.

  Topic counts tried are start, start + every, ..., i.e. exactly `iterations` candidates.
  A table of perplexity and c_v coherence is printed for every candidate, followed by the
  topics of the selected model.

  Args:
    data_ready: Tokenized documents (list of token lists); used for the coherence score.
    id2word: gensim Dictionary mapping token ids to tokens.
    corpus: Bag-of-words corpus built from `data_ready` with `id2word`.
    start: Smallest number of topics to try.
    iterations: How many topic counts to try.
    every: Step between consecutive topic counts.

  Returns:
    The trained LdaModel with the highest coherence (the first one on ties).

  Raises:
    ValueError: If no candidate is produced (e.g. iterations < 1).
  """
  best_model = None
  best_coherence = None

  print('Topics\tPerplexity\tCoherence')
  for topics in range(start, start + iterations * every, every):
    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=topics, random_state=100,
                                                update_every=1, chunksize=20, passes=20, alpha='symmetric',
                                                iterations=500,per_word_topics=True)

    # Compute LDA metrics (coherence is expensive, so it is computed once and reused)
    coherence = CoherenceModel(model=lda_model, texts=data_ready, dictionary=id2word, coherence='c_v').get_coherence()
    print(f'{topics}\t{round(lda_model.log_perplexity(corpus), 4)}\t\t{round(coherence, 4)}')

    # Keep the best model seen so far instead of retraining it afterwards. `x != x` is only true
    # for NaN, so a NaN best score is replaced by any later candidate.
    if best_model is None or best_coherence != best_coherence or coherence > best_coherence:
      best_model, best_coherence = lda_model, coherence

  if best_model is None:
    raise ValueError('lda() needs at least one candidate topic count; check start/iterations/every')

  pprint(best_model.print_topics())
  return best_model


# Create Dictionary
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

# Try 9 topic counts starting at 2 in steps of 1 and keep the model lda() returns
lda_model = lda(data_ready, id2word, corpus, start=2, iterations=9, every=1)
# Number of topics in the selected model; reused below to size the plot grids
num_topics = len(lda_model.get_topics())

### Dominant Topic
What is the Dominant topic and its % contribution in each tweet?

In LDA models, each document is composed of multiple topics. But, typically only one of the topics is dominant. The below code extracts this dominant topic for each sentence and shows the weight of the topic and the keywords in a nicely formatted output.

This way, you will know which document belongs predominantly to which topic.

In [None]:
def store_topic_scores(ldamodel=None, corpus=None, texts=data, df=df):
  """Add per-topic scores and the dominant topic of every document to `df`.

  Columns created (or reset): 'Dominant_topic' (1-based topic number), 'Dominant_score' and one
  'topic_<k>' column per topic. Documents are matched to rows by position, so `texts` must be in
  the same order as the rows of `df`.

  Args:
    ldamodel: Trained LdaModel built with per_word_topics=True; its own id2word dictionary is
      used to convert each document to bag-of-words.
    corpus: Unused; kept for backward compatibility.
    texts: Tokenized documents, one list of tokens per row of `df`.
    df: DataFrame to annotate; it is modified in place.

  Returns:
    The same DataFrame, with the new columns filled in.
  """
  # Create the new, zeroed columns to store the topic scores, dominant topic, and dominant topic score
  num_topics = len(ldamodel.get_topics())
  topic_columns = [f'topic_{col + 1}' for col in range(num_topics)]
  df['Dominant_topic'] = 0
  df['Dominant_score'] = 0.0
  for col in topic_columns:
    df[col] = 0.0

  # Resolve column positions once; rows are then always addressed by position (never by label)
  dominant_topic_pos = df.columns.get_loc('Dominant_topic')
  dominant_score_pos = df.columns.get_loc('Dominant_score')
  topic_positions = [df.columns.get_loc(col) for col in topic_columns]

  # Store the topic score and dominant topic
  for i, words in enumerate(texts):
    # First element of the model output: (topic_id, score) pairs for this document
    topic_scores = ldamodel[ldamodel.id2word.doc2bow(words)][0]
    if not topic_scores:
      continue  # nothing to record; the row keeps its zeros

    for topic_id, score in topic_scores:
      df.iat[i, topic_positions[topic_id]] = score

    # Low-probability topics are omitted from topic_scores, so the dominant topic must be read
    # from the topic id of the best pair, not from that pair's position in the list
    dominant_id, dominant_score = max(topic_scores, key=lambda pair: pair[1])
    df.iat[i, dominant_topic_pos] = dominant_id + 1
    df.iat[i, dominant_score_pos] = dominant_score

  return df

In [None]:
# Score every processed tweet against the selected model and save the enriched data set
df = store_topic_scores(lda_model, corpus, data_ready)
df.to_csv(f'twitter_with_LDA.csv', index=False)
df.head()

## **Visualize the LDA Topics**

### Frequency Distribution
How many words are in each tweet? When working with a large number of tweets, you want to know how big the tweets are as a whole and by topic. Let’s plot the distribution of tweet word counts.

In [None]:
# Number of processed words in each tweet
doc_lens = [len(d) for d in df.words]

import seaborn as sns
sns.set_style("whitegrid", {'axes.grid' : False})

plt.figure(figsize=(18,7), dpi=100)
# histplot with a density-scaled histogram plus KDE replaces the deprecated distplot;
# the y axis stays a density, which the text positions below rely on
sns.histplot(doc_lens, kde=True, stat='density', kde_kws=dict(cut=3))
plt.text(-7, .048, "Mean   : " + str(round(np.mean(doc_lens), 2)))
plt.text(-7, .046, "Median : " + str(round(np.median(doc_lens), 2)))
plt.text(-7, .044, "Stdev   : " + str(round(np.std(doc_lens), 2)))
plt.text(-7, .042, "1%ile    : " + str(round(np.quantile(doc_lens, q=0.01), 2)))
plt.text(-7, .040, "99%ile  : " + str(round(np.quantile(doc_lens, q=0.99), 2)))

# The plotted quantity is a density, not a count
plt.gca().set(ylabel='Density', xlabel='Tweet Word Count')
plt.tick_params(size=16)
plt.xticks(np.linspace(0,27,28))
plt.title('Distribution of Tweet Word Counts', fontdict=dict(size=22))
plt.show()

### Word Counts by Dominant Topic

In [None]:
import matplotlib.colors as mcolors
import math
cols = [color for name, color in mcolors.XKCD_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS' or 'mcolors.CSS4_COLORS'

# Smallest square grid that holds one panel per topic; squeeze=False keeps `axes` a 2-D array
grid_size = math.ceil(num_topics**(1/2))
fig, axes = plt.subplots(grid_size, grid_size, figsize=(16,14), dpi=160, sharex=True, sharey=True, squeeze=False)

for i, ax in enumerate(axes.flatten()):
    if i >= num_topics:
        ax.axis('off')  # the square grid can have more cells than there are topics
        continue

    df_sub = df.loc[df.Dominant_topic == (i + 1), :]
    doc_lens = [len(d) for d in df_sub.words]
    ax.hist(doc_lens, color=cols[i])
    ax.tick_params(axis='y', labelcolor=cols[i], color=cols[i])
    # A density estimate needs at least two distinct values
    if len(set(doc_lens)) > 1:
        # fill / bw_method are the current names of the deprecated shade / bw arguments
        sns.kdeplot(doc_lens, color="black", fill=False, ax=ax.twinx(), bw_method=1.5)
    ax.set(xlim=(0, 28), xlabel='')
    ax.set_ylabel('Number of Documents', color=cols[i])
    ax.set_title('Topic: '+str(i + 1), fontdict=dict(size=16, color=cols[i]))

fig.tight_layout()
fig.subplots_adjust(top=0.90)
plt.xticks(np.linspace(0,27,28))
fig.suptitle('Distribution of Document Word Counts by Dominant Topic', fontsize=22)
plt.show()

### Clouds of Top N Keywords
Update the max_words variable below to include more or fewer words per cloud. The coloring of the topics is used in subsequent visualizations.

In [None]:
# 1. Wordcloud of Top N words in each topic
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.XKCD_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS', fewer colors: 'mcolors.TABLEAU_COLORS'

# color_func reads the loop variable `i` at draw time, so every cloud is drawn in the color of
# the topic currently being plotted
cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=20,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(math.ceil(num_topics**(1/2)), math.ceil(num_topics**(1/2)), figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    ax.axis('off')
    if i >= len(topics):
        continue  # the square grid can have more cells than there are topics; leave them blank

    # topics[i] is (topic_id, [(word, weight), ...]); the weights drive the word sizes
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i+1), fontdict=dict(size=16))


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

### Topic Keywords Counts
When it comes to the keywords in the topics, the importance (weights) of the keywords matters. Along with that, how frequently the words have appeared in the tweets is also interesting to see.

We will plot the word counts and the weights of each keyword in the same chart.

Look for words that occur in multiple topics and the ones whose relative frequency is more than the weight. Often such words turn out to be less important.

In [None]:
# Bar chart of word counts for each topic
from collections import Counter
topics = lda_model.show_topics(formatted=False)

# How often every processed word occurs in the whole corpus
counter = Counter(w for w_list in data_ready for w in w_list)

# One row per (topic, keyword): the keyword's weight in the topic and its corpus-wide count.
# Stored under its own name so the tweet DataFrame `df` is not overwritten.
df_keywords = pd.DataFrame(
    [[word, i + 1, weight, counter[word]] for i, topic in topics for word, weight in topic],
    columns=['word', 'topic_id', 'importance', 'word_count'])

# Plot Word Count and Weights of Topic Keywords
fig, axes = plt.subplots(math.ceil(num_topics**(1/2)), math.ceil(num_topics**(1/2)), figsize=(20,20), sharey=True, dpi=160)
cols = [color for name, color in mcolors.XKCD_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    topic_rows = df_keywords.loc[df_keywords.topic_id==i+1, :]
    if topic_rows.empty:
        ax.axis('off')  # surplus grid cell without a topic
        continue

    color = cols[i]  # same color per topic as the other topic charts
    ax.bar(x='word', height="word_count", data=topic_rows, color=color, width=0.5, alpha=0.3, label='Word Count')
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows, color=color, width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=color)
    # ax_twin.set_ylim(0, 0.030); ax.set_ylim(0, 3500)
    ax.set_title('Topic: ' + str(i + 1), color=color, fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_rows['word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper center'); ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)    
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)    
plt.show()

### Sentence Chart Colored 
Each word in a tweet is attributed to one of the LDA topics. You can color each word in a given tweet by the id of the topic it is attributed to.
The color of the enclosing rectangle is the topic assigned to the tweet.

In [None]:
# Sentence Coloring of N Tweets
from matplotlib.patches import Rectangle

def sentences_chart(lda_model=lda_model, corpus=corpus, start = 0, end = 13):
    """Draw one row per document with every word colored by its most relevant topic.

    The first row of the figure is left empty as spacing under the title, so documents
    corpus[start] .. corpus[end - 2] are shown. The rectangle around a row is drawn in the
    color of that document's highest-scoring topic. At most 14 words are drawn per row.

    Args:
        lda_model: Trained LdaModel built with per_word_topics=True.
        corpus: Bag-of-words corpus the documents are taken from.
        start: Index of the first document to show.
        end: Upper bound of the slice; the figure has end - start rows.
    """
    corp = corpus[start:end]
    mycolors = [color for name, color in mcolors.XKCD_COLORS.items()]

    # squeeze=False keeps `axes` indexable even when a single row is requested
    fig, axes = plt.subplots(end-start, 1, figsize=(20, (end-start)*0.95), dpi=160, squeeze=False)
    axes = axes.flatten()
    for ax in axes:
        ax.axis('off')

    for i, ax in enumerate(axes):
        # Row 0 is the spacer; rows beyond the available documents stay blank
        if i == 0 or i > len(corp):
            continue

        corp_cur = corp[i-1]
        topic_percs, wordid_topics, wordid_phivalues = lda_model[corp_cur]
        # Pair each word with its first listed topic; words without any topic are skipped
        word_dominanttopic = [(lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics if topic]
        # Number the row by the document's 1-based position in the full corpus (matches the title)
        ax.text(0.01, 0.5, "Doc " + str(start + i) + ": ", verticalalignment='center',
                fontsize=16, color='black', transform=ax.transAxes, fontweight=700)

        # Draw Rectangle in the color of the document's highest-scoring topic
        topic_percs_sorted = sorted(topic_percs, key=lambda x: (x[1]), reverse=True)
        if topic_percs_sorted:
            ax.add_patch(Rectangle((0.0, 0.05), 0.99, 0.90, fill=None, alpha=1,
                                   color=mycolors[topic_percs_sorted[0][0]], linewidth=2))

        word_pos = 0.06
        for word, topics in word_dominanttopic[:14]:
            ax.text(word_pos, 0.5, word,
                    horizontalalignment='left',
                    verticalalignment='center',
                    fontsize=16, color=mycolors[topics],
                    transform=ax.transAxes, fontweight=700)
            word_pos += .009 * len(word)  # to move the word for the next iter
        ax.text(word_pos, 0.5, '. . .',
                horizontalalignment='left',
                verticalalignment='center',
                fontsize=16, color='black',
                transform=ax.transAxes)

    plt.subplots_adjust(wspace=0, hspace=0)
    plt.suptitle('Sentence Topic Coloring for Tweets: ' + str(start + 1) + ' to ' + str(end-1), fontsize=22, y=0.95, fontweight=700)
    plt.tight_layout()
    plt.show()

# Draw the chart with the function's defaults (start=0, end=13)
sentences_chart()

### Most Common Topics
What are the most discussed topics in the tweets? We can compute the total number of tweets attributed to each topic

In [None]:
# Generate a list of the most dominant topics and then the three top keywords in each of those topics
def topics_per_document(model, corpus, start=0, end=1):
    """Return the dominant topic and the full topic distribution of corpus[start:end].

    `model[doc]` must yield a 3-tuple whose first element is a list of (topic_id, weight) pairs.

    Returns:
        A tuple (dominant_topics, topic_percentages): the first is a list of
        (position_in_slice, topic_id) pairs, the second the list of per-document
        (topic_id, weight) lists.
    """
    dominant_topics, topic_percentages = [], []
    for position, bow in enumerate(corpus[start:end]):
        weights, _word_topics, _word_phis = model[bow]
        # Highest weight first; the topic id of the first entry is the dominant topic
        ranked = sorted(weights, key=lambda pair: pair[1], reverse=True)
        dominant_topics.append((position, ranked[0][0]))
        topic_percentages.append(weights)
    return (dominant_topics, topic_percentages)

# Score every document: end=len(corpus) covers the whole corpus (end=-1 would leave out the last tweet)
dominant_topics, topic_percentages = topics_per_document(model=lda_model, corpus=corpus, end=len(corpus))

# Distribution of Dominant Topics in Each Document
# (kept under its own name so the tweet DataFrame `df` is not overwritten)
df_dominant = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df_dominant.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()

# Total Topic Distribution by actual weight: one column per topic id, summed over all documents
topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
df_topic_weightage_by_doc = topic_weightage_by_doc.sum().to_frame(name='count').reset_index()

# Top 3 Keywords for each Topic
topic_top3words = [(topic_id, word) for topic_id, keywords in lda_model.show_topics(formatted=False)
                                    for word, wt in keywords[:3]]

df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
df_top3words = df_top3words_stacked.groupby('topic_id').agg(', \n'.join)
df_top3words.reset_index(level=0,inplace=True)

In [None]:
# Two Plots:
#    Num tweets per topic by assigning the document to the topic that has the most weight in that document.
#    Num tweets per topic by summing up the actual weight contribution of each topic to respective documents.

from matplotlib.ticker import FuncFormatter

# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 4), dpi=120, sharey=True)

def _topic_tick_label(x, pos):
    """Label a tick with its 1-based topic number and top keywords; blank if x is not a topic id."""
    words = df_top3words.loc[df_top3words.topic_id==x, 'words'].values
    if len(words) == 0:
        return ''
    return 'Topic ' + str(int(x) + 1) + '\n' + words[0]

tick_formatter = FuncFormatter(_topic_tick_label)

# Topic Distribution by Tweet Topics
ax1.bar(x='Dominant_Topic', height='count', data=df_dominant_topic_in_each_doc, width=.5, color='firebrick')
# Put the ticks on the topic ids that actually have bars (a topic that is never dominant has none)
ax1.set_xticks(list(df_dominant_topic_in_each_doc['Dominant_Topic']))
ax1.xaxis.set_major_formatter(tick_formatter)
ax1.set_title('Number of Tweets by Tweet Topic', fontdict=dict(size=10))
ax1.set_ylabel('Number of Tweets')
# ax1.set_ylim(0, 1000)

# Topic Distribution by Topic Weights ('index' holds the topic ids after reset_index)
ax2.bar(x='index', height='count', data=df_topic_weightage_by_doc, width=.5, color='steelblue')
ax2.set_xticks(list(df_topic_weightage_by_doc['index']))
ax2.xaxis.set_major_formatter(tick_formatter)
ax2.set_title('Number of Tweets by Topic Weightage', fontdict=dict(size=10))

plt.show()

### t-SNE Clustering Chart
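Project each tweet's topic-weight vector into two dimensions with t-SNE and color each point by the tweet's dominant topic, so that tweets with similar topic mixes appear close together.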

In [None]:
# Get topic weights and dominant topics
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook

# Get topic weights: one {topic_id: weight} mapping per document. Keeping the ids matters because
# low-probability topics are omitted from the model output, so list positions are not topic ids.
n_topics = len(lda_model.get_topics())
topic_weights = [dict(row_list[0]) for row_list in lda_model[corpus]]

# Array of topic weights: one column per topic id, 0 where a topic was omitted
arr = pd.DataFrame(topic_weights).reindex(columns=range(n_topics)).fillna(0).values

# Keep the well separated points (optional)
arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction; perplexity must stay below the number of documents, which matters for small samples
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca',
                  perplexity=min(30, max(1, len(arr) - 1)))
tsne_lda = tsne_model.fit_transform(arr)

# Plot the Topic Clusters using Bokeh
output_notebook()
mycolors = np.array([color for name, color in mcolors.XKCD_COLORS.items()])
# width/height are the current names of the former plot_width/plot_height arguments
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics),
              width=900, height=700)
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
show(plot)

### pyLDAVis
Finally, pyLDAVis is the most commonly used and a nice way to visualise the information contained in a topic model.

In [None]:
# For some reason, pyLDAVis doesn't seem to work in this AWS kernel. It appears there is a conflict between some of the conda libraries.
# import pyLDAvis.gensim
# pyLDAvis.enable_notebook()
# viz = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
# viz

## Dataset with LDA Topic Scores

In [None]:
import pandas as pd
# Load the tweets with their LDA topic scores; the id is kept as object dtype (an identifier, not a number)
df = pd.read_csv('twitter_with_LDA.csv').astype({'id': 'object'})
print(df.shape)
df.head()

# Image Processing

## Download the Images

First, we will download all of the images we want to process and store them in S3. Let's first define a method that allows us to download an image from a URL and save it locally. The second block of code simply iterates over our data set and passes the URL and filename to the download_image function.

In [None]:
def download_image(url, name):
  """Download the image at `url` and save it as images/<name>.<extension>.

  The extension is whatever follows the last '.' in the URL. The images folder is created when
  missing. A response with an HTTP error status is reported and skipped, so an error page is
  never stored as if it were an image.

  Args:
    url: Address of the image.
    name: File name (without extension) to save the image under.
  """
  import os
  import requests

  file_type = url.split('.')[-1]
  response = requests.get(url)
  if not response.ok:
    print(f'Skipping {url}: HTTP {response.status_code}')
    return

  os.makedirs('images', exist_ok=True)
  with open(f'images/{name}.{file_type}', 'wb') as handler:
    handler.write(response.content)

In [None]:
# Download the image of every tweet that has a url, naming each file after the tweet id
for i, row in enumerate(df.itertuples()):
  if not pd.isnull(row.url):
    download_image(row.url, row.id)
    print(i, row.url, row.id)  # progress output: row position, image url, tweet id

## Save them to S3 Bucket
Now that we have all of the files stored locally, we need to upload them to an S3 bucket.

In [None]:
import glob
import boto3
import os
import pandas as pd

# Where the images go in S3: objects are stored as <prefix>/<file name> inside <bucket>
bucket = "tmeservy-mldata"    # replace this with your bucket name
prefix = "photos/twitter" # replace this with the path to your images

In [None]:
#upload the files to the S3 bucket
# Create the bucket handle once and reuse it for every file instead of opening a new session per upload
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
images = glob.glob("images/*.jpg")
for filename in images:
  # The object key is the prefix plus the local file name (without the images/ folder)
  s3_bucket.upload_file(filename, f'{prefix}/{os.path.basename(filename)}')

## Use AWS Rekognition to Scrape Entities

In [None]:
import pandas as pd
# Load the tweets with their LDA topic scores; the id is kept as object dtype (an identifier, not a number)
df = pd.read_csv('twitter_with_LDA.csv').astype({'id': 'object'})
print(df.shape)
df.head()

Initially, you may want to limit the number of rows that you are processing until you get your code working just right. You can use the following code to do so.

In [None]:
#limit number of rows for testing-if you don't want to process everything
#df=df.head(40)
#df.dtypes

In [None]:
# Create the Rekognition client used by the face-detection loop below
client = boto3.client('rekognition')

In [None]:
# Get the S3 resource and a handle on the bucket configured above
s3 = boto3.resource('s3')
my_bucket = s3.Bucket(bucket)
# Listing the stored files is not needed here (file names are built from the tweet ids instead)
#files = my_bucket.objects.filter(Prefix=prefix)

In previous classes when we processed images with Rekognition we retrieved a list of all files in S3 and then passed each file from the list to Rekognition. Here, instead, we will iterate over our dataframe and construct the name of the file we want it to process from the TweetID.

In [None]:
# One row per detected face; rows are collected in a list and turned into a DataFrame once at the end
face_columns = ['TweetID', 'PersonID', 'AgeRange-Low', 'AgeRange-High', 'Smile', 'Gender', 'Emotion', 'Emotion-Confidence']
face_rows = []
i=0

for row in df.itertuples():
    # Tweets without an image url have nothing to analyze
    if pd.isnull(row.url):
        continue

    file = f"{row.id}"
    extension = f"{row.url.split('.')[-1]}"
    print(f"Processing TweetID - {file}.{extension}")

    # Only .jpg images were uploaded to S3
    if extension != 'jpg':
        continue

    # call rekognition for this next file
    response = client.detect_faces(
        Image={
            'S3Object': {
                'Bucket': bucket,
                'Name': f"{prefix}/{file}.{extension}"
            }
        },
        Attributes=[
            'ALL',
        ]
    )

    # now add all of the facial features for every person found in the photo
    for fd in response["FaceDetails"]:
        i=i+1
        # Dominant emotion = the entry with the highest confidence (does not rely on the list order)
        dominant_emotion = max(fd["Emotions"], key=lambda emotion: emotion["Confidence"], default={})
        face_rows.append([
            file,
            i,
            fd["AgeRange"]["Low"],
            fd["AgeRange"]["High"],
            fd["Smile"]["Value"],
            fd["Gender"]["Value"],
            dominant_emotion.get("Type"),
            dominant_emotion.get("Confidence"),
        ])

# Index 1..n equals PersonID; the columns exist even when no face was found
dfFaces = pd.DataFrame(face_rows, columns=face_columns, index=range(1, len(face_rows) + 1))



In [None]:
# Summary statistics of the detected faces (describe must be called; the bare attribute is only the method object)
dfFaces.describe()

In this example we extracted information about the people that were detected in each image. Of course, we could have used other Rekognition methods to extract text or to identify different types of objects. We now need to summarize this data into a single row for each tweet.

In [None]:
dfFaceSummary = pd.DataFrame([])

#summarize stats per file
#aggregate functions include min, max, mean, count, and more.
faces_by_tweet = dfFaces.groupby('TweetID')
dfFaceSummary['Count-People'] = faces_by_tweet['PersonID'].count()
# NOTE(review): despite the 'Avg-' names, these hold the lowest low bound and the highest high bound per tweet
dfFaceSummary['Avg-AgeRange-Low'] = faces_by_tweet['AgeRange-Low'].min()
dfFaceSummary['Avg-AgeRange-High'] = faces_by_tweet['AgeRange-High'].max()
# Tweets without any smiling face are absent from the filtered count, hence the fillna(0)
dfFaceSummary['Count-Smile'] = dfFaces[dfFaces['Smile']==True].groupby('TweetID')['Smile'].count()
dfFaceSummary['Count-Smile'] = dfFaceSummary['Count-Smile'].fillna(0)
# Share of the people in the photo who are smiling
dfFaceSummary['Percent-Smile'] = dfFaceSummary['Count-Smile']/dfFaceSummary['Count-People']
dfFaceSummary['Percent-Smile'] = dfFaceSummary['Percent-Smile'].fillna(0)

# Add one count column per emotion and per gender value
dfFaceSummary = dfFaceSummary.merge(faces_by_tweet['Emotion'].value_counts().unstack().fillna(0), on='TweetID')
dfFaceSummary = dfFaceSummary.merge(faces_by_tweet['Gender'].value_counts().unstack().fillna(0), on='TweetID')

#print out 
dfFaceSummary


Now that we have summary information, we need to merge it back to our data frame and save it to a CSV file so that we can use it during modeling.

In [None]:
#the tweet id is an object, so we need to convert it to the same data type as our id
# Use an explicit 64-bit integer: tweet ids are around 1.5e18, and plain `int` can resolve to a
# 32-bit type on some platforms, which cannot hold them.
dfFaceSummary.index = dfFaceSummary.index.astype('int64')

#Now merge the frames together (default inner join: only tweets that have face data are kept)
df = df.merge(dfFaceSummary, left_on='id', right_on='TweetID')
df


In [None]:
# Persist the merged frame to a new CSV (the row index is not written)
df.to_csv(path_or_buf='twitter_with_LDA_and_image_data.csv', index=False)

# Modeling

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor # Import Decision Tree Regression algorithm
from sklearn.ensemble import GradientBoostingRegressor # Import scikit-learn's gradient boosting regressor (this is not the separate XGBoost library)
from sklearn.model_selection import train_test_split # Import train_test_split function
# For a complete list of available algorithms: https://scikit-learn.org/stable/supervised_learning.html
# Which one should I use?: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

# Load the merged tweet / LDA / image dataset from disk
df = pd.read_csv('twitter_with_LDA_and_image_data.csv')

# The tweet id is an identifier, not a quantity, so store it as an object column
df = df.astype({'id': 'object'})
print(df.shape)

# Name of the column the models will predict
label = 'likes'

In [None]:
# List every column name (quoted and comma-separated) so it is easy to pick the ones to drop:
# unique identifiers, original text before processing, image file names, dates, ...
print(''.join(f"'{col}', " for col in df.columns), end="")

# Columns that should not be used as features
drop_list = [
  'id', 'created_at', 'url', 'text', 'words', 'File', 'PersonID',
  'topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9',
]

In [None]:
# Eliminate anything remaining in the drop list. Only names that are actually present in df are
# passed to drop, so columns removed earlier cannot raise and no blanket except is needed.
columns_to_drop = [col for col in df.columns if col in drop_list]
df.drop(columns=columns_to_drop, inplace=True)



In [None]:
# Preview the first five rows of the remaining columns
df.head(n=5)
#df.dtypes

In [None]:
# Discard every row that contains at least one null value, then report how many rows are left
df.dropna(axis=0, how='any', inplace=True)
print(f'Records: {df.shape[0]}')



In [None]:
# Topic numbers are category labels, not quantities, so store the column as object
df = df.astype({'Dominant_topic': 'object'})



In [None]:
# Create dummy codes for all non-numeric features and not the label
categorical_cols = [
  col for col in df.columns
  if col != label and not pd.api.types.is_numeric_dtype(df[col])
]
if categorical_cols:
  # dtype=int keeps the dummy columns numeric. Recent pandas versions default to bool dummies,
  # which a later select_dtypes(np.number) would silently discard from the feature matrix.
  # With `columns` given, each dummy is prefixed with its source column name by default.
  df = pd.get_dummies(df, columns=categorical_cols, dtype=int)
    



In [None]:
# Eliminate columns with only one unique value (they carry no information for a model).
# The names are collected first and dropped in one call, so no blanket except is needed.
constant_cols = [col for col in df.columns if df[col].nunique() < 2]
df.drop(columns=constant_cols, inplace=True)

In [None]:
# Display the current contents of df for a visual check
df

In [None]:
# Separate the target column from the predictors

y = df[label]  # Label
# Features: everything except the label, restricted to numeric columns
X = df.drop(columns=[label]).select_dtypes(np.number)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.3,
  random_state=1,
)
X_train.head(n=5)

In [None]:
# This Linear Regression is only for model interpretation purposes

import statsmodels.api as sm

# statsmodels' OLS does not include an intercept unless one is supplied, so add a constant
# column explicitly; otherwise the fit is forced through the origin and the coefficients and
# R-squared in the summary are misleading.
X_ols = sm.add_constant(X)

# Run the multiple linear regression model on the full dataset
model = sm.OLS(y, X_ols)
results = model.fit()

# View results
print(results.summary())

In [None]:
# Build a decision tree regressor and train it on the training split (fit returns the estimator)
clf = DecisionTreeRegressor().fit(X_train, y_train)

# Generate predictions for the held-out test rows
y_pred = clf.predict(X_test)

In [None]:
# Put actual and predicted values side by side, keeping the test rows' index
output_df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
output_df.head(10)

In [None]:
# Score the predictions against the test labels.
# Complete list of scikit-learn metrics: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
from sklearn import metrics

r_squared = metrics.r2_score(y_test, y_pred)
mean_abs_error = metrics.mean_absolute_error(y_test, y_pred)
root_mean_sq_error = metrics.mean_squared_error(y_test, y_pred)**(1/2)

print(f'R squared:\t{r_squared}')
print(f'MAE:\t\t{mean_abs_error}')
print(f'RMSE:\t\t{root_mean_sq_error}')

In [None]:
# Create the gradient boosting regressor object (scikit-learn's implementation)
clr = GradientBoostingRegressor()

# Train the gradient boosting regressor.
# It must be clr that is fitted here: fitting clf would just retrain the decision tree and
# rebind clr to it, so the scores below would repeat the decision tree's results.
clr = clr.fit(X_train,y_train)

# Predict the labels for test dataset
y_pred = clr.predict(X_test)

In [None]:
# Put actual and predicted values side by side, keeping the test rows' index
output_df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
output_df.head(10)

In [None]:
# Score the latest predictions against the test labels
from sklearn import metrics

r_squared = metrics.r2_score(y_test, y_pred)
mean_abs_error = metrics.mean_absolute_error(y_test, y_pred)
root_mean_sq_error = metrics.mean_squared_error(y_test, y_pred)**(1/2)

print(f'R squared:\t{r_squared}')
print(f'MAE:\t\t{mean_abs_error}')
print(f'RMSE:\t\t{root_mean_sq_error}')

In [None]:
import pickle

# Save the model with the highest fit metric
# OPTION 1: pickle. The with-block closes the file, so the model is fully written to disk
# before the cell finishes.
with open('stored_model.sav', 'wb') as model_file:
  pickle.dump(clr, model_file)

In [None]:
# ...some time later

import pickle
import numpy as np

# OPTION 1: Using pickle
# load the model from 'stored_model.sav'
# NOTE: unpickling can execute arbitrary code, so only load model files you created or trust.
# The with-block closes the file handle once the model has been read.
with open('stored_model.sav', 'rb') as model_file:
  loaded_model = pickle.load(model_file)

# for a single prediction, enter a row of data and reshape into numpy array
# (predict expects a 2-D input: one row per case, one column per feature)
case = X_test.iloc[0]
print(f'Single prediction:\t{loaded_model.predict(np.array(case).reshape(1, -1))[0]}\n\n{case}\n')

# for a batch prediction, enter a Pandas DataFrame or a Numpy array of arrays
predictions = loaded_model.predict(X_test) 
batch_results = pd.DataFrame({'Actual':y_test, 'Predicted':predictions, 'Diff':(predictions - y_test)})
print(f'MAE:\t{batch_results.Diff.abs().mean()}\n')
batch_results.head(5)