# Twitter Ratio

This folder contains data behind the story [The Worst Tweeter In Politics Isn’t Trump](https://fivethirtyeight.com/features/the-worst-tweeter-in-politics-isnt-trump/).

`senators.csv` contains tweets from all senators collected on Oct. 19 and 20.

`BarackObama.csv` contains tweets from [@BarackObama](https://twitter.com/BarackObama) collected on Oct. 20.

`realDonaldTrump.csv` contains tweets from [@realDonaldTrump](https://twitter.com/realDonaldTrump) collected on Oct. 23.

## Senator Tweets

We'll start off by looking at the `senators.csv` dataset.

In [2]:
# The usual suspects ...
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

import logging
import string
import re
import pyLDAvis.gensim
import pandas_profiling
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import time_series as ts

# And their accomplices ...
from pyspark.sql import SparkSession
from scipy import stats
from matplotlib.ticker import FuncFormatter
from textblob import TextBlob
from gensim import corpora
from gensim import models
from gensim import similarities
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from collections import defaultdict
from collections import Counter
from sklearn.cluster import KMeans
from pprint import pprint

In [3]:
# Spark Session: getOrCreate() reuses the active session if one exists,
# otherwise it starts a new one named 'Twitter Ratio'
spark = SparkSession.builder.appName('Twitter Ratio').getOrCreate()

In [4]:
# Reading in the data
# header=True takes the column names from the first row and
# mode='DROPMALFORMED' discards rows that fail to parse. No schema is given
# or inferred, so every column (including the numeric-looking replies,
# retweets and favorites) is read as a string, as the printed schema shows.
tweets = spark.read.csv('senators.csv', header=True, mode='DROPMALFORMED', encoding="ISO-8859-1")
tweets.printSchema()

root
 |-- created_at: string (nullable = true)
 |-- text: string (nullable = true)
 |-- url: string (nullable = true)
 |-- replies: string (nullable = true)
 |-- retweets: string (nullable = true)
 |-- favorites: string (nullable = true)
 |-- user: string (nullable = true)
 |-- bioguide_id: string (nullable = true)
 |-- party: string (nullable = true)
 |-- state: string (nullable = true)



In [5]:
# Top rows: print the first 10 rows (long cell values are shown truncated)
tweets.show(10)

+--------------+--------------------+--------------------+-------+--------+---------+------------+-----------+-----+-----+
|    created_at|                text|                 url|replies|retweets|favorites|        user|bioguide_id|party|state|
+--------------+--------------------+--------------------+-------+--------+---------+------------+-----------+-----+-----+
|10/19/17 21:47|We released bipar...|https://twitter.c...|     21|     129|      533|amyklobuchar|    K000367|    D|   MN|
|10/19/17 18:48|I spoke with @Mor...|https://twitter.c...|      8|      46|      150|amyklobuchar|    K000367|    D|   MN|
|10/19/17 18:14|Lots of interest ...|https://twitter.c...|     36|     227|      932|amyklobuchar|    K000367|    D|   MN|
|10/19/17 18:04|Today's the day @...|https://twitter.c...|     17|     167|      550|amyklobuchar|    K000367|    D|   MN|
|10/19/17 16:33|.@MarkWarner, @Se...|https://twitter.c...|     31|     279|      893|amyklobuchar|    K000367|    D|   MN|
|10/19/17 15:14|

In [6]:
# Convert the Spark DataFrame to a pandas DataFrame for the text processing
# below; toPandas() collects every row into local memory.
df = tweets.toPandas()

#### Preprocessing

In [7]:
# Removing @user references and links
# Compiled once instead of on every call; strip_links runs once per tweet.
_LINK_REGEX = re.compile(r'((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)


def strip_links(text):
    '''Replaces every http(s) link in text with ', '.

    All matches are substituted in a single left-to-right pass. Replacing the
    matched links one after another with str.replace corrupts the result when
    one link is a prefix of another (e.g. 'http://a.co' and 'http://a.co/b'):
    the shorter link's replacement rewrites the start of the longer link and
    leaves its tail ('/b') behind in the text.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The text with each link replaced by ', '.
    '''
    return _LINK_REGEX.sub(', ', text)

def strip_all_entities(text):
    '''Drops @mentions and #hashtags and blanks out all other punctuation.

    Every ASCII punctuation character except '@' and '#' is turned into a
    space. The text is then split on whitespace and any word that starts with
    '@' or '#' is discarded. The surviving words are joined with single spaces.
    '''
    markers = ('@', '#')
    to_space = str.maketrans({char: ' ' for char in string.punctuation if char not in markers})
    kept = [word for word in text.translate(to_space).split() if not word.startswith(markers)]
    return ' '.join(kept)

# Clean every tweet and assign the whole column in one step. Series.apply
# calls the cleaner once per tweet; writing cell by cell with df.loc[i, 'text']
# inside a Python loop adds heavy per-write overhead on a frame this size.
df['text'] = df['text'].apply(lambda tweet: strip_all_entities(strip_links(tweet)))
df['text'].head(10)

KeyboardInterrupt: 

In [None]:
# Text corpus
def create_document_corpus(df, column):
    '''Returns the values of df[column] as a plain list, one document per entry.'''
    return list(df[column])

# Removing common words and tokenize
def remove_common_words_and_tokenize(document_corpus):
    '''Lowercases each document, splits it on whitespace and drops stop words.

    The stop list is NLTK's English stop words plus punctuation marks, single
    letters and a few tweet-specific fillers.

    The stop list is fixed up front and never extended with words taken from
    the corpus itself. Adding the tokens of the last document to the stop
    list would empty that document and delete its words from every other
    document, and would fail outright on an empty corpus.

    Parameters
    ----------
    document_corpus : iterable of str
        One string per document.

    Returns
    -------
    list of list of str
        One token list per document, in the original order. A document may
        come back as an empty list.
    '''
    stop_words = set(stopwords.words('english'))
    stop_words.update(['-', '=', '+', '*','.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']',
                       '{', '}', 'amp', 'kkk', 'hahaha', 'haha', 'ha', 'RT', 'i’m', '…', '–', 'http'])
    stop_words.update(string.ascii_lowercase)  # Including single letters

    # Documents are lowercased before the lookup, so mixed-case entries such
    # as 'RT' can only ever match in their lowercase form.
    stop_words = {word.lower() for word in stop_words}

    # Removing common words
    return [[word for word in doc.lower().split() if word not in stop_words] for doc in document_corpus]

# Removing words that appear only once
def remove_words_appearing_only_once(text_corpus):
    '''Keeps only the tokens that occur at least twice across the whole corpus.

    Occurrences are counted over all documents combined. Documents keep their
    order and may come back as empty lists.
    '''
    counts = {}
    for tokens in text_corpus:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

    pruned = []
    for tokens in text_corpus:
        pruned.append([token for token in tokens if counts[token] >= 2])
    return pruned

# Removing emojis
def remove_emojis(text_corpus):
    '''Removes emoticons and emoji characters from a tokenized corpus.

    A token that is exactly one of the known emoticons is dropped. Emoji
    characters are stripped out of every other token, and a token left empty
    by that stripping is dropped as well. Documents keep their order and may
    come back as empty lists.

    Parameters
    ----------
    text_corpus : iterable of list of str
        One token list per document.

    Returns
    -------
    list of list of str
        The corpus without emoticon tokens and emoji characters.
    '''
    # Happy emoticons
    emoticons_happy = {':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
                       ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
                       '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
                       'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
                       '<3'}

    # Sad emoticons
    emoticons_sad = {':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
                     ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
                     ':c', ':{', '>:\\', ';('}

    # Emoji patterns. A single range from U+24C2 to U+1F251 would also cover
    # CJK, Hangul and most other non-Latin text, so only the circled M and the
    # enclosed ideographic block that such a range is anchored on are listed.
    emoji_pattern = re.compile("["
             "\U0001F600-\U0001F64F"  # emoticons
             "\U0001F300-\U0001F5FF"  # symbols & pictographs
             "\U0001F680-\U0001F6FF"  # transport & map symbols
             "\U0001F1E0-\U0001F1FF"  # flags (iOS)
             "\U00002702-\U000027B0"  # dingbats
             "\U000024C2"             # circled M
             "\U0001F200-\U0001F251"  # enclosed ideographic supplement
             "]+", flags=re.UNICODE)

    # Combine
    emoticons = emoticons_happy | emoticons_sad

    cleaned_corpus = []
    for text in text_corpus:
        cleaned = []
        for token in text:
            if token in emoticons:
                continue
            token = emoji_pattern.sub('', token)
            if token:  # a token made up only of emojis disappears entirely
                cleaned.append(token)
        cleaned_corpus.append(cleaned)
    return cleaned_corpus

# Removing empty tokens
def remove_empty_corpus_tokens(text_corpus):
    '''Drops every document that has fewer than two tokens.

    Empty documents and single-token documents are both removed; the
    remaining documents are returned unchanged and in their original order.
    '''
    kept = []
    for tokens in text_corpus:
        if len(tokens) >= 2:
            kept.append(tokens)
    return kept

# Formatting: run the cleaning steps in order, each one consuming the
# previous step's output
document = create_document_corpus(df, 'text')  # list of tweet texts
text_corpus = remove_common_words_and_tokenize(document)  # one token list per tweet
refined_text_corpus = remove_words_appearing_only_once(text_corpus)  # tokens seen only once removed
no_emojis_refined_text_corpus = remove_emojis(refined_text_corpus)
no_empty_tokens_refined_text_corpus = remove_empty_corpus_tokens(no_emojis_refined_text_corpus)

# Printing the first 10 documents of the final corpus
pprint(no_empty_tokens_refined_text_corpus[:10])