In [1]:
import pyspark, pickle
from pyspark import SparkContext
from pyspark.sql.functions import countDistinct, regexp_replace, monotonically_increasing_id
from pyspark.storagelevel import StorageLevel
import pandas as pd
import numpy as np
from pyspark.ml.feature import CountVectorizer, StringIndexer, StopWordsRemover, NGram, RegexTokenizer

from nltk.corpus import stopwords
import nltk, re

from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline

from pyspark.sql.types import StringType

# Show full column contents (e.g. whole tweets) when displaying pandas frames.
# None means "no limit"; the old -1 sentinel is deprecated and rejected by current pandas.
pd.options.display.max_colwidth = None



In [2]:
# Obtain a Spark session through the builder's getOrCreate(), then keep a
# handle on its SparkContext.
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.getOrCreate()
sc = spark.sparkContext

# Load tweets and persist

In [3]:
# Read every tweet, sort by tweet_id, and tag each row with a generated 'row' id
# that later cells use as a join key.
raw_tweets = spark.read.parquet('tweets_all.parquet')
row_id = monotonically_increasing_id().alias('row')
tweets = raw_tweets.orderBy('tweet_id').select('*', row_id)
# Keep the frame cached (spilling to disk if needed); trailing ';' hides the repr.
tweets.persist(StorageLevel.MEMORY_AND_DISK);

# Clean tweets for model pipeline, and run through pipeline

In [4]:
# Text needs to be renamed to 'tweet' for my model
# Links need to be replace with '[link]'
tweet_pipeline = tweets.select(regexp_replace('text', 'https?://[^ ,]+', '[link]').alias('tweet'))

In [5]:
nb_model = PipelineModel.load('./nb_model_pipeline/')

In [6]:
cc_accept_predictions = nb_model.transform(tweet_pipeline)
cc_accept_predictions.persist(StorageLevel.MEMORY_AND_DISK)
cc_accept_predictions = cc_accept_predictions. \
    select('probability', 'prediction', monotonically_increasing_id().alias('row'))

# Join predictions to tweets, and check results for known users

In [7]:
# Attach each tweet's class probabilities and predicted label via the 'row' key.
tweets = tweets.join(cc_accept_predictions, how='left',  on='row')
# Expose the joined frame to spark.sql under the name 'tweets'.
# createOrReplaceTempView supersedes the deprecated registerTempTable.
tweets.createOrReplaceTempView('tweets')

### 'Accept' / 'deny' classification ratio for known climate change deniers:

In [19]:
# 0 is 'accept,' 1 is 'deny'

spark.sql("""
    select screen_name, prediction, count(prediction)
    from tweets
    where screen_name = 'ClimateRealists'
    group by screen_name, prediction
    order by prediction
""").show(truncate=False)

+---------------+----------+-----------------+
|screen_name    |prediction|count(prediction)|
+---------------+----------+-----------------+
|ClimateRealists|0.0       |413              |
|ClimateRealists|1.0       |89               |
+---------------+----------+-----------------+



In [20]:
# Count this account's tweets per predicted class.
steve_goddard_counts = spark.sql("""
    select screen_name, prediction, count(prediction)
    from tweets
    where screen_name = 'SteveSGoddard'
    group by screen_name, prediction
    order by prediction
""")
steve_goddard_counts.show(truncate=False)

+-------------+----------+-----------------+
|screen_name  |prediction|count(prediction)|
+-------------+----------+-----------------+
|SteveSGoddard|0.0       |139              |
|SteveSGoddard|1.0       |62               |
+-------------+----------+-----------------+



In [21]:
# Count this account's tweets per predicted class.
scott_adams_counts = spark.sql("""
    select screen_name, prediction, count(prediction)
    from tweets
    where screen_name = 'ScottAdamsSays'
    group by screen_name, prediction
    order by prediction
""")
scott_adams_counts.show(truncate=False)

+--------------+----------+-----------------+
|screen_name   |prediction|count(prediction)|
+--------------+----------+-----------------+
|ScottAdamsSays|0.0       |22               |
|ScottAdamsSays|1.0       |10               |
+--------------+----------+-----------------+



In [27]:
# Count this account's tweets per predicted class.
junk_science_counts = spark.sql("""
    select screen_name, prediction, count(prediction)
    from tweets
    where screen_name = 'JunkScience'
    group by screen_name, prediction
    order by prediction
""")
junk_science_counts.show(truncate=False)

+-----------+----------+-----------------+
|screen_name|prediction|count(prediction)|
+-----------+----------+-----------------+
|JunkScience|0.0       |89               |
|JunkScience|1.0       |24               |
+-----------+----------+-----------------+



### Classification ratios for known climate change acceptors:

In [22]:
# 0 is 'accept,' 1 is 'deny'

spark.sql("""
    select screen_name, prediction, count(prediction)
    from tweets
    where screen_name = 'CoralMDavenport'
    group by screen_name, prediction
    order by prediction
""").show(truncate=False)

+---------------+----------+-----------------+
|screen_name    |prediction|count(prediction)|
+---------------+----------+-----------------+
|CoralMDavenport|0.0       |9                |
|CoralMDavenport|1.0       |2                |
+---------------+----------+-----------------+



In [24]:
# Count this account's tweets per predicted class.
noaa_counts = spark.sql("""
    select screen_name, prediction, count(prediction)
    from tweets
    where screen_name = 'NOAA'
    group by screen_name, prediction
    order by prediction
""")
noaa_counts.show(truncate=False)

+-----------+----------+-----------------+
|screen_name|prediction|count(prediction)|
+-----------+----------+-----------------+
|NOAA       |0.0       |11               |
+-----------+----------+-----------------+



In [26]:
# Count this account's tweets per predicted class.
bill_nye_counts = spark.sql("""
    select screen_name, prediction, count(prediction)
    from tweets
    where screen_name = 'BillNye'
    group by screen_name, prediction
    order by prediction
""")
bill_nye_counts.show(truncate=False)

+-----------+----------+-----------------+
|screen_name|prediction|count(prediction)|
+-----------+----------+-----------------+
|BillNye    |0.0       |12               |
|BillNye    |1.0       |1                |
+-----------+----------+-----------------+



In [27]:
# Count this account's tweets per predicted class.
eric_holthaus_counts = spark.sql("""
    select screen_name, prediction, count(prediction)
    from tweets
    where screen_name = 'EricHolthaus'
    group by screen_name, prediction
    order by prediction
""")
eric_holthaus_counts.show(truncate=False)

+------------+----------+-----------------+
|screen_name |prediction|count(prediction)|
+------------+----------+-----------------+
|EricHolthaus|0.0       |72               |
|EricHolthaus|1.0       |11               |
+------------+----------+-----------------+



# Explore the model's behavior further

In [34]:
# Build a table with one row per vocabulary word and one weight column per class.
# `vocabulary` is read from pipeline stage 4 and `theta` from stage 5
# (presumably the CountVectorizer and NaiveBayes stages — confirm against the saved pipeline).
vectorizer_stage, classifier_stage = nb_model.stages[4], nb_model.stages[5]
theta = classifier_stage.theta.toArray().T  # transposed so each row is one word
model_weights = pd.DataFrame(
    theta,
    index=vectorizer_stage.vocabulary,
    columns=['accept', 'deny'],
)
model_weights.head()

Unnamed: 0,accept,deny
[link],-2.885952,-3.504653
global,-3.396628,-3.194367
warming,-3.425696,-3.194367
climate,-3.194328,-4.200755
change,-3.208675,-4.225719


In [37]:
# For every word, measure how far its accept/deny weight ratio is from 1,
# then list the words from largest to smallest distance.
weight_ratio = model_weights.accept / model_weights.deny
abs(1 - weight_ratio).sort_values(ascending=False)

# Results look pretty good. The most predictive words look to be those used by climate change deniers.
# The least predictive words could plausibly be used by either side.

scam              0.625005
utah              0.496431
scandal           0.445010
conspiracy        0.425592
al                0.411500
lie               0.404683
gore              0.396014
contradict        0.357004
fanatics          0.357004
#sgp              0.357004
#teaparty         0.340909
lol               0.331330
fading            0.329171
blankenship       0.329171
anyway            0.329171
bogus             0.329171
fraud             0.314563
hysteria          0.313440
#gop              0.300363
apparently        0.297509
@algore           0.297509
discredits        0.297509
@mattyglesias     0.297509
builds            0.297509
irony             0.297509
oops              0.297509
representative    0.297509
ass               0.297509
#ipcc             0.297509
yeah              0.297509
                    ...   
thank             0.003862
analysis          0.003862
shut              0.003862
effect            0.003862
questions         0.003862
current           0.003862
t

In [43]:
# One weakness of the model: it can be very confident and still wrong.
# EricHolthaus accepts climate change, yet some of his tweets are predicted
# as 1 ('deny') with high probability.
holthaus_deny_predictions = spark.sql("""
    select screen_name, probability, prediction
    from tweets
    where screen_name = 'EricHolthaus' and prediction = 1
""")
holthaus_deny_predictions.show(truncate=False)

+------------+-----------------------------------------+----------+
|screen_name |probability                              |prediction|
+------------+-----------------------------------------+----------+
|EricHolthaus|[0.23493622840396688,0.7650637715960331] |1.0       |
|EricHolthaus|[0.09120387545123637,0.9087961245487637] |1.0       |
|EricHolthaus|[0.3490159684331323,0.6509840315668677]  |1.0       |
|EricHolthaus|[0.01161253247199993,0.9883874675280001] |1.0       |
|EricHolthaus|[0.019730474608470723,0.9802695253915292]|1.0       |
|EricHolthaus|[0.3118981173583994,0.6881018826416007]  |1.0       |
|EricHolthaus|[0.200202779648672,0.799797220351328]    |1.0       |
|EricHolthaus|[0.39095000413382786,0.6090499958661721] |1.0       |
|EricHolthaus|[0.07610776429816785,0.9238922357018321] |1.0       |
|EricHolthaus|[0.25781973895784455,0.7421802610421555] |1.0       |
|EricHolthaus|[0.3416685841640517,0.6583314158359482]  |1.0       |
+------------+----------------------------------

In [44]:
# Another problem: many of the words in my tweets are not in my model vocabulary.
# Take this tweet as an example, and output the words that are in the vocab.
# NOTE(review): the tweet is split on whitespace only, so capitalised or punctuated
# tokens ('In', 'right.') are compared as-is — confirm this matches how the pipeline tokenizes.

# Read the vocabulary once (instead of once per word) and use a set so each
# membership test is a hash lookup rather than a scan of the whole list.
vocabulary = set(nb_model.stages[4].vocabulary)
sample_tweet = 'In which case most people are right. Not even the IPCC says climate change is "entirely" human-made'
t = [word for word in sample_tweet.split() if word in vocabulary]
print(t)

['case', 'people', 'even', 'says', 'climate', 'change']


## Aggregate predictions over tweets for each user. I have a few options:
#### 1. Use the modal prediction per user
#### 2. Use some ratio of accept climate change/deny climate change tweets

In [None]:
# Modal prediction. Not very good.

# predictions_per_user = spark.sql("""
#     select screen_name, min(prediction) as prediction
#     from
#         (select screen_name, prediction, count, 
#             rank() over (partition by screen_name order by count desc) as rank
#         from 
#             (select screen_name, prediction, count(*) as count
#             from tweets
#             group by screen_name, prediction) sub
#         ) sub2
#     where rank = 1
#     group by screen_name
# """)

In [8]:
# Ratio-based prediction per user. A user is labelled 'accept' when they have no
# 'deny' tweets, or when n_accepts / n_denies is at least 4; everyone else is 'deny'.
user_label_query = """
    select screen_name, case
                when n_denies = 0 then 'accept'
                when n_accepts/n_denies >= 4 then 'accept'
                else 'deny' end as prediction
    from
        (select screen_name,
            sum(case when prediction = 0 then 1 else 0 end) as n_accepts,
            sum(case when prediction = 1 then 1 else 0 end) as n_denies
        from tweets
        group by screen_name
        order by screen_name) sub
"""
predictions_per_user = spark.sql(user_label_query)

# Note: these predictions didn't end up being good enough to be useful. I don't use them in my other final notebooks.

# Save results for later use

In [48]:
# Save the per-user labels as parquet for later notebooks.
# NOTE(review): with the writer's default save mode this presumably fails if the
# path already exists — confirm before re-running.
user_pred_writer = predictions_per_user.write
user_pred_writer.parquet('user_pred.parquet')

In [51]:
# Save the tweets together with their joined prediction columns as parquet.
# NOTE(review): with the writer's default save mode this presumably fails if the
# path already exists — confirm before re-running.
tweets_pred_writer = tweets.write
tweets_pred_writer.parquet('tweets_all_pred.parquet')