In [1]:
import pyspark, pickle
from pyspark import SparkContext
from pyspark.sql.functions import countDistinct
from pyspark.storagelevel import StorageLevel
import pandas as pd
import numpy as np
from pyspark.ml.feature import CountVectorizer, StringIndexer, StopWordsRemover, NGram, RegexTokenizer

from nltk.corpus import stopwords
import nltk, re

from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline

from pyspark.sql.types import StringType

from sklearn import svm, grid_search, datasets
from spark_sklearn import GridSearchCV

pd.options.display.max_colwidth = -1



In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext

## Read data and clean

In [7]:
tweet_data = pd.read_csv('labeled_tweets.csv')

In [8]:
# Clean tweet_data

tweet_data.columns = ['garbage', 'tweet', 'existence', 'existence_conf']
tweet_data.drop('garbage', axis=1, inplace=True)
tweet_data.existence.replace(['Yes', 'No', 'Y', 'N'], ['yes', 'no', 'yes', 'no'], inplace=True)
tweet_data.tweet = tweet_data.tweet.str.replace('https?://[^ ,]+', '[link]') #replace links with '[link]'
tweet_data.existence.fillna('neutral', inplace=True)
tweet_data = tweet_data.sort_values('existence_conf', ascending=False).drop_duplicates(subset=['tweet'])
tweet_data = tweet_data[tweet_data.tweet != '[link]']   # One tweet consists only of a link

In [9]:
# Remove 'neutral' tweets. Will build model using only 'accept' and 'deny' classes
tweet_data = tweet_data[(tweet_data.existence != 'neutral')]

In [10]:
print(tweet_data.shape)
tweet_data.head()

(3352, 3)


Unnamed: 0,tweet,existence,existence_conf
0,"Global warming report urges governments to act|BRUSSELS, Belgium (AP) - The world faces increased hunger and .. [link]",yes,1.0
3146,New on [link] -- Heavy Snow Events: Not a Contradiction to Global Warming Theory -- [link],yes,1.0
3109,Interesting... RT @Heritage The DC Blizzard: More proof of Global Warming! [link],yes,1.0
3111,@Adam4004 That's sort of beside the point. People are acting like a winter storm in the mid-Atlantic disproves global warming. It doesn't.,yes,1.0
3124,RT @time D.C. Snowstorm: How Global Warming Makes Blizzards Worse - TIME [link] #green #climate #smh,yes,1.0


In [11]:
# Check class balance
tweet_data.existence.value_counts(dropna=False, normalize=True)

yes    0.706146
no     0.293854
Name: existence, dtype: float64

## Convert to Spark dataframe and do train/test split

In [12]:
# Create spark dataframe of tweets called 'tweets spark'

tweet_sp = spark.createDataFrame(tweet_data)
tweet_sp.persist()
tweet_sp.show()

+--------------------+---------+--------------+
|               tweet|existence|existence_conf|
+--------------------+---------+--------------+
|Global warming re...|      yes|           1.0|
|New on [link] -- ...|      yes|           1.0|
|Interesting... RT...|      yes|           1.0|
|@Adam4004 That's ...|      yes|           1.0|
|RT @time D.C. Sno...|      yes|           1.0|
|Actress Q'orianka...|      yes|           1.0|
|@Mac80537 Climate...|      yes|           1.0|
|Man! I wish there...|       no|           1.0|
|RT @WWFUS Climate...|      yes|           1.0|
|Fox News has Al G...|      yes|           1.0|
|Climate Denial Cr...|      yes|           1.0|
|FOX  "You can't m...|       no|           1.0|
|It's official: Th...|      yes|           1.0|
|Damn global warmi...|      yes|           1.0|
|Snowstorm: E Coas...|      yes|           1.0|
|So much for globa...|       no|           1.0|
|Article: If #clim...|      yes|           1.0|
|THANKS! Its not g...|       no|        

In [13]:
(train, test) = tweet_sp.randomSplit([0.7, 0.3], seed = 100)

train.persist()
test.persist()

print(train.count())
print(test.count())

2327
1025


# Naive Bayes model
## Set parameters

In [15]:
# Create regex tokenizer that is useful for Twitter data (preserves emoticons, hashtags, etc.)
# I used code from here, with some modifications: https://github.com/adonoho/TweetTokenizers/blob/master/PottsTweetTokenizer.py

# I got rid of ellipsis matcher
# I modified words with apostrophes to not keep after the apostrophe
# I got rid of the 'everything else that isn't whitespace' matcher
# I added regex to capture words in quotes as separate items: r'(?:["\'][a-zA-Z0-9/-]+["\'])'
pattern = r"""(?:\[link\])|(?:(?:\+?[01][\-\s.]*)?(?:[\(]?\d{3}[\-\s.\)]*)?\d{3}[\-\s.]*\d{4})|(?:(?<= )[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)|(<[^>]+>)|(?:@[\w_]+)|(?:\#+[\w_]+[\w\'_\-]*[\w_]+)|(?:["\'][a-z0-9/-]+["\'])|(?:[a-z][a-z\-_]+[a-z])|(?:[+\-]?\d+[,/.:-]\d+[+\-]?)|(?:[\w_]+)"""

word_re = re.compile(pattern, re.VERBOSE | re.I | re.UNICODE)

In [16]:
# Test Regex
df = spark.createDataFrame(["RT @contessabrewer: @newsbusters That's not what I said. >:(  :D I said that snowstorms don't refute global warming. (RE: [link]",
                           "@OTOOLEFAN REAL science. Not Algore's climate change 'science' hacks with their man-made global warming hoax. Gore=Palin"],
                           StringType())
df.show()

+--------------------+
|               value|
+--------------------+
|RT @contessabrewe...|
|@OTOOLEFAN REAL s...|
+--------------------+



In [18]:
RegexTokenizer(inputCol="value", outputCol="tokens", gaps=False, pattern=word_re.pattern).transform(df).take(2)

[Row(value="RT @contessabrewer: @newsbusters That's not what I said. >:(  :D I said that snowstorms don't refute global warming. (RE: [link]", tokens=['rt', '@contessabrewer', '@newsbusters', 'that', 's', 'not', 'what', 'i', 'said', '>:(', ':d', 'i', 'said', 'that', 'snowstorms', 'don', 't', 'refute', 'global', 'warming', 're', '[link]']),
 Row(value="@OTOOLEFAN REAL science. Not Algore's climate change 'science' hacks with their man-made global warming hoax. Gore=Palin", tokens=['@otoolefan', 'real', 'science', 'not', 'algore', 's', 'climate', 'change', "'science'", 'hacks', 'with', 'their', 'man-made', 'global', 'warming', 'hoax', 'gore', 'palin'])]

In [44]:
# Use this to check how labels are indexed by StringIndexer

#StringIndexer(inputCol="existence", outputCol="label").fit(train).transform(train).show()

In [None]:
# Number my labels
# Label numbering goes in order of most frequent label, descending
label_indxr = StringIndexer(inputCol="existence", outputCol="label")

# Tokenize tweets
tokenizer = RegexTokenizer(inputCol="tweet", outputCol="tokens", gaps=False, pattern=word_re.pattern)

# Remove stopwords
stp_rmv = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='new_tokens',
                           stopWords=stopwords.words('english'))

# Ngrams
ngram = NGram(inputCol=stp_rmv.getOutputCol(), outputCol="ngrams")

# Count occurences of words
cnvk = CountVectorizer(inputCol=ngram.getOutputCol(), outputCol='counts')

# Train a NaiveBayes model
nb = NaiveBayes(featuresCol=cnvk.getOutputCol(), modelType="multinomial")

# Pipeline
pipeline = Pipeline(stages=[label_indxr, tokenizer, stp_rmv, ngram, cnvk, nb])

# Create evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                             metricName="accuracy")
# evaluator = BinaryClassificationEvaluator(labelCol='label',
#                                         rawPredictionCol='prediction',
#                                         metricName='areaUnderROC')

# Program search params
param_grid = (ParamGridBuilder() 
    .addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) 
    .addGrid(ngram.n, [1, 2, 3]) \
    .build())

# Put pipeline together with param search
cv_pipe = CrossValidator(estimator=pipeline, estimatorParamMaps=param_grid, evaluator=evaluator)

## Train model and evaluate

In [19]:
# Train model
nb_model = cv_pipe.fit(train)

In [20]:
# Average accuracy across grid search:
nb_model.avgMetrics

[0.7356521881130746,
 0.7560753512411365,
 0.7511342511667729,
 0.7800135610129255,
 0.765709673123103,
 0.7517374644013859,
 0.7882529290815801,
 0.7673312018901837,
 0.7525598860436528,
 0.7890630860255661,
 0.7715647808409496,
 0.7520081999961119,
 0.7907668230432867,
 0.7721222000017398,
 0.7503277774958442,
 0.7842581897828638,
 0.7686791467505643,
 0.7498791458224481]

In [32]:
# Show each model param alongside accuracy
list(zip(param_grid, nb_model2.avgMetrics))

[({Param(parent='NGram_45d1aef11812936053cc', name='n', doc='number of elements per n-gram (>=1)'): 1,
   Param(parent='NaiveBayes_4ecbbd74374dfe252c01', name='smoothing', doc='The smoothing parameter, should be >= 0, default is 1.0'): 0.0},
  0.7356521881130746),
 ({Param(parent='NGram_45d1aef11812936053cc', name='n', doc='number of elements per n-gram (>=1)'): 1,
   Param(parent='NaiveBayes_4ecbbd74374dfe252c01', name='smoothing', doc='The smoothing parameter, should be >= 0, default is 1.0'): 0.2},
  0.7800135610129255),
 ({Param(parent='NGram_45d1aef11812936053cc', name='n', doc='number of elements per n-gram (>=1)'): 1,
   Param(parent='NaiveBayes_4ecbbd74374dfe252c01', name='smoothing', doc='The smoothing parameter, should be >= 0, default is 1.0'): 0.4},
  0.7882529290815801),
 ({Param(parent='NGram_45d1aef11812936053cc', name='n', doc='number of elements per n-gram (>=1)'): 1,
   Param(parent='NaiveBayes_4ecbbd74374dfe252c01', name='smoothing', doc='The smoothing parameter, sho

In [21]:
# See accuracy on test set
test_prediction = nb_model.transform(test)

accuracy = evaluator.evaluate(test_prediction)
print("Model Accuracy: ", accuracy)

Model Accuracy:  0.8117073170731708


In [23]:
metrics = MulticlassMetrics(test_prediction.select("prediction", "label").rdd)

print('Yes recall:', metrics.recall(0))
print('No recall:', metrics.recall(1), '\n')

print('Yes precision:', metrics.precision(0))
print('No precision:', metrics.precision(1))

Yes recall: 0.8897849462365591
No recall: 0.604982206405694 

Yes precision: 0.8564036222509702
No precision: 0.6746031746031746


In [24]:
# Show normalized confusion matrix
conf = metrics.confusionMatrix().toArray()
conf_norm = conf/conf.sum(axis=1)[:, np.newaxis]
print(conf_norm)

[[ 0.88978495  0.11021505]
 [ 0.39501779  0.60498221]]


## Save naive bayes model pipeline

In [25]:
nb_model.bestModel.save('./nb_model_pipeline')

In [None]:
#model = PipelineModel.load('./nb_model_pipeline/')

# SVM Model
## Set parameters

In [27]:
# Number my labels
# Label numbering goes in order of most frequent label, descending
label_indxr_svm = StringIndexer(inputCol="existence", outputCol="label")

# Tokenize tweets
tokenizer_svm = RegexTokenizer(inputCol="tweet", outputCol="tokens", gaps=False, pattern=word_re.pattern)

# Remove stopwords
stp_rmv_svm = StopWordsRemover(inputCol=tokenizer_svm.getOutputCol(), outputCol='new_tokens',
                           stopWords=stopwords.words('english'))

# Count occurences of words
cnvk_svm = CountVectorizer(inputCol=stp_rmv_svm.getOutputCol(), outputCol='counts')

# Pipeline
pipeline_svm = Pipeline(stages=[label_indxr_svm, tokenizer_svm, stp_rmv_svm, cnvk_svm])

# Create evaluator
evaluator_svm = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                             metricName="f1")

## Train model  
## Pyspark SVM doesn't have rbf kernel, so I have to use spark_sklearn library

In [28]:
# Count Vectorize my data
countvec_data_svm = pipeline_svm.fit(train).transform(train)
countvec_data_svm.show()

+--------------------+---------+--------------+-----+--------------------+--------------------+--------------------+
|               tweet|existence|existence_conf|label|              tokens|          new_tokens|              counts|
+--------------------+---------+--------------+-----+--------------------+--------------------+--------------------+
|"Even McCain supp...|       no|           1.0|  1.0|[even, mccain, su...|[even, mccain, su...|(6439,[1,2,50,102...|
|"Forests are grow...|      yes|           1.0|  0.0|[forests, are, gr...|[forests, growing...|(6439,[0,3,4,233,...|
|"How to fabricate...|       no|           1.0|  1.0|[how, to, fabrica...|[fabricate, clima...|(6439,[0,3,4,6,16...|
|"Kerry Graham Lie...|      yes|           1.0|  0.0|[kerry, graham, l...|[kerry, graham, l...|(6439,[0,1,2,3,37...|
|"Proof Of Global ...|      yes|           1.0|  0.0|[proof, of, globa...|[proof, global, w...|(6439,[0,1,2,145]...|
|"SCAM, SCAM, SCAM...|       no|           1.0|  1.0|[scam, scam

In [29]:
# Split up the sparse matrix that Spark gives me and create a standard sklearn X matrix
countvec_data_svm_x = pd.DataFrame(countvec_data_svm.select('counts').rdd.map(
                        lambda x: x['counts'].toArray()).collect())

In [30]:
# Create standard sklearn Y vector
countvec_data_svm_y = countvec_data_svm.select('existence').toPandas().existence

In [31]:
# Train model
parameters = [{'kernel': ['rbf'], 'gamma': [1,1e-1,1e-2,1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

svr = svm.SVC()
clf = GridSearchCV(sc, svr, parameters)
clf.fit(countvec_data_svm_x, countvec_data_svm_y)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}, {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
       pre_dispatch='2*n_jobs', refit=True,
       sc=<pyspark.context.SparkContext object at 0x7fa8efe04eb8>,
       scoring=None, verbose=0)

## Evaluate model based on CV accuracy, and test accuracy

In [32]:
# CV accuracy:
clf.best_score_

0.7666523420713365

In [33]:
clf.best_params_

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

In [34]:
# Run test data through pipeline
test_svm = pipeline_svm.fit(train).transform(test)
test_svm_x = pd.DataFrame(test_svm.select('counts').rdd.map(
                        lambda x: x['counts'].toArray()).collect())
test_svm_y = test_svm.select('existence').toPandas().existence.to_frame()

In [35]:
# Make predictions
svm_predicts = clf.predict(test_svm_x)

In [36]:
# Compare predictions and ground truth
test_svm_y['predict'] = svm_predicts
test_svm_y.head()

Unnamed: 0,existence,predict
0,no,no
1,yes,yes
2,yes,no
3,yes,no
4,yes,yes


In [37]:
# Recall for each class
test_svm_y.groupby('existence').predict.value_counts(normalize=True)

existence  predict
no         yes        0.679715
           no         0.320285
yes        yes        0.862903
           no         0.137097
Name: predict, dtype: float64

In [38]:
# Precision for each class
test_svm_y.groupby('predict').existence.value_counts(normalize=True)

predict  existence
no       yes          0.531250
         no           0.468750
yes      yes          0.770708
         no           0.229292
Name: existence, dtype: float64

SVM performs worse than Naive Bayes here, so I won't keep it