Connect to Spark and read in the data from the csv file (using the Amazon Web Services S3 url)

In [1]:
import os
# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.2'
spark_version = 'spark-3.1.2' #<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com (91.189.88.142)] [1 InRelease 14.2 kB/88.70% [Connecting to archive.ubuntu.com (91.189.88.142)] [Connected to cloud.r-pro                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Waiting for headers] [Wa0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Waiting for headers] [Wait                                                                               Hit:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Waiting for headers] [Wait                                                                               Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/

In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Hashing").getOrCreate()

In [3]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [4]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="https://butlerunit22.s3.us-east-2.amazonaws.com/all_articles_final.csv"
spark.sparkContext.addFile(url)
news_import_df = spark.read.csv(SparkFiles.get("all_articles_final.csv"), sep=",", header=True)

# Show DataFrame
news_import_df.show()

+---+----+---------+--------------------+--------------------+-------+-----------------+
|_c0|type|news_type|               title|                text|subject|             date|
+---+----+---------+--------------------+--------------------+-------+-----------------+
|  0|   0|     fake| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|
|  1|   0|     fake| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|
|  2|   0|     fake| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|
|  3|   0|     fake| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|
|  4|   0|     fake| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|
|  5|   0|     fake| Racist Alabama C...|The number of cas...|   News|December 25, 2017|
|  6|   0|     fake| Fresh Off The Go...|Donald Trump spen...|   News|December 23, 2017|
|  7|   0|     fake| Trump Said Some ...|In the wake of ye...|   News|December 23, 2017|
|  8|   0|     fake| 

Identify the count, view end of the data, and dropna

In [5]:
news_import_df.count()

44238

In [6]:
news_import_df[news_import_df['_c0'] > 44225].show()

+-----+----+---------+--------------------+--------------------+---------+----------------+
|  _c0|type|news_type|               title|                text|  subject|            date|
+-----+----+---------+--------------------+--------------------+---------+----------------+
|44226|   1|     true|LexisNexis withdr...|LexisNexis, a pro...|worldnews|August 22, 2017 |
|44227|   1|     true|Minsk cultural hu...|In the shadow of ...|worldnews|August 22, 2017 |
|44228|   1|     true|Vatican upbeat on...|Vatican Secretary...|worldnews|August 22, 2017 |
|44229|   1|     true|Indonesia to buy ...|Indonesia will bu...|worldnews|August 22, 2017 |
+-----+----+---------+--------------------+--------------------+---------+----------------+



In [17]:
news_import_df = news_import_df.dropna()
news_import_df.show()

+----+----+---------+--------------------+--------------------+--------------------+--------------------+
| _c0|type|news_type|               title|                text|             subject|                date|
+----+----+---------+--------------------+--------------------+--------------------+--------------------+
| 203|   0|     fake| Trump Gets His A...|Donald Trump stuc...|                News|    October 20, 2017|
| 226|   0|     fake| White Pro-Trump ...|Hahahahahahahaha....|                News|    October 12, 2017|
| 461|   0|     fake| Kellyanne Conway...|White House advis...|                News|     August 31, 2017|
| 617|   0|     fake| Sean Hannity Swe...|For some unknown ...|                News|      August 9, 2017|
| 683|   0|     fake| McCain Just BRUT...|Donald Trump fina...|                News|      August 2, 2017|
| 715|   0|     fake| Federal Court Ru...|One of the things...|                News|       July 28, 2017|
| 943|   0|     fake| Chris Christie H...|Once

In [18]:
news_import_df.count()

44230

In [9]:
news_import_df[news_import_df['_c0'] > 9725].show(10)

+----+----+---------+--------------------+--------------------+----------------+-----------+
| _c0|type|news_type|               title|                text|         subject|       date|
+----+----+---------+--------------------+--------------------+----------------+-----------+
|9726|   0|     fake|BREAKING NEWS: NF...|NFL reporter Jaso...|        politics|Oct 8, 2017|
|9727|   0|     fake|TRUMP BLASTS SENA...|President Trump j...|        politics|Oct 8, 2017|
|9728|   0|     fake|BREAKING…VP PENCE...|Mike Pence tweete...|        politics|Oct 8, 2017|
|9729|   0|     fake|MAYORAL CANDIDATE...|If John Persinger...|        politics|Oct 8, 2017|
|9730|   0|     fake|ANTI-AMERICAN GEO...|We just discovere...|        politics|Oct 7, 2017|
|9731|   0|     fake|#WINNING UPDATE: ...|Watch President T...|        politics|Oct 7, 2017|
|9732|   0|     fake|DRAMA QUEEN CHRIS...|MSNBC host Chris ...|        politics|Oct 7, 2017|
|9733|   0|     fake|WATCH STEVE SCALI...|House Majority Wh...|       

In [10]:
# All text is still included for one thought to be at issue in the CSV
news_import_df[news_import_df['_c0'] == 9730].show(truncate=False)

+----+----+---------+------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [11]:
news_import_df.dtypes

[('_c0', 'string'),
 ('type', 'string'),
 ('news_type', 'string'),
 ('title', 'string'),
 ('text', 'string'),
 ('subject', 'string'),
 ('date', 'string')]

In [23]:
cleaned_news = news_import_df.filter((news_import_df['news_type'] == 'fake') | (news_import_df['news_type'] == 'true') )
cleaned_news.count()


44226

In [26]:
news = cleaned_news.groupby('news_type')
news.count().show()

+---------+-----+
|news_type|count|
+---------+-----+
|     fake|22838|
|     true|21388|
+---------+-----+



Tokenize, find length of the tokens, remove words in stop_list and hashing 

In [29]:
# Tokenize DataFrame
title_tokened = Tokenizer(inputCol="title", outputCol="title_words")
title_tokened_transformed = title_tokened.transform(cleaned_news)
title_tokened_transformed.show()

+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+
| _c0|type|news_type|               title|                text|             subject|                date|         title_words|
+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+
| 203|   0|     fake| Trump Gets His A...|Donald Trump stuc...|                News|    October 20, 2017|[, trump, gets, h...|
| 226|   0|     fake| White Pro-Trump ...|Hahahahahahahaha....|                News|    October 12, 2017|[, white, pro-tru...|
| 461|   0|     fake| Kellyanne Conway...|White House advis...|                News|     August 31, 2017|[, kellyanne, con...|
| 617|   0|     fake| Sean Hannity Swe...|For some unknown ...|                News|      August 9, 2017|[, sean, hannity,...|
| 683|   0|     fake| McCain Just BRUT...|Donald Trump fina...|                News|      August 2, 2017|[, mcc

In [30]:
title_tokened_transformed.count()

44226

In [31]:
text_tokened = Tokenizer(inputCol="text", outputCol="text_words")
text_tokened_transformed = text_tokened.transform(cleaned_news)
text_tokened_transformed.show()

+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+
| _c0|type|news_type|               title|                text|             subject|                date|          text_words|
+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+
| 203|   0|     fake| Trump Gets His A...|Donald Trump stuc...|                News|    October 20, 2017|[donald, trump, s...|
| 226|   0|     fake| White Pro-Trump ...|Hahahahahahahaha....|                News|    October 12, 2017|[hahahahahahahaha...|
| 461|   0|     fake| Kellyanne Conway...|White House advis...|                News|     August 31, 2017|[white, house, ad...|
| 617|   0|     fake| Sean Hannity Swe...|For some unknown ...|                News|      August 9, 2017|[for, some, unkno...|
| 683|   0|     fake| McCain Just BRUT...|Donald Trump fina...|                News|      August 2, 2017|[donal

In [32]:
# Counting the tokens for both the title_words and the text_words
def word_list_length(word_list):
  if word_list:
    return len(word_list)
  return 0

In [33]:
count_tokens = udf(word_list_length, IntegerType())
count_tokens

<function __main__.word_list_length>

In [34]:
title_tokenized = title_tokened.transform(cleaned_news)

title_tokened_transformed = title_tokenized\
  .withColumn('title_tokens', count_tokens(col('title_words')))
  #.show(truncate=True)

In [35]:
title_tokened_transformed.show()

+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+
| _c0|type|news_type|               title|                text|             subject|                date|         title_words|title_tokens|
+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+
| 203|   0|     fake| Trump Gets His A...|Donald Trump stuc...|                News|    October 20, 2017|[, trump, gets, h...|          19|
| 226|   0|     fake| White Pro-Trump ...|Hahahahahahahaha....|                News|    October 12, 2017|[, white, pro-tru...|          15|
| 461|   0|     fake| Kellyanne Conway...|White House advis...|                News|     August 31, 2017|[, kellyanne, con...|          13|
| 617|   0|     fake| Sean Hannity Swe...|For some unknown ...|                News|      August 9, 2017|[, sean, hannity,...|          15|
| 683|   0|     fake

In [36]:
text_tokenized = text_tokened.transform(cleaned_news)

text_tokened_transformed = text_tokenized\
  .withColumn('text_tokens', count_tokens(col('text_words')))
  #.show(truncate=True)

In [37]:
text_tokened_transformed.show()

+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
| _c0|type|news_type|               title|                text|             subject|                date|          text_words|text_tokens|
+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
| 203|   0|     fake| Trump Gets His A...|Donald Trump stuc...|                News|    October 20, 2017|[donald, trump, s...|        371|
| 226|   0|     fake| White Pro-Trump ...|Hahahahahahahaha....|                News|    October 12, 2017|[hahahahahahahaha...|        225|
| 461|   0|     fake| Kellyanne Conway...|White House advis...|                News|     August 31, 2017|[white, house, ad...|        318|
| 617|   0|     fake| Sean Hannity Swe...|For some unknown ...|                News|      August 9, 2017|[for, some, unkno...|        409|
| 683|   0|     fake| McCai

In [38]:
# Use same stop_list on the text_words
stop_list = ["the", "a", "an", "to", "there", "this", "of", "is", "on", "he", "she", "in", "as", "for", "are", "were", "has", "will", "and", "at"]
remover = StopWordsRemover(inputCol="text_words", outputCol="text_filtered", stopWords=stop_list)
removed_text_frame = remover.transform(text_tokened_transformed)
removed_text_frame.show(truncate=True)

+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+
| _c0|type|news_type|               title|                text|             subject|                date|          text_words|text_tokens|       text_filtered|
+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+
| 203|   0|     fake| Trump Gets His A...|Donald Trump stuc...|                News|    October 20, 2017|[donald, trump, s...|        371|[donald, trump, s...|
| 226|   0|     fake| White Pro-Trump ...|Hahahahahahahaha....|                News|    October 12, 2017|[hahahahahahahaha...|        225|[hahahahahahahaha...|
| 461|   0|     fake| Kellyanne Conway...|White House advis...|                News|     August 31, 2017|[white, house, ad...|        318|[white, house, ad...|
| 617|   0|     fake| Sean Hannity Swe..

In [39]:
# Use same stop_list on the title_words
remover_title = StopWordsRemover(inputCol="title_words", outputCol="title_filtered", stopWords=stop_list)
removed_title_frame = remover_title.transform(title_tokened_transformed)
removed_title_frame.show(truncate=True)

+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+
| _c0|type|news_type|               title|                text|             subject|                date|         title_words|title_tokens|      title_filtered|
+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+
| 203|   0|     fake| Trump Gets His A...|Donald Trump stuc...|                News|    October 20, 2017|[, trump, gets, h...|          19|[, trump, gets, h...|
| 226|   0|     fake| White Pro-Trump ...|Hahahahahahahaha....|                News|    October 12, 2017|[, white, pro-tru...|          15|[, white, pro-tru...|
| 461|   0|     fake| Kellyanne Conway...|White House advis...|                News|     August 31, 2017|[, kellyanne, con...|          13|[, kellyanne, con...|
| 617|   0|     fake| Sean Hannity

In [41]:
# Hashing with removed_text_frame and removed_title_frame
text_hashing = HashingTF(inputCol="text_filtered", outputCol="text_hashedValues", numFeatures=pow(2,16)) 

text_hashed_df = text_hashing.transform(removed_text_frame)
text_hashed_df.select("text_hashedValues").show(truncate=True)

+--------------------+
|   text_hashedValues|
+--------------------+
|(65536,[231,778,8...|
|(65536,[178,568,9...|
|(65536,[282,521,2...|
|(65536,[471,740,8...|
|(65536,[1097,1524...|
|(65536,[90,460,97...|
|(65536,[835,1193,...|
|(65536,[9,308,368...|
|(65536,[835,1837,...|
|(65536,[231,1077,...|
|(65536,[1103,1451...|
|(65536,[81,364,56...|
|(65536,[329,377,5...|
|(65536,[749,835,1...|
|(65536,[308,401,6...|
|(65536,[454,972,1...|
|(65536,[600,619,8...|
|(65536,[178,764,8...|
|(65536,[1903,2257...|
|(65536,[12,1004,1...|
+--------------------+
only showing top 20 rows



In [42]:
# Fit the IDF on the data set 
text_idf = IDF(inputCol="text_hashedValues", outputCol="text_features")
text_idfModel = text_idf.fit(text_hashed_df)
text_rescaledData = text_idfModel.transform(text_hashed_df)

In [43]:
# Hashing with removed_text_frame and removed_title_frame
title_hashing = HashingTF(inputCol="title_filtered", outputCol="title_hashedValues", numFeatures=pow(2,16)) 

title_hashed_df = title_hashing.transform(removed_title_frame)
title_hashed_df.show()

+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+
| _c0|type|news_type|               title|                text|             subject|                date|         title_words|title_tokens|      title_filtered|  title_hashedValues|
+----+----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+
| 203|   0|     fake| Trump Gets His A...|Donald Trump stuc...|                News|    October 20, 2017|[, trump, gets, h...|          19|[, trump, gets, h...|(65536,[3085,3149...|
| 226|   0|     fake| White Pro-Trump ...|Hahahahahahahaha....|                News|    October 12, 2017|[, white, pro-tru...|          15|[, white, pro-tru...|(65536,[8348,9092...|
| 461|   0|     fake| Kellyanne Conway...|White House advis...|                News|     A

In [44]:
# Fit the IDF on the data set 
title_idf = IDF(inputCol="title_hashedValues", outputCol="title_features")
title_idfModel = title_idf.fit(title_hashed_df)
title_rescaledData = title_idfModel.transform(title_hashed_df)