In [1]:
import pyspark
import pandas as pd
from pyspark.sql import SQLContext, SparkSession
from pyspark import SparkContext,SparkConf
from pyspark.sql.types import StructType,StructField,StringType
from pyspark.sql.functions import col


directory="./jars/*"

spark = SparkSession \
    .builder \
    .appName("NewsClassifier_datafeed") \
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/news_database.news_feed") \
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017/news_database.news_feed") \
    .config('spark.driver.extraClassPath', directory) \
    .getOrCreate()


print(spark)

df = spark.read.format("mongo").load()

print("Schema:")
df.printSchema()

print("show top 5 records: ")
df.show(5)


<pyspark.sql.session.SparkSession object at 0x000002AD087DF4C0>
Schema:
root
 |-- _id: string (nullable = true)
 |-- _score: string (nullable = true)
 |-- author: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- clean_url: string (nullable = true)
 |-- country: string (nullable = true)
 |-- is_opinion: string (nullable = true)
 |-- language: string (nullable = true)
 |-- link: string (nullable = true)
 |-- media: string (nullable = true)
 |-- published_date: string (nullable = true)
 |-- published_date_precision: string (nullable = true)
 |-- rank: string (nullable = true)
 |-- rights: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- title: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- twitter_account: string (nullable = true)

show top 5 records: 
+--------------------+---------+--------------------+--------------------+---------------+-------+----------+--------+--------------------+--------------------+-------------------+

In [2]:
type(df)

pyspark.sql.dataframe.DataFrame

In [3]:
columns_to_drop = ['_id','_score','clean_url','country','language','link','published_date','published_date_precision','rank','rights','twitter_account']
df = df.drop(*columns_to_drop)
df.show(5)
df.columns

+--------------------+--------------------+----------+--------------------+--------------------+--------------------+--------+
|              author|             authors|is_opinion|               media|             summary|               title|   topic|
+--------------------+--------------------+----------+--------------------+--------------------+--------------------+--------+
|University Of Str...|["University Of S...|     false|https://scx2.b-cd...|Credit: Pixabay/C...|Riding the waves ...|    news|
|                null|                  []|     false|https://s.yimg.co...|Over 30,000 Hocke...|FLOSPORTS BOLSTER...|business|
|                null|                  []|     false|https://c.ndtvimg...|Chennai Super Kin...|CSK vs PBKS, Chen...|    news|
|                null|                  []|     false|https://c.ndtvimg...|The Indian Premie...|DC vs CSK, When A...|    news|
|Belinda Cleary Fo...|["Belinda Cleary"...|     false|https://i.dailyma...|Published: 22:05 ...|Australia has o

['author', 'authors', 'is_opinion', 'media', 'summary', 'title', 'topic']

In [4]:
df.printSchema()

root
 |-- author: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- is_opinion: string (nullable = true)
 |-- media: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- title: string (nullable = true)
 |-- topic: string (nullable = true)



In [5]:
df.filter(df.topic.isNull()).show()
#(df.filter(col("author").isNull())

+------+-------+----------+-----+-------+-----+-----+
|author|authors|is_opinion|media|summary|title|topic|
+------+-------+----------+-----+-------+-----+-----+
+------+-------+----------+-----+-------+-----+-----+



In [6]:
df.show()

+--------------------+--------------------+----------+--------------------+--------------------+--------------------+---------+
|              author|             authors|is_opinion|               media|             summary|               title|    topic|
+--------------------+--------------------+----------+--------------------+--------------------+--------------------+---------+
|University Of Str...|["University Of S...|     false|https://scx2.b-cd...|Credit: Pixabay/C...|Riding the waves ...|     news|
|                null|                  []|     false|https://s.yimg.co...|Over 30,000 Hocke...|FLOSPORTS BOLSTER...| business|
|                null|                  []|     false|https://c.ndtvimg...|Chennai Super Kin...|CSK vs PBKS, Chen...|     news|
|                null|                  []|     false|https://c.ndtvimg...|The Indian Premie...|DC vs CSK, When A...|     news|
|Belinda Cleary Fo...|["Belinda Cleary"...|     false|https://i.dailyma...|Published: 22:05 ...|Australi

In [7]:
columns_to_drop = ['author', 'authors', 'is_opinion', 'media']
df = df.drop(*columns_to_drop)
df.show(5)

+--------------------+--------------------+--------+
|             summary|               title|   topic|
+--------------------+--------------------+--------+
|Credit: Pixabay/C...|Riding the waves ...|    news|
|Over 30,000 Hocke...|FLOSPORTS BOLSTER...|business|
|Chennai Super Kin...|CSK vs PBKS, Chen...|    news|
|The Indian Premie...|DC vs CSK, When A...|    news|
|Published: 22:05 ...|Australia has one...|    news|
+--------------------+--------------------+--------+
only showing top 5 rows



In [8]:
df.printSchema()

root
 |-- summary: string (nullable = true)
 |-- title: string (nullable = true)
 |-- topic: string (nullable = true)



In [9]:
type(df)

pyspark.sql.dataframe.DataFrame

In [10]:
# removing the values which has both null in title and summary
df=df.filter(df.title.isNotNull() & col("summary").isNotNull())


In [11]:
df.show()

+--------------------+--------------------+---------+
|             summary|               title|    topic|
+--------------------+--------------------+---------+
|Credit: Pixabay/C...|Riding the waves ...|     news|
|Over 30,000 Hocke...|FLOSPORTS BOLSTER...| business|
|Chennai Super Kin...|CSK vs PBKS, Chen...|     news|
|The Indian Premie...|DC vs CSK, When A...|     news|
|Published: 22:05 ...|Australia has one...|     news|
|England Hockey ha...|England pulls out...|    sport|
|Topic | Australia...|Topic  Australian...|     news|
|PTIOdisha CM Patn...|Hockey: India swe...|economics|
|In 2014 I broke m...|Like Peloton, but...|     news|
|Tokyo 2020 Olympi...|India withdraw ho...|     news|
|AUSTIN, Texas, Oc...|FLOSPORTS BOLSTER...| business|
|Althea Mercer Man...|On her way to Blo...|     news|
|(CNN)Ukrainian ho...|Ukrainian Hockey ...|    sport|
|Paralympic champi...|Paralympic champi...|     news|
|Virat Kohli-led R...|RCB vs SRH, Royal...|     news|
|1of12Pittsburgh P...|Jarry,

In [12]:
#ensuring no null values
df.filter(df.title.isNull() & col("summary").isNull()).show()

+-------+-----+-----+
|summary|title|topic|
+-------+-----+-----+
+-------+-----+-----+



In [13]:
df.filter(col("summary").isNull()).show()

+-------+-----+-----+
|summary|title|topic|
+-------+-----+-----+
+-------+-----+-----+



In [14]:
df.filter(df.topic.isNull()).show()

+-------+-----+-----+
|summary|title|topic|
+-------+-----+-----+
+-------+-----+-----+



In [15]:
df.show()

+--------------------+--------------------+---------+
|             summary|               title|    topic|
+--------------------+--------------------+---------+
|Credit: Pixabay/C...|Riding the waves ...|     news|
|Over 30,000 Hocke...|FLOSPORTS BOLSTER...| business|
|Chennai Super Kin...|CSK vs PBKS, Chen...|     news|
|The Indian Premie...|DC vs CSK, When A...|     news|
|Published: 22:05 ...|Australia has one...|     news|
|England Hockey ha...|England pulls out...|    sport|
|Topic | Australia...|Topic  Australian...|     news|
|PTIOdisha CM Patn...|Hockey: India swe...|economics|
|In 2014 I broke m...|Like Peloton, but...|     news|
|Tokyo 2020 Olympi...|India withdraw ho...|     news|
|AUSTIN, Texas, Oc...|FLOSPORTS BOLSTER...| business|
|Althea Mercer Man...|On her way to Blo...|     news|
|(CNN)Ukrainian ho...|Ukrainian Hockey ...|    sport|
|Paralympic champi...|Paralympic champi...|     news|
|Virat Kohli-led R...|RCB vs SRH, Royal...|     news|
|1of12Pittsburgh P...|Jarry,

In [16]:
df.filter(df.topic.isNotNull()).show()

+--------------------+--------------------+---------+
|             summary|               title|    topic|
+--------------------+--------------------+---------+
|Credit: Pixabay/C...|Riding the waves ...|     news|
|Over 30,000 Hocke...|FLOSPORTS BOLSTER...| business|
|Chennai Super Kin...|CSK vs PBKS, Chen...|     news|
|The Indian Premie...|DC vs CSK, When A...|     news|
|Published: 22:05 ...|Australia has one...|     news|
|England Hockey ha...|England pulls out...|    sport|
|Topic | Australia...|Topic  Australian...|     news|
|PTIOdisha CM Patn...|Hockey: India swe...|economics|
|In 2014 I broke m...|Like Peloton, but...|     news|
|Tokyo 2020 Olympi...|India withdraw ho...|     news|
|AUSTIN, Texas, Oc...|FLOSPORTS BOLSTER...| business|
|Althea Mercer Man...|On her way to Blo...|     news|
|(CNN)Ukrainian ho...|Ukrainian Hockey ...|    sport|
|Paralympic champi...|Paralympic champi...|     news|
|Virat Kohli-led R...|RCB vs SRH, Royal...|     news|
|1of12Pittsburgh P...|Jarry,

In [17]:
df.show()

+--------------------+--------------------+---------+
|             summary|               title|    topic|
+--------------------+--------------------+---------+
|Credit: Pixabay/C...|Riding the waves ...|     news|
|Over 30,000 Hocke...|FLOSPORTS BOLSTER...| business|
|Chennai Super Kin...|CSK vs PBKS, Chen...|     news|
|The Indian Premie...|DC vs CSK, When A...|     news|
|Published: 22:05 ...|Australia has one...|     news|
|England Hockey ha...|England pulls out...|    sport|
|Topic | Australia...|Topic  Australian...|     news|
|PTIOdisha CM Patn...|Hockey: India swe...|economics|
|In 2014 I broke m...|Like Peloton, but...|     news|
|Tokyo 2020 Olympi...|India withdraw ho...|     news|
|AUSTIN, Texas, Oc...|FLOSPORTS BOLSTER...| business|
|Althea Mercer Man...|On her way to Blo...|     news|
|(CNN)Ukrainian ho...|Ukrainian Hockey ...|    sport|
|Paralympic champi...|Paralympic champi...|     news|
|Virat Kohli-led R...|RCB vs SRH, Royal...|     news|
|1of12Pittsburgh P...|Jarry,

In [18]:
print((df.count(), len(df.columns)))

(231131, 3)


In [19]:
pd_Df=df.toPandas()

In [20]:
type(pd_Df)

pandas.core.frame.DataFrame

In [21]:
pd_Df.head(10)

Unnamed: 0,summary,title,topic
0,Credit: Pixabay/CC0 Public Domain \nThe sight ...,Riding the waves keeps ducks in a row,news
1,"Over 30,000 Hockey Games Annually On HockeyTV ...",FLOSPORTS BOLSTERS ITS HOCKEY OFFERING WITH AC...,business
2,Chennai Super Kings (CSK) will spearhead again...,"CSK vs PBKS, Chennai Super Kings vs Punjab Kin...",news
3,The Indian Premier League (IPL) 2021 has reach...,"DC vs CSK, When And Where To Watch: Live Telec...",news
4,"Published: 22:05 EDT, 2 October 2021 | Updated...",Australia has one of the world's best skinny d...,news
5,England Hockey has informed the FIH of its dec...,England pulls out of men's Junior Hockey World...,sport
6,Topic | Australian swimming | The AgeWe're sor...,Topic Australian swimming,news
7,PTIOdisha CM Patnaik felicitated Indian hockey...,Hockey: India sweeps FIH annual awards; Belgiu...,economics
8,"In 2014 I broke my butt (meaning, I pulled my ...","Like Peloton, but…in the pool? Guided swim wor...",news
9,Tokyo 2020 Olympics - Hockey - Men - Semifinal...,India withdraw hockey teams from Commonwealth ...,news


In [22]:
# adding the title null values as summary just incase if we come accross null vales
pd_Df["title"].fillna('summary', inplace=True)

In [23]:
pd_Df.head(10)

Unnamed: 0,summary,title,topic
0,Credit: Pixabay/CC0 Public Domain \nThe sight ...,Riding the waves keeps ducks in a row,news
1,"Over 30,000 Hockey Games Annually On HockeyTV ...",FLOSPORTS BOLSTERS ITS HOCKEY OFFERING WITH AC...,business
2,Chennai Super Kings (CSK) will spearhead again...,"CSK vs PBKS, Chennai Super Kings vs Punjab Kin...",news
3,The Indian Premier League (IPL) 2021 has reach...,"DC vs CSK, When And Where To Watch: Live Telec...",news
4,"Published: 22:05 EDT, 2 October 2021 | Updated...",Australia has one of the world's best skinny d...,news
5,England Hockey has informed the FIH of its dec...,England pulls out of men's Junior Hockey World...,sport
6,Topic | Australian swimming | The AgeWe're sor...,Topic Australian swimming,news
7,PTIOdisha CM Patnaik felicitated Indian hockey...,Hockey: India sweeps FIH annual awards; Belgiu...,economics
8,"In 2014 I broke my butt (meaning, I pulled my ...","Like Peloton, but…in the pool? Guided swim wor...",news
9,Tokyo 2020 Olympics - Hockey - Men - Semifinal...,India withdraw hockey teams from Commonwealth ...,news


In [25]:
pd_Df["summary"].fillna('title', inplace=True)
pd_Df["topic"].fillna("General news", inplace=True)

In [26]:
pd_Df.head(10)

Unnamed: 0,summary,title,topic
0,Credit: Pixabay/CC0 Public Domain \nThe sight ...,Riding the waves keeps ducks in a row,news
1,"Over 30,000 Hockey Games Annually On HockeyTV ...",FLOSPORTS BOLSTERS ITS HOCKEY OFFERING WITH AC...,business
2,Chennai Super Kings (CSK) will spearhead again...,"CSK vs PBKS, Chennai Super Kings vs Punjab Kin...",news
3,The Indian Premier League (IPL) 2021 has reach...,"DC vs CSK, When And Where To Watch: Live Telec...",news
4,"Published: 22:05 EDT, 2 October 2021 | Updated...",Australia has one of the world's best skinny d...,news
5,England Hockey has informed the FIH of its dec...,England pulls out of men's Junior Hockey World...,sport
6,Topic | Australian swimming | The AgeWe're sor...,Topic Australian swimming,news
7,PTIOdisha CM Patnaik felicitated Indian hockey...,Hockey: India sweeps FIH annual awards; Belgiu...,economics
8,"In 2014 I broke my butt (meaning, I pulled my ...","Like Peloton, but…in the pool? Guided swim wor...",news
9,Tokyo 2020 Olympics - Hockey - Men - Semifinal...,India withdraw hockey teams from Commonwealth ...,news


In [27]:
#sending the input to model as title + summary
pd_Df['model_input_txt']=pd_Df['title'] + pd_Df['summary']

In [28]:
pd_Df.head()

Unnamed: 0,summary,title,topic,model_input_txt
0,Credit: Pixabay/CC0 Public Domain \nThe sight ...,Riding the waves keeps ducks in a row,news,Riding the waves keeps ducks in a rowCredit: P...
1,"Over 30,000 Hockey Games Annually On HockeyTV ...",FLOSPORTS BOLSTERS ITS HOCKEY OFFERING WITH AC...,business,FLOSPORTS BOLSTERS ITS HOCKEY OFFERING WITH AC...
2,Chennai Super Kings (CSK) will spearhead again...,"CSK vs PBKS, Chennai Super Kings vs Punjab Kin...",news,"CSK vs PBKS, Chennai Super Kings vs Punjab Kin..."
3,The Indian Premier League (IPL) 2021 has reach...,"DC vs CSK, When And Where To Watch: Live Telec...",news,"DC vs CSK, When And Where To Watch: Live Telec..."
4,"Published: 22:05 EDT, 2 October 2021 | Updated...",Australia has one of the world's best skinny d...,news,Australia has one of the world's best skinny d...
