In [1]:
import os
import csv
import requests
import datetime
import pandas as pd
from pyspark.sql import SparkSession, functions
from pyspark.sql.types import *
from utilities import (
    send_request_reddit_get_new_post,
    format_reddit_created_date,
    get_last_execution_date,
    save_new_execution_date
)

In [2]:
# Create (or reuse) the Spark session that every cell below relies on.
spark = (
    SparkSession.builder
    .appName("Spark Ingestion")
    .getOrCreate()
)

In [3]:
# Directory holding the news-category CSV files. It can be overridden with the
# TEXT_CLASSIFICATION_DATA_DIR environment variable so the notebook is not tied
# to one container layout; the default is the previously hard-coded location.
datasets_path = os.environ.get(
    'TEXT_CLASSIFICATION_DATA_DIR',
    '/home/jovyan/text_classification_data',
)

In [4]:
def ingest_textClassification_data(datasets_path):
    """Load the news-category train/test CSVs and append them to MySQL.

    Reads ``news_category_train.csv`` and ``news_category_test.csv`` (both
    with a header row) from ``datasets_path``, stacks them into a single
    DataFrame, renames the ``description`` column to ``descriptions`` and
    appends the result to the ``textClassification`` table of the ``Web``
    database over JDBC.

    The write uses append mode, so calling this function twice inserts the
    rows twice.

    Database credentials are read from the ``WEB_DB_USER`` and
    ``WEB_DB_PASSWORD`` environment variables. When they are unset the
    function falls back to the values that were previously hard-coded, so
    existing environments keep working; set the variables in any shared or
    production deployment.

    Args:
        datasets_path: Directory containing the two CSV files.

    Returns:
        None.
    """
    trainPath = os.path.join(datasets_path, 'news_category_train.csv')
    testPath = os.path.join(datasets_path, 'news_category_test.csv')
    trainDataset = spark.read.option("header", True).csv(trainPath)
    testDataset = spark.read.option("header", True).csv(testPath)

    # union() matches columns by position, not by name, so both files must
    # list their columns in the same order.
    df = trainDataset.union(testDataset)
    df = df.withColumnRenamed("description", "descriptions")

    # Credentials come from the environment; the defaults only preserve the
    # behaviour of the earlier hard-coded configuration.
    db_user = os.environ.get("WEB_DB_USER", "root")
    db_password = os.environ.get("WEB_DB_PASSWORD", "123")

    (
        df.write
        .mode("append")
        .format("jdbc")
        .option("driver", "com.mysql.cj.jdbc.Driver")
        .option("url", "jdbc:mysql://web-database/Web")
        .option("dbtable", "textClassification")
        .option("user", db_user)
        .option("password", db_password)
        .save()
    )

In [5]:
def get_and_ingest_redditData():
    """Fetch new link posts from four subreddits and append them to MySQL.

    For each subreddit the "new" listing is requested through
    ``send_request_reddit_get_new_post``. A post is kept when it was created
    after that subreddit's last recorded execution time and its ``selftext``
    is empty. Kept posts are appended to the ``redditData`` table of the
    ``Web`` database over JDBC, with ``created_utc`` converted to a timestamp.
    Afterwards the per-subreddit execution times are persisted with
    ``save_new_execution_date``.

    For every subreddit the new execution time is the ``created_utc`` of the
    first kept post in the listing, or the previous execution time when no
    post was kept.

    Database credentials are read from the ``WEB_DB_USER`` and
    ``WEB_DB_PASSWORD`` environment variables, falling back to the values that
    were previously hard-coded.

    Returns:
        None.
    """
    last_time = get_last_execution_date()

    # (listing endpoint, permalink prefix, category label) for each subreddit.
    # The order is significant: entry i is matched with last_time[i] and with
    # the i-th positional argument of save_new_execution_date().
    subreddits = [
        ('https://oauth.reddit.com/r/business/new/',
         'https://www.reddit.com/r/business/comments/',
         'Business'),
        ('https://oauth.reddit.com/r/technology/new/',
         'https://www.reddit.com/r/technology/comments/',
         'Sci/Tech'),
        ('https://oauth.reddit.com/r/sports/new/',
         'https://www.reddit.com/r/sports/comments/',
         'Sports'),
        ('https://oauth.reddit.com/r/worldnews/new/',
         'https://www.reddit.com/r/worldnews/comments/',
         'World'),
    ]
    column_names = ['post_id', 'descriptions', 'created_utc',
                    'source_url', 'post_url', 'category']
    created_format = '%Y-%m-%d %H:%M:%S'

    execution_time = []
    rows = []
    for num, (listing_url, permalink_prefix, category) in enumerate(subreddits):
        # NOTE(review): the status code returned by the helper is not checked;
        # a failed request will surface as a lookup error on the response below.
        reddit_response, _ = send_request_reddit_get_new_post(listing_url)
        subreddit_last_time = last_time[num]
        first_kept_created = None

        for post in reddit_response['data']['children']:
            post_data = post['data']
            # Format once and reuse for both the comparison and the stored value.
            created_utc = format_reddit_created_date(post_data['created_utc'])
            created_at = datetime.datetime.strptime(created_utc, created_format)

            # Only keep posts created after the last execution that have no
            # self text.
            if created_at > subreddit_last_time and post_data['selftext'] == '':
                post_id = post_data['id']
                rows.append((
                    post_id,
                    post_data['title'],
                    created_utc,
                    post_data['url'],
                    permalink_prefix + post_id,
                    category,
                ))
                # The first kept post defines the new execution time
                # (presumably the listing is newest-first -- TODO confirm).
                if first_kept_created is None:
                    first_kept_created = created_utc

        if first_kept_created is None:
            execution_time.append(subreddit_last_time)
        else:
            execution_time.append(first_kept_created)

    # Rows used to be written last-collected first; keep that ordering.
    rows.reverse()

    schema = StructType([StructField(name, StringType(), True)
                         for name in column_names])
    # Hand the tuples straight to Spark instead of growing a pandas DataFrame
    # row by row: this avoids quadratic concatenation and the pandas-to-Spark
    # conversion step (which appears to be what emits the `iteritems`
    # deprecation warning on older PySpark versions).
    df = spark.createDataFrame(data=rows, schema=schema)
    df = df.withColumn(
        'created_utc',
        functions.to_timestamp(df['created_utc'], 'yyyy-MM-dd HH:mm:ss'),
    )

    # Credentials come from the environment; the defaults only preserve the
    # behaviour of the earlier hard-coded configuration.
    db_user = os.environ.get("WEB_DB_USER", "root")
    db_password = os.environ.get("WEB_DB_PASSWORD", "123")

    (
        df.write
        .mode("append")
        .format("jdbc")
        .option("driver", "com.mysql.cj.jdbc.Driver")
        .option("url", "jdbc:mysql://web-database/Web")
        .option("dbtable", "redditData")
        .option("user", db_user)
        .option("password", db_password)
        .save()
    )

    # Positional order: business, technology, sports, worldnews.
    save_new_execution_date(*execution_time)

In [6]:
# Run the Reddit ingestion when this notebook/script is executed directly.
if __name__ == "__main__":
    get_and_ingest_redditData()

  for column, series in pdf.iteritems():


Extraction datetime added to History-database


In [7]:
# Read the redditData table back from MySQL to inspect what has been ingested.
# Credentials come from WEB_DB_USER / WEB_DB_PASSWORD; the defaults only
# preserve the behaviour of the earlier hard-coded configuration.
df = (
    spark.read
    .format("jdbc")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("url", "jdbc:mysql://web-database/Web")
    .option("dbtable", "redditData")
    .option("user", os.environ.get("WEB_DB_USER", "root"))
    .option("password", os.environ.get("WEB_DB_PASSWORD", "123"))
    .load()
)

In [8]:
# Sanity check: number of rows in the DataFrame loaded from the redditData table.
df.count()

905